You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

re_cache.c 65KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
722782279228022812282228322842285228622872288228922902291229222932294229522962297229822992300230123022303230423052306230723082309231023112312231323142315231623172318231923202321232223232324232523262327232823292330233123322333233423352336233723382339234023412342234323442345234623472348234923502351235223532354235523562357235823592360236123622363236423652366236723682369237023712372237323742375237623772378237923802381238223832384238523862387238823892390239123922393239423952396239723982399240024012402240324042405240624072408240924102411241224132414241524162417241824192420242124222423242424252426242724282429243024312432243324342435243624372438243924402441244224432444244524462447244824492450245124522453245424552456245724582459246024612462246324642465246624672468246924702471247224732474247524762477247824792480248124822483248424852486248724882489249024912492249324942495249624972498249925002501250225032504250525062507250825092510251125122513251425152516251725182519252025212522252325242525252625272528252925302531253225332534253525362537253825392540254125422543254425452546254725482549255025512552255325542555255625572558255925602561256225632564256525662567256825692570257125722573257425752576257725782579258025812582258325842585258625872588258925902591259225932594259525962597259825992600260126022603260426052606260726082609261026112612261326142615261626172618261926202621262226232624262526262627262826292630263126322633263426352636263726382639264026412642264326442645264626472648264926502651265226532654265526562657265826592660266126622663266426652666266726682669267026712672267326742675267626772678
  1. /*-
  2. * Copyright 2016 Vsevolod Stakhov
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #include "libmime/message.h"
  17. #include "re_cache.h"
  18. #include "cryptobox.h"
  19. #include "ref.h"
  20. #include "libserver/url.h"
  21. #include "libserver/task.h"
  22. #include "libserver/cfg_file.h"
  23. #include "libutil/util.h"
  24. #include "libutil/regexp.h"
  25. #include "lua/lua_common.h"
  26. #include "libstat/stat_api.h"
  27. #include "contrib/uthash/utlist.h"
  28. #include "khash.h"
  29. #ifdef WITH_HYPERSCAN
  30. #include "hs.h"
  31. #endif
  32. #include "unix-std.h"
  33. #include <signal.h>
  34. #include <stdalign.h>
  35. #include <math.h>
  36. #include "contrib/libev/ev.h"
  37. #ifndef WITH_PCRE2
  38. #include <pcre.h>
  39. #else
  40. #include <pcre2.h>
  41. #endif
  42. #include "contrib/fastutf8/fastutf8.h"
  43. #ifdef HAVE_SYS_WAIT_H
  44. #include <sys/wait.h>
  45. #endif
  /*
   * Logging helpers for this module. The msg_*_re_cache macros expand to an
   * expression that reads `cache->hash`, so a variable named `cache` must be
   * in scope wherever they are used. All of them pass "re_cache" as the module
   * name and G_STRFUNC as the calling function.
   */
  46. #define msg_err_re_cache(...) rspamd_default_log_function (G_LOG_LEVEL_CRITICAL, \
  47. "re_cache", cache->hash, \
  48. G_STRFUNC, \
  49. __VA_ARGS__)
  50. #define msg_warn_re_cache(...) rspamd_default_log_function (G_LOG_LEVEL_WARNING, \
  51. "re_cache", cache->hash, \
  52. G_STRFUNC, \
  53. __VA_ARGS__)
  54. #define msg_info_re_cache(...) rspamd_default_log_function (G_LOG_LEVEL_INFO, \
  55. "re_cache", cache->hash, \
  56. G_STRFUNC, \
  57. __VA_ARGS__)
  /*
   * Debug variant tagged with the task uid: reads `task->task_pool->tag.uid`,
   * so a variable named `task` must be in scope at the call site.
   */
  58. #define msg_debug_re_task(...) rspamd_conditional_debug_fast (NULL, NULL, \
  59. rspamd_re_cache_log_id, "re_cache", task->task_pool->tag.uid, \
  60. G_STRFUNC, \
  61. __VA_ARGS__)
  /* Debug variant tagged with the cache hash (needs `cache` in scope). */
  62. #define msg_debug_re_cache(...) rspamd_conditional_debug_fast (NULL, NULL, \
  63. rspamd_re_cache_log_id, "re_cache", cache->hash, \
  64. G_STRFUNC, \
  65. __VA_ARGS__)
  /*
   * NOTE(review): presumably this is what defines rspamd_re_cache_log_id used
   * by the debug macros above - confirm against the INIT_LOG_MODULE definition.
   */
  66. INIT_LOG_MODULE(re_cache)
  /*
   * Two 8-byte magic markers, compiled only with hyperscan support.
   * RSPAMD_HS_MAGIC_LEN is the byte length of rspamd_hs_magic.
   * NOTE(review): neither array is referenced in the part of the file reviewed
   * here; presumably they tag serialized hyperscan databases (plain vs.
   * vectorized) - confirm against the code that reads/writes those files.
   */
  67. #ifdef WITH_HYPERSCAN
  68. #define RSPAMD_HS_MAGIC_LEN (sizeof (rspamd_hs_magic))
  69. static const guchar rspamd_hs_magic[] = {'r', 's', 'h', 's', 'r', 'e', '1', '1'},
  70. rspamd_hs_magic_vector[] = {'r', 's', 'h', 's', 'r', 'v', '1', '1'};
  71. #endif
  /*
   * A group of regexps that share the same type and type data.
   * Instances are created lazily by rspamd_re_cache_add() and released by
   * rspamd_re_cache_destroy().
   */
  72. struct rspamd_re_class {
  73. guint64 id; /* result of rspamd_re_cache_class_id(); also the key in cache->re_classes */
  74. enum rspamd_re_type type;
  75. gboolean has_utf8; /* if there are any utf8 regexps */
  76. gpointer type_data; /* private copy of the caller's type data, NULL when type_len is 0 */
  77. gsize type_len; /* length of type_data in bytes */
  78. GHashTable *re; /* regexp id -> regexp; the table owns one reference per value */
  79. rspamd_cryptobox_hash_state_t *st; /* scratch hash state, allocated and released inside rspamd_re_cache_init() */
  80. gchar hash[rspamd_cryptobox_HASHBYTES + 1]; /* textual class digest written by rspamd_re_cache_init() */
  81. #ifdef WITH_HYPERSCAN
  82. hs_database_t *hs_db; /* released with hs_free_database() on destroy */
  83. hs_scratch_t *hs_scratch; /* released with hs_free_scratch() on destroy */
  84. gint *hs_ids; /* released with g_free() on destroy */
  85. guint nhs;
  86. #endif
  87. };
  /*
   * How a cached regexp is matched.
   * PCRE is the zero value, i.e. what a freshly g_malloc0()-ed element gets.
   * In the hyperscan callback, HYPERSCAN hits are counted directly, while any
   * other value causes the regexp to be re-run through the PCRE path.
   */
  88. enum rspamd_re_cache_elt_match_type {
  89. RSPAMD_RE_CACHE_PCRE = 0,
  90. RSPAMD_RE_CACHE_HYPERSCAN,
  91. RSPAMD_RE_CACHE_HYPERSCAN_PRE
  92. };
  /*
   * One slot of the cache-wide regexp array (cache->re), indexed by the
   * regexp's cache id.
   */
  93. struct rspamd_re_cache_elt {
  94. rspamd_regexp_t *re; /* the slot owns one reference, dropped by rspamd_re_cache_elt_dtor() */
  95. gint lua_cbref; /* Lua registry reference of the condition callback, -1 when there is none */
  96. enum rspamd_re_cache_elt_match_type match_type;
  97. };
  /* khash map type: string key -> int (treated as a Lua registry reference on destroy) */
  98. KHASH_INIT (lua_selectors_hash, gchar *, int, 1, kh_str_hash_func, kh_str_hash_equal);
  /*
   * The regexp cache itself. Reference counted through `ref`; the destructor is
   * rspamd_re_cache_destroy().
   */
  99. struct rspamd_re_cache {
  100. GHashTable *re_classes; /* class id (guint64) -> struct rspamd_re_class */
  101. GPtrArray *re; /* struct rspamd_re_cache_elt *, indexed by regexp cache id */
  102. khash_t (lua_selectors_hash) *selectors; /* keys are g_free()-d and values luaL_unref()-ed on destroy */
  103. ref_entry_t ref;
  104. guint nre; /* number of regexps added so far; also the next cache id handed out */
  105. guint max_re_data; /* when > 0, the PCRE path scans at most this many bytes per input */
  106. gchar hash[rspamd_cryptobox_HASHBYTES + 1]; /* textual digest over all regexps, written by rspamd_re_cache_init() */
  107. lua_State *L; /* set from cfg->lua_state in rspamd_re_cache_init() */
  108. #ifdef WITH_HYPERSCAN
  109. enum rspamd_hyperscan_status hyperscan_loaded; /* starts as RSPAMD_HYPERSCAN_UNKNOWN */
  110. gboolean disable_hyperscan; /* copied from cfg in rspamd_re_cache_init() */
  111. gboolean vectorized_hyperscan; /* copied from cfg in rspamd_re_cache_init() */
  112. hs_platform_info_t plt; /* filled by hs_populate_platform() in rspamd_re_cache_init() */
  113. #endif
  114. };
  /*
   * Result vector of a selector.
   * NOTE(review): not used in the part of the file reviewed here; presumably
   * scvec[i] / lenvec[i] are the i-th buffer and its length, cnt the number
   * of entries - confirm against the selector processing code.
   */
  115. struct rspamd_re_selector_result {
  116. guchar **scvec;
  117. guint *lenvec;
  118. guint cnt;
  119. };
  /* khash map type: int key -> struct rspamd_re_selector_result (stored by value) */
  120. KHASH_INIT (selectors_results_hash, int, struct rspamd_re_selector_result, 1,
  121. kh_int_hash_func, kh_int_hash_equal);
  /*
   * Per-scan state created by rspamd_re_cache_runtime_new(). `checked` and
   * `results` are not separate allocations: they point into the memory that
   * directly follows this structure.
   */
  122. struct rspamd_re_runtime {
  123. guchar *checked; /* bitmap, one bit per cached regexp, set once the regexp has been processed */
  124. guchar *results; /* one byte per cached regexp: number of hits recorded so far */
  125. khash_t (selectors_results_hash) *sel_cache;
  126. struct rspamd_re_cache *cache; /* retained reference to the owning cache */
  127. struct rspamd_re_cache_stat stat;
  128. gboolean has_hs; /* copied from cache->hyperscan_loaded when built with hyperscan */
  129. };
  130. static GQuark
  131. rspamd_re_cache_quark (void)
  132. {
  133. return g_quark_from_static_string ("re_cache");
  134. }
  135. static guint64
  136. rspamd_re_cache_class_id (enum rspamd_re_type type,
  137. gconstpointer type_data,
  138. gsize datalen)
  139. {
  140. rspamd_cryptobox_fast_hash_state_t st;
  141. rspamd_cryptobox_fast_hash_init (&st, 0xdeadbabe);
  142. rspamd_cryptobox_fast_hash_update (&st, &type, sizeof (type));
  143. if (datalen > 0) {
  144. rspamd_cryptobox_fast_hash_update (&st, type_data, datalen);
  145. }
  146. return rspamd_cryptobox_fast_hash_final (&st);
  147. }
  148. static void
  149. rspamd_re_cache_destroy (struct rspamd_re_cache *cache)
  150. {
  151. GHashTableIter it;
  152. gpointer k, v;
  153. struct rspamd_re_class *re_class;
  154. gchar *skey;
  155. gint sref;
  156. g_assert (cache != NULL);
  157. g_hash_table_iter_init (&it, cache->re_classes);
  158. while (g_hash_table_iter_next (&it, &k, &v)) {
  159. re_class = v;
  160. g_hash_table_iter_steal (&it);
  161. g_hash_table_unref (re_class->re);
  162. if (re_class->type_data) {
  163. g_free (re_class->type_data);
  164. }
  165. #ifdef WITH_HYPERSCAN
  166. if (re_class->hs_db) {
  167. hs_free_database (re_class->hs_db);
  168. }
  169. if (re_class->hs_scratch) {
  170. hs_free_scratch (re_class->hs_scratch);
  171. }
  172. if (re_class->hs_ids) {
  173. g_free (re_class->hs_ids);
  174. }
  175. #endif
  176. g_free (re_class);
  177. }
  178. if (cache->L) {
  179. kh_foreach (cache->selectors, skey, sref, {
  180. luaL_unref (cache->L, LUA_REGISTRYINDEX, sref);
  181. g_free (skey);
  182. });
  183. struct rspamd_re_cache_elt *elt;
  184. guint i;
  185. PTR_ARRAY_FOREACH (cache->re, i, elt) {
  186. if (elt->lua_cbref != -1) {
  187. luaL_unref (cache->L, LUA_REGISTRYINDEX, elt->lua_cbref);
  188. }
  189. }
  190. }
  191. kh_destroy (lua_selectors_hash, cache->selectors);
  192. g_hash_table_unref (cache->re_classes);
  193. g_ptr_array_free (cache->re, TRUE);
  194. g_free (cache);
  195. }
  196. static void
  197. rspamd_re_cache_elt_dtor (gpointer e)
  198. {
  199. struct rspamd_re_cache_elt *elt = e;
  200. rspamd_regexp_unref (elt->re);
  201. g_free (elt);
  202. }
  203. struct rspamd_re_cache *
  204. rspamd_re_cache_new (void)
  205. {
  206. struct rspamd_re_cache *cache;
  207. cache = g_malloc0 (sizeof (*cache));
  208. cache->re_classes = g_hash_table_new (g_int64_hash, g_int64_equal);
  209. cache->nre = 0;
  210. cache->re = g_ptr_array_new_full (256, rspamd_re_cache_elt_dtor);
  211. cache->selectors = kh_init (lua_selectors_hash);
  212. #ifdef WITH_HYPERSCAN
  213. cache->hyperscan_loaded = RSPAMD_HYPERSCAN_UNKNOWN;
  214. #endif
  215. REF_INIT_RETAIN (cache, rspamd_re_cache_destroy);
  216. return cache;
  217. }
  218. enum rspamd_hyperscan_status
  219. rspamd_re_cache_is_hs_loaded (struct rspamd_re_cache *cache)
  220. {
  221. g_assert (cache != NULL);
  222. #ifdef WITH_HYPERSCAN
  223. return cache->hyperscan_loaded;
  224. #else
  225. return RSPAMD_HYPERSCAN_UNSUPPORTED;
  226. #endif
  227. }
  228. rspamd_regexp_t *
  229. rspamd_re_cache_add (struct rspamd_re_cache *cache,
  230. rspamd_regexp_t *re,
  231. enum rspamd_re_type type,
  232. gconstpointer type_data, gsize datalen,
  233. gint lua_cbref)
  234. {
  235. guint64 class_id;
  236. struct rspamd_re_class *re_class;
  237. rspamd_regexp_t *nre;
  238. struct rspamd_re_cache_elt *elt;
  239. g_assert (cache != NULL);
  240. g_assert (re != NULL);
  241. class_id = rspamd_re_cache_class_id (type, type_data, datalen);
  242. re_class = g_hash_table_lookup (cache->re_classes, &class_id);
  243. if (re_class == NULL) {
  244. re_class = g_malloc0 (sizeof (*re_class));
  245. re_class->id = class_id;
  246. re_class->type_len = datalen;
  247. re_class->type = type;
  248. re_class->re = g_hash_table_new_full (rspamd_regexp_hash,
  249. rspamd_regexp_equal, NULL, (GDestroyNotify)rspamd_regexp_unref);
  250. if (datalen > 0) {
  251. re_class->type_data = g_malloc0 (datalen);
  252. memcpy (re_class->type_data, type_data, datalen);
  253. }
  254. g_hash_table_insert (cache->re_classes, &re_class->id, re_class);
  255. }
  256. if ((nre = g_hash_table_lookup (re_class->re, rspamd_regexp_get_id (re)))
  257. == NULL) {
  258. /*
  259. * We set re id based on the global position in the cache
  260. */
  261. elt = g_malloc0 (sizeof (*elt));
  262. /* One ref for re_class */
  263. nre = rspamd_regexp_ref (re);
  264. rspamd_regexp_set_cache_id (re, cache->nre++);
  265. /* One ref for cache */
  266. elt->re = rspamd_regexp_ref (re);
  267. g_ptr_array_add (cache->re, elt);
  268. rspamd_regexp_set_class (re, re_class);
  269. elt->lua_cbref = lua_cbref;
  270. g_hash_table_insert (re_class->re, rspamd_regexp_get_id (nre), nre);
  271. }
  272. if (rspamd_regexp_get_flags (re) & RSPAMD_REGEXP_FLAG_UTF) {
  273. re_class->has_utf8 = TRUE;
  274. }
  275. return nre;
  276. }
  277. void
  278. rspamd_re_cache_replace (struct rspamd_re_cache *cache,
  279. rspamd_regexp_t *what,
  280. rspamd_regexp_t *with)
  281. {
  282. guint64 re_id;
  283. struct rspamd_re_class *re_class;
  284. rspamd_regexp_t *src;
  285. struct rspamd_re_cache_elt *elt;
  286. g_assert (cache != NULL);
  287. g_assert (what != NULL);
  288. g_assert (with != NULL);
  289. re_class = rspamd_regexp_get_class (what);
  290. if (re_class != NULL) {
  291. re_id = rspamd_regexp_get_cache_id (what);
  292. g_assert (re_id != RSPAMD_INVALID_ID);
  293. src = g_hash_table_lookup (re_class->re, rspamd_regexp_get_id (what));
  294. elt = g_ptr_array_index (cache->re, re_id);
  295. g_assert (elt != NULL);
  296. g_assert (src != NULL);
  297. rspamd_regexp_set_cache_id (what, RSPAMD_INVALID_ID);
  298. rspamd_regexp_set_class (what, NULL);
  299. rspamd_regexp_set_cache_id (with, re_id);
  300. rspamd_regexp_set_class (with, re_class);
  301. /*
  302. * On calling of this function, we actually unref old re (what)
  303. */
  304. g_hash_table_insert (re_class->re,
  305. rspamd_regexp_get_id (what),
  306. rspamd_regexp_ref (with));
  307. rspamd_regexp_unref (elt->re);
  308. elt->re = rspamd_regexp_ref (with);
  309. /* XXX: do not touch match type here */
  310. }
  311. }
  312. static gint
  313. rspamd_re_cache_sort_func (gconstpointer a, gconstpointer b)
  314. {
  315. struct rspamd_re_cache_elt * const *re1 = a, * const *re2 = b;
  316. return rspamd_regexp_cmp (rspamd_regexp_get_id ((*re1)->re),
  317. rspamd_regexp_get_id ((*re2)->re));
  318. }
  319. void
  320. rspamd_re_cache_init (struct rspamd_re_cache *cache, struct rspamd_config *cfg)
  321. {
  322. guint i, fl;
  323. GHashTableIter it;
  324. gpointer k, v;
  325. struct rspamd_re_class *re_class;
  326. rspamd_cryptobox_hash_state_t st_global;
  327. rspamd_regexp_t *re;
  328. struct rspamd_re_cache_elt *elt;
  329. guchar hash_out[rspamd_cryptobox_HASHBYTES];
  330. g_assert (cache != NULL);
  331. rspamd_cryptobox_hash_init (&st_global, NULL, 0);
  332. /* Resort all regexps */
  333. g_ptr_array_sort (cache->re, rspamd_re_cache_sort_func);
  334. for (i = 0; i < cache->re->len; i ++) {
  335. elt = g_ptr_array_index (cache->re, i);
  336. re = elt->re;
  337. re_class = rspamd_regexp_get_class (re);
  338. g_assert (re_class != NULL);
  339. rspamd_regexp_set_cache_id (re, i);
  340. if (re_class->st == NULL) {
  341. (void) !posix_memalign ((void **)&re_class->st, RSPAMD_ALIGNOF(rspamd_cryptobox_hash_state_t),
  342. sizeof (*re_class->st));
  343. g_assert (re_class->st != NULL);
  344. rspamd_cryptobox_hash_init (re_class->st, NULL, 0);
  345. }
  346. /* Update hashes */
  347. /* Id of re class */
  348. rspamd_cryptobox_hash_update (re_class->st, (gpointer) &re_class->id,
  349. sizeof (re_class->id));
  350. rspamd_cryptobox_hash_update (&st_global, (gpointer) &re_class->id,
  351. sizeof (re_class->id));
  352. /* Id of re expression */
  353. rspamd_cryptobox_hash_update (re_class->st, rspamd_regexp_get_id (re),
  354. rspamd_cryptobox_HASHBYTES);
  355. rspamd_cryptobox_hash_update (&st_global, rspamd_regexp_get_id (re),
  356. rspamd_cryptobox_HASHBYTES);
  357. /* PCRE flags */
  358. fl = rspamd_regexp_get_pcre_flags (re);
  359. rspamd_cryptobox_hash_update (re_class->st, (const guchar *)&fl,
  360. sizeof (fl));
  361. rspamd_cryptobox_hash_update (&st_global, (const guchar *) &fl,
  362. sizeof (fl));
  363. /* Rspamd flags */
  364. fl = rspamd_regexp_get_flags (re);
  365. rspamd_cryptobox_hash_update (re_class->st, (const guchar *) &fl,
  366. sizeof (fl));
  367. rspamd_cryptobox_hash_update (&st_global, (const guchar *) &fl,
  368. sizeof (fl));
  369. /* Limit of hits */
  370. fl = rspamd_regexp_get_maxhits (re);
  371. rspamd_cryptobox_hash_update (re_class->st, (const guchar *) &fl,
  372. sizeof (fl));
  373. rspamd_cryptobox_hash_update (&st_global, (const guchar *) &fl,
  374. sizeof (fl));
  375. /* Numberic order */
  376. rspamd_cryptobox_hash_update (re_class->st, (const guchar *)&i,
  377. sizeof (i));
  378. rspamd_cryptobox_hash_update (&st_global, (const guchar *)&i,
  379. sizeof (i));
  380. }
  381. rspamd_cryptobox_hash_final (&st_global, hash_out);
  382. rspamd_snprintf (cache->hash, sizeof (cache->hash), "%*xs",
  383. (gint) rspamd_cryptobox_HASHBYTES, hash_out);
  384. /* Now finalize all classes */
  385. g_hash_table_iter_init (&it, cache->re_classes);
  386. while (g_hash_table_iter_next (&it, &k, &v)) {
  387. re_class = v;
  388. if (re_class->st) {
  389. /*
  390. * We finally update all classes with the number of expressions
  391. * in the cache to ensure that if even a single re has been changed
  392. * we won't be broken due to id mismatch
  393. */
  394. rspamd_cryptobox_hash_update (re_class->st,
  395. (gpointer)&cache->re->len,
  396. sizeof (cache->re->len));
  397. rspamd_cryptobox_hash_final (re_class->st, hash_out);
  398. rspamd_snprintf (re_class->hash, sizeof (re_class->hash), "%*xs",
  399. (gint) rspamd_cryptobox_HASHBYTES, hash_out);
  400. free (re_class->st); /* Due to posix_memalign */
  401. re_class->st = NULL;
  402. }
  403. }
  404. cache->L = cfg->lua_state;
  405. #ifdef WITH_HYPERSCAN
  406. const gchar *platform = "generic";
  407. rspamd_fstring_t *features = rspamd_fstring_new ();
  408. cache->disable_hyperscan = cfg->disable_hyperscan;
  409. cache->vectorized_hyperscan = cfg->vectorized_hyperscan;
  410. g_assert (hs_populate_platform (&cache->plt) == HS_SUCCESS);
  411. /* Now decode what we do have */
  412. switch (cache->plt.tune) {
  413. case HS_TUNE_FAMILY_HSW:
  414. platform = "haswell";
  415. break;
  416. case HS_TUNE_FAMILY_SNB:
  417. platform = "sandy";
  418. break;
  419. case HS_TUNE_FAMILY_BDW:
  420. platform = "broadwell";
  421. break;
  422. case HS_TUNE_FAMILY_IVB:
  423. platform = "ivy";
  424. break;
  425. default:
  426. break;
  427. }
  428. if (cache->plt.cpu_features & HS_CPU_FEATURES_AVX2) {
  429. features = rspamd_fstring_append (features, "AVX2", 4);
  430. }
  431. hs_set_allocator (g_malloc, g_free);
  432. msg_info_re_cache ("loaded hyperscan engine with cpu tune '%s' and features '%V'",
  433. platform, features);
  434. rspamd_fstring_free (features);
  435. #endif
  436. }
  437. struct rspamd_re_runtime *
  438. rspamd_re_cache_runtime_new (struct rspamd_re_cache *cache)
  439. {
  440. struct rspamd_re_runtime *rt;
  441. g_assert (cache != NULL);
  442. rt = g_malloc0 (sizeof (*rt) + NBYTES (cache->nre) + cache->nre);
  443. rt->cache = cache;
  444. REF_RETAIN (cache);
  445. rt->checked = ((guchar *)rt) + sizeof (*rt);
  446. rt->results = rt->checked + NBYTES (cache->nre);
  447. rt->stat.regexp_total = cache->nre;
  448. #ifdef WITH_HYPERSCAN
  449. rt->has_hs = cache->hyperscan_loaded;
  450. #endif
  451. return rt;
  452. }
  453. const struct rspamd_re_cache_stat *
  454. rspamd_re_cache_get_stat (struct rspamd_re_runtime *rt)
  455. {
  456. g_assert (rt != NULL);
  457. return &rt->stat;
  458. }
  459. static gboolean
  460. rspamd_re_cache_check_lua_condition (struct rspamd_task *task,
  461. rspamd_regexp_t *re,
  462. const guchar *in, gsize len,
  463. goffset start, goffset end,
  464. gint lua_cbref)
  465. {
  466. lua_State *L = (lua_State *)task->cfg->lua_state;
  467. GError *err = NULL;
  468. struct rspamd_lua_text __attribute__ ((unused)) *t;
  469. gint text_pos;
  470. if (G_LIKELY (lua_cbref == -1)) {
  471. return TRUE;
  472. }
  473. t = lua_new_text (L, in, len, FALSE);
  474. text_pos = lua_gettop (L);
  475. if (!rspamd_lua_universal_pcall (L, lua_cbref,
  476. G_STRLOC, 1, "utii", &err,
  477. "rspamd{task}", task,
  478. text_pos, start, end)) {
  479. msg_warn_task ("cannot call for re_cache_check_lua_condition for re %s: %e",
  480. rspamd_regexp_get_pattern (re), err);
  481. g_error_free (err);
  482. lua_settop (L, text_pos - 1);
  483. return TRUE;
  484. }
  485. gboolean res = lua_toboolean (L, -1);
  486. lua_settop (L, text_pos - 1);
  487. return res;
  488. }
  489. static guint
  490. rspamd_re_cache_process_pcre (struct rspamd_re_runtime *rt,
  491. rspamd_regexp_t *re, struct rspamd_task *task,
  492. const guchar *in, gsize len,
  493. gboolean is_raw,
  494. gint lua_cbref)
  495. {
  496. guint r = 0;
  497. const gchar *start = NULL, *end = NULL;
  498. guint max_hits = rspamd_regexp_get_maxhits (re);
  499. guint64 id = rspamd_regexp_get_cache_id (re);
  500. gdouble t1 = NAN, t2, pr;
  501. const gdouble slow_time = 1e8;
  502. if (in == NULL) {
  503. return rt->results[id];
  504. }
  505. if (len == 0) {
  506. return rt->results[id];
  507. }
  508. if (rt->cache->max_re_data > 0 && len > rt->cache->max_re_data) {
  509. len = rt->cache->max_re_data;
  510. }
  511. r = rt->results[id];
  512. if (max_hits == 0 || r < max_hits) {
  513. pr = rspamd_random_double_fast ();
  514. if (pr > 0.9) {
  515. t1 = rspamd_get_ticks (TRUE);
  516. }
  517. while (rspamd_regexp_search (re,
  518. in,
  519. len,
  520. &start,
  521. &end,
  522. is_raw,
  523. NULL)) {
  524. if (rspamd_re_cache_check_lua_condition (task, re, in, len,
  525. start - (const gchar *)in, end - (const gchar *)in, lua_cbref)) {
  526. r++;
  527. msg_debug_re_task ("found regexp /%s/, total hits: %d",
  528. rspamd_regexp_get_pattern (re), r);
  529. }
  530. if (max_hits > 0 && r >= max_hits) {
  531. break;
  532. }
  533. }
  534. rt->results[id] += r;
  535. rt->stat.regexp_checked++;
  536. rt->stat.bytes_scanned_pcre += len;
  537. rt->stat.bytes_scanned += len;
  538. if (r > 0) {
  539. rt->stat.regexp_matched += r;
  540. }
  541. if (!isnan (t1)) {
  542. t2 = rspamd_get_ticks (TRUE);
  543. if (t2 - t1 > slow_time) {
  544. rspamd_symcache_enable_profile (task);
  545. msg_info_task ("regexp '%16s' took %.0f ticks to execute",
  546. rspamd_regexp_get_pattern (re), t2 - t1);
  547. }
  548. }
  549. }
  550. return r;
  551. }
  #ifdef WITH_HYPERSCAN
  /*
   * Context handed to the hyperscan match callback.
   */
  struct rspamd_re_hyperscan_cbdata {
      struct rspamd_re_runtime *rt;   /* runtime receiving the results */
      const guchar **ins;             /* scanned buffers */
      const guint *lens;              /* lengths of the scanned buffers */
      guint count;                    /* number of entries in ins/lens */
      rspamd_regexp_t *re;
      struct rspamd_task *task;
  };

  /**
   * Hyperscan match callback.
   *
   * `id` is used as the index of the matched regexp in the cache-wide array.
   * For regexps matched by hyperscan alone, the hit is counted (subject to the
   * Lua condition, evaluated against the first buffer, and to the hit limit)
   * and the regexp is marked as checked. For every other match type the regexp
   * is, unless already checked, re-run through the PCRE path over the buffers
   * until at least `to` bytes have been covered.
   *
   * @param id index of the matched regexp in the cache-wide array
   * @param from start offset of the match as reported by hyperscan
   * @param to end offset of the match as reported by hyperscan
   * @param flags unused
   * @param ud pointer to struct rspamd_re_hyperscan_cbdata
   * @return always 0
   */
  static gint
  rspamd_re_cache_hyperscan_cb (unsigned int id,
          unsigned long long from,
          unsigned long long to,
          unsigned int flags,
          void *ud)
  {
      struct rspamd_re_hyperscan_cbdata *cbd = ud;
      struct rspamd_re_runtime *rt = cbd->rt;
      struct rspamd_task *task = cbd->task;
      struct rspamd_re_cache_elt *hit_elt = g_ptr_array_index (rt->cache->re, id);
      guint hit_limit = rspamd_regexp_get_maxhits (hit_elt->re);
      guint idx, consumed;

      if (hit_elt->match_type != RSPAMD_RE_CACHE_HYPERSCAN) {
          /* Hyperscan only pre-filters here: confirm through PCRE, once */
          if (isset (rt->checked, id)) {
              return 0;
          }

          consumed = 0;

          for (idx = 0; idx < cbd->count; idx ++) {
              rspamd_re_cache_process_pcre (rt,
                      hit_elt->re,
                      task,
                      cbd->ins[idx],
                      cbd->lens[idx],
                      FALSE,
                      hit_elt->lua_cbref);
              setbit (rt->checked, id);

              consumed += cbd->lens[idx];

              if (consumed >= to) {
                  break;
              }
          }

          return 0;
      }

      if (!rspamd_re_cache_check_lua_condition (task, hit_elt->re,
              cbd->ins[0], cbd->lens[0], from, to, hit_elt->lua_cbref)) {
          /* Hit rejected by the Lua condition */
          return 0;
      }

      setbit (rt->checked, id);

      if (hit_limit == 0 || rt->results[id] < hit_limit) {
          rt->results[id] += 1;
          rt->stat.regexp_matched++;
      }

      msg_debug_re_task ("found regexp /%s/ using hyperscan only, total hits: %d",
              rspamd_regexp_get_pattern (hit_elt->re), rt->results[id]);

      return 0;
  }
  #endif
  612. static guint
  613. rspamd_re_cache_process_regexp_data (struct rspamd_re_runtime *rt,
  614. rspamd_regexp_t *re, struct rspamd_task *task,
  615. const guchar **in, guint *lens,
  616. guint count,
  617. gboolean is_raw,
  618. gboolean *processed_hyperscan)
  619. {
  620. guint64 re_id;
  621. guint ret = 0;
  622. guint i;
  623. struct rspamd_re_cache_elt *cache_elt;
  624. re_id = rspamd_regexp_get_cache_id (re);
  625. if (count == 0 || in == NULL) {
  626. /* We assume this as absence of the specified data */
  627. setbit (rt->checked, re_id);
  628. rt->results[re_id] = ret;
  629. return ret;
  630. }
  631. cache_elt = (struct rspamd_re_cache_elt *)g_ptr_array_index (rt->cache->re, re_id);
  632. #ifndef WITH_HYPERSCAN
  633. for (i = 0; i < count; i++) {
  634. ret = rspamd_re_cache_process_pcre (rt,
  635. re,
  636. task,
  637. in[i],
  638. lens[i],
  639. is_raw,
  640. cache_elt->lua_cbref);
  641. rt->results[re_id] = ret;
  642. }
  643. setbit (rt->checked, re_id);
  644. #else
  645. struct rspamd_re_class *re_class;
  646. struct rspamd_re_hyperscan_cbdata cbdata;
  647. cache_elt = g_ptr_array_index (rt->cache->re, re_id);
  648. re_class = rspamd_regexp_get_class (re);
  649. if (rt->cache->disable_hyperscan || cache_elt->match_type == RSPAMD_RE_CACHE_PCRE ||
  650. !rt->has_hs || (is_raw && re_class->has_utf8)) {
  651. for (i = 0; i < count; i++) {
  652. ret = rspamd_re_cache_process_pcre (rt,
  653. re,
  654. task,
  655. in[i],
  656. lens[i],
  657. is_raw,
  658. cache_elt->lua_cbref);
  659. }
  660. setbit (rt->checked, re_id);
  661. }
  662. else {
  663. for (i = 0; i < count; i ++) {
  664. /* For Hyperscan we can probably safely disable all those limits */
  665. #if 0
  666. if (rt->cache->max_re_data > 0 && lens[i] > rt->cache->max_re_data) {
  667. lens[i] = rt->cache->max_re_data;
  668. }
  669. #endif
  670. rt->stat.bytes_scanned += lens[i];
  671. }
  672. g_assert (re_class->hs_scratch != NULL);
  673. g_assert (re_class->hs_db != NULL);
  674. /* Go through hyperscan API */
  675. if (!rt->cache->vectorized_hyperscan) {
  676. for (i = 0; i < count; i++) {
  677. cbdata.ins = &in[i];
  678. cbdata.re = re;
  679. cbdata.rt = rt;
  680. cbdata.lens = &lens[i];
  681. cbdata.count = 1;
  682. cbdata.task = task;
  683. if ((hs_scan (re_class->hs_db, in[i], lens[i], 0,
  684. re_class->hs_scratch,
  685. rspamd_re_cache_hyperscan_cb, &cbdata)) != HS_SUCCESS) {
  686. ret = 0;
  687. }
  688. else {
  689. ret = rt->results[re_id];
  690. *processed_hyperscan = TRUE;
  691. }
  692. }
  693. }
  694. else {
  695. cbdata.ins = in;
  696. cbdata.re = re;
  697. cbdata.rt = rt;
  698. cbdata.lens = lens;
  699. cbdata.count = 1;
  700. cbdata.task = task;
  701. if ((hs_scan_vector (re_class->hs_db, (const char **)in, lens, count, 0,
  702. re_class->hs_scratch,
  703. rspamd_re_cache_hyperscan_cb, &cbdata)) != HS_SUCCESS) {
  704. ret = 0;
  705. }
  706. else {
  707. ret = rt->results[re_id];
  708. *processed_hyperscan = TRUE;
  709. }
  710. }
  711. }
  712. #endif
  713. return ret;
  714. }
  715. static void
  716. rspamd_re_cache_finish_class (struct rspamd_task *task,
  717. struct rspamd_re_runtime *rt,
  718. struct rspamd_re_class *re_class,
  719. const gchar *class_name)
  720. {
  721. #ifdef WITH_HYPERSCAN
  722. guint i;
  723. guint64 re_id;
  724. guint found = 0;
  725. /* Set all bits that are not checked and included in hyperscan to 1 */
  726. for (i = 0; i < re_class->nhs; i++) {
  727. re_id = re_class->hs_ids[i];
  728. if (!isset (rt->checked, re_id)) {
  729. g_assert (rt->results[re_id] == 0);
  730. rt->results[re_id] = 0;
  731. setbit (rt->checked, re_id);
  732. }
  733. else {
  734. found ++;
  735. }
  736. }
  737. msg_debug_re_task ("finished hyperscan for class %s; %d "
  738. "matches found; %d hyperscan supported regexps; %d total regexps",
  739. class_name, found, re_class->nhs, (gint)g_hash_table_size (re_class->re));
  740. #endif
  741. }
  742. static gboolean
  743. rspamd_re_cache_process_selector (struct rspamd_task *task,
  744. struct rspamd_re_runtime *rt,
  745. const gchar *name,
  746. guchar ***svec,
  747. guint **lenvec,
  748. guint *n)
  749. {
  750. gint ref;
  751. khiter_t k;
  752. lua_State *L;
  753. gint err_idx, ret;
  754. struct rspamd_task **ptask;
  755. gboolean result = FALSE;
  756. struct rspamd_re_cache *cache = rt->cache;
  757. struct rspamd_re_selector_result *sr;
  758. L = cache->L;
  759. k = kh_get (lua_selectors_hash, cache->selectors, (gchar *)name);
  760. if (k == kh_end (cache->selectors)) {
  761. msg_err_task ("cannot find selector %s, not registered", name);
  762. return FALSE;
  763. }
  764. ref = kh_value (cache->selectors, k);
  765. /* First, search for the cached result */
  766. if (rt->sel_cache) {
  767. k = kh_get (selectors_results_hash, rt->sel_cache, ref);
  768. if (k != kh_end (rt->sel_cache)) {
  769. sr = &kh_value (rt->sel_cache, k);
  770. *svec = sr->scvec;
  771. *lenvec = sr->lenvec;
  772. *n = sr->cnt;
  773. return TRUE;
  774. }
  775. }
  776. else {
  777. rt->sel_cache = kh_init (selectors_results_hash);
  778. }
  779. lua_pushcfunction (L, &rspamd_lua_traceback);
  780. err_idx = lua_gettop (L);
  781. lua_rawgeti (L, LUA_REGISTRYINDEX, ref);
  782. ptask = lua_newuserdata (L, sizeof (*ptask));
  783. *ptask = task;
  784. rspamd_lua_setclass (L, "rspamd{task}", -1);
  785. if ((ret = lua_pcall (L, 1, 1, err_idx)) != 0) {
  786. msg_err_task ("call to selector %s "
  787. "failed (%d): %s", name, ret,
  788. lua_tostring (L, -1));
  789. }
  790. else {
  791. struct rspamd_lua_text *txt;
  792. gsize slen;
  793. const gchar *sel_data;
  794. if (lua_type (L, -1) != LUA_TTABLE) {
  795. txt = lua_check_text_or_string (L, -1);
  796. if (txt) {
  797. sel_data = txt->start;
  798. slen = txt->len;
  799. *n = 1;
  800. *svec = g_malloc (sizeof (guchar *));
  801. *lenvec = g_malloc (sizeof (guint));
  802. (*svec)[0] = g_malloc (slen);
  803. memcpy ((*svec)[0], sel_data, slen);
  804. (*lenvec)[0] = slen;
  805. result = TRUE;
  806. }
  807. }
  808. else {
  809. *n = rspamd_lua_table_size (L, -1);
  810. if (*n > 0) {
  811. *svec = g_malloc (sizeof (guchar *) * (*n));
  812. *lenvec = g_malloc (sizeof (guint) * (*n));
  813. for (guint i = 0; i < *n; i ++) {
  814. lua_rawgeti (L, -1, i + 1);
  815. txt = lua_check_text_or_string (L, -1);
  816. if (txt) {
  817. sel_data = txt->start;
  818. slen = txt->len;
  819. }
  820. else {
  821. sel_data = "";
  822. slen = 0;
  823. }
  824. (*svec)[i] = g_malloc (slen);
  825. memcpy ((*svec)[i], sel_data, slen);
  826. (*lenvec)[i] = slen;
  827. lua_pop (L, 1);
  828. }
  829. result = TRUE;
  830. }
  831. }
  832. }
  833. lua_settop (L, err_idx - 1);
  834. if (result) {
  835. k = kh_put (selectors_results_hash, rt->sel_cache, ref, &ret);
  836. sr = &kh_value (rt->sel_cache, k);
  837. sr->cnt = *n;
  838. sr->scvec = *svec;
  839. sr->lenvec = *lenvec;
  840. }
  841. return result;
  842. }
  843. static inline guint
  844. rspamd_process_words_vector (GArray *words,
  845. const guchar **scvec,
  846. guint *lenvec,
  847. struct rspamd_re_class *re_class,
  848. guint cnt,
  849. gboolean *raw)
  850. {
  851. guint j;
  852. rspamd_stat_token_t *tok;
  853. if (words) {
  854. for (j = 0; j < words->len; j ++) {
  855. tok = &g_array_index (words, rspamd_stat_token_t, j);
  856. if (tok->flags & RSPAMD_STAT_TOKEN_FLAG_TEXT) {
  857. if (!(tok->flags & RSPAMD_STAT_TOKEN_FLAG_UTF)) {
  858. if (!re_class->has_utf8) {
  859. *raw = TRUE;
  860. }
  861. else {
  862. continue; /* Skip */
  863. }
  864. }
  865. }
  866. else {
  867. continue; /* Skip non text */
  868. }
  869. if (re_class->type == RSPAMD_RE_RAWWORDS) {
  870. if (tok->original.len > 0) {
  871. scvec[cnt] = tok->original.begin;
  872. lenvec[cnt++] = tok->original.len;
  873. }
  874. }
  875. else if (re_class->type == RSPAMD_RE_WORDS) {
  876. if (tok->normalized.len > 0) {
  877. scvec[cnt] = tok->normalized.begin;
  878. lenvec[cnt++] = tok->normalized.len;
  879. }
  880. }
  881. else {
  882. /* Stemmed words */
  883. if (tok->stemmed.len > 0) {
  884. scvec[cnt] = tok->stemmed.begin;
  885. lenvec[cnt++] = tok->stemmed.len;
  886. }
  887. }
  888. }
  889. }
  890. return cnt;
  891. }
  892. static guint
  893. rspamd_re_cache_process_headers_list (struct rspamd_task *task,
  894. struct rspamd_re_runtime *rt,
  895. rspamd_regexp_t *re,
  896. struct rspamd_re_class *re_class,
  897. struct rspamd_mime_header *rh,
  898. gboolean is_strong,
  899. gboolean *processed_hyperscan)
  900. {
  901. const guchar **scvec, *in;
  902. gboolean raw = FALSE;
  903. guint *lenvec;
  904. struct rspamd_mime_header *cur;
  905. guint cnt = 0, i = 0, ret = 0;
  906. DL_COUNT (rh, cur, cnt);
  907. scvec = g_malloc (sizeof (*scvec) * cnt);
  908. lenvec = g_malloc (sizeof (*lenvec) * cnt);
  909. DL_FOREACH (rh, cur) {
  910. if (is_strong && strcmp (cur->name, re_class->type_data) != 0) {
  911. /* Skip a different case */
  912. continue;
  913. }
  914. if (re_class->type == RSPAMD_RE_RAWHEADER) {
  915. in = (const guchar *)cur->value;
  916. lenvec[i] = strlen (cur->value);
  917. if (rspamd_fast_utf8_validate (in, lenvec[i]) != 0) {
  918. raw = TRUE;
  919. }
  920. }
  921. else {
  922. in = (const guchar *)cur->decoded;
  923. /* Validate input^W^WNo need to validate as it is already valid */
  924. if (!in) {
  925. lenvec[i] = 0;
  926. scvec[i] = (guchar *)"";
  927. continue;
  928. }
  929. lenvec[i] = strlen (in);
  930. }
  931. scvec[i] = in;
  932. i ++;
  933. }
  934. if (i > 0) {
  935. ret = rspamd_re_cache_process_regexp_data (rt, re,
  936. task, scvec, lenvec, i, raw, processed_hyperscan);
  937. msg_debug_re_task ("checking header %s regexp: %s=%*s -> %d",
  938. re_class->type_data,
  939. rspamd_regexp_get_pattern (re),
  940. (int) lenvec[0], scvec[0], ret);
  941. }
  942. g_free (scvec);
  943. g_free (lenvec);
  944. return ret;
  945. }
  946. /*
  947. * Calculates the specified regexp for the specified class if it's not calculated
  948. */
  949. static guint
  950. rspamd_re_cache_exec_re (struct rspamd_task *task,
  951. struct rspamd_re_runtime *rt,
  952. rspamd_regexp_t *re,
  953. struct rspamd_re_class *re_class,
  954. gboolean is_strong)
  955. {
  956. guint ret = 0, i, re_id;
  957. struct rspamd_mime_header *rh;
  958. const gchar *in;
  959. const guchar **scvec;
  960. guint *lenvec;
  961. gboolean raw = FALSE, processed_hyperscan = FALSE;
  962. struct rspamd_mime_text_part *text_part;
  963. struct rspamd_mime_part *mime_part;
  964. struct rspamd_url *url;
  965. guint len, cnt;
  966. const gchar *class_name;
  967. class_name = rspamd_re_cache_type_to_string (re_class->type);
  968. msg_debug_re_task ("start check re type: %s: /%s/",
  969. class_name,
  970. rspamd_regexp_get_pattern (re));
  971. re_id = rspamd_regexp_get_cache_id (re);
  972. switch (re_class->type) {
  973. case RSPAMD_RE_HEADER:
  974. case RSPAMD_RE_RAWHEADER:
  975. /* Get list of specified headers */
  976. rh = rspamd_message_get_header_array(task,
  977. re_class->type_data, FALSE);
  978. if (rh) {
  979. ret = rspamd_re_cache_process_headers_list (task, rt, re,
  980. re_class, rh, is_strong, &processed_hyperscan);
  981. msg_debug_re_task ("checked header(%s) regexp: %s -> %d",
  982. (const char *)re_class->type_data,
  983. rspamd_regexp_get_pattern (re),
  984. ret);
  985. }
  986. break;
  987. case RSPAMD_RE_ALLHEADER:
  988. raw = TRUE;
  989. in = MESSAGE_FIELD (task, raw_headers_content).begin;
  990. len = MESSAGE_FIELD (task, raw_headers_content).len;
  991. ret = rspamd_re_cache_process_regexp_data (rt, re,
  992. task, (const guchar **)&in, &len, 1, raw, &processed_hyperscan);
  993. msg_debug_re_task ("checked allheader regexp: %s -> %d",
  994. rspamd_regexp_get_pattern (re), ret);
  995. break;
  996. case RSPAMD_RE_MIMEHEADER:
  997. PTR_ARRAY_FOREACH (MESSAGE_FIELD (task, parts), i, mime_part) {
  998. rh = rspamd_message_get_header_from_hash(mime_part->raw_headers,
  999. re_class->type_data, FALSE);
  1000. if (rh) {
  1001. ret += rspamd_re_cache_process_headers_list (task, rt, re,
  1002. re_class, rh, is_strong, &processed_hyperscan);
  1003. }
  1004. msg_debug_re_task ("checked mime header(%s) regexp: %s -> %d",
  1005. (const char *)re_class->type_data,
  1006. rspamd_regexp_get_pattern (re),
  1007. ret);
  1008. }
  1009. break;
  1010. case RSPAMD_RE_MIME:
  1011. case RSPAMD_RE_RAWMIME:
  1012. /* Iterate through text parts */
  1013. if (MESSAGE_FIELD (task, text_parts)->len > 0) {
  1014. cnt = MESSAGE_FIELD (task, text_parts)->len;
  1015. scvec = g_malloc (sizeof (*scvec) * cnt);
  1016. lenvec = g_malloc (sizeof (*lenvec) * cnt);
  1017. PTR_ARRAY_FOREACH (MESSAGE_FIELD (task, text_parts), i, text_part) {
  1018. /* Select data for regexp */
  1019. if (re_class->type == RSPAMD_RE_RAWMIME) {
  1020. if (text_part->raw.len == 0) {
  1021. len = 0;
  1022. in = "";
  1023. }
  1024. else {
  1025. in = text_part->raw.begin;
  1026. len = text_part->raw.len;
  1027. }
  1028. raw = TRUE;
  1029. }
  1030. else {
  1031. /* Skip empty parts */
  1032. if (IS_TEXT_PART_EMPTY (text_part)) {
  1033. len = 0;
  1034. in = "";
  1035. }
  1036. else {
  1037. /* Check raw flags */
  1038. if (!IS_TEXT_PART_UTF (text_part)) {
  1039. raw = TRUE;
  1040. }
  1041. in = text_part->utf_content.begin;
  1042. len = text_part->utf_content.len;
  1043. }
  1044. }
  1045. scvec[i] = (guchar *) in;
  1046. lenvec[i] = len;
  1047. }
  1048. ret = rspamd_re_cache_process_regexp_data (rt, re,
  1049. task, scvec, lenvec, cnt, raw, &processed_hyperscan);
  1050. msg_debug_re_task ("checked mime regexp: %s -> %d",
  1051. rspamd_regexp_get_pattern (re), ret);
  1052. g_free (scvec);
  1053. g_free (lenvec);
  1054. }
  1055. break;
  1056. case RSPAMD_RE_URL:
  1057. cnt = kh_size (MESSAGE_FIELD (task, urls));
  1058. if (cnt > 0) {
  1059. scvec = g_malloc (sizeof (*scvec) * cnt);
  1060. lenvec = g_malloc (sizeof (*lenvec) * cnt);
  1061. i = 0;
  1062. raw = FALSE;
  1063. kh_foreach_key (MESSAGE_FIELD (task, urls), url, {
  1064. if ((url->protocol & PROTOCOL_MAILTO)) {
  1065. continue;
  1066. }
  1067. in = url->string;
  1068. len = url->urllen;
  1069. if (len > 0 && !(url->flags & RSPAMD_URL_FLAG_IMAGE)) {
  1070. scvec[i] = (guchar *) in;
  1071. lenvec[i++] = len;
  1072. }
  1073. });
  1074. #if 0
  1075. g_hash_table_iter_init (&it, MESSAGE_FIELD (task, emails));
  1076. while (g_hash_table_iter_next (&it, &k, &v)) {
  1077. url = v;
  1078. in = url->string;
  1079. len = url->urllen;
  1080. if (len > 0 && !(url->flags & RSPAMD_URL_FLAG_IMAGE)) {
  1081. scvec[i] = (guchar *) in;
  1082. lenvec[i++] = len;
  1083. }
  1084. }
  1085. #endif
  1086. ret = rspamd_re_cache_process_regexp_data (rt, re,
  1087. task, scvec, lenvec, i, raw, &processed_hyperscan);
  1088. msg_debug_re_task ("checked url regexp: %s -> %d",
  1089. rspamd_regexp_get_pattern (re), ret);
  1090. g_free (scvec);
  1091. g_free (lenvec);
  1092. }
  1093. break;
  1094. case RSPAMD_RE_EMAIL:
  1095. cnt = kh_size (MESSAGE_FIELD (task, urls));
  1096. if (cnt > 0) {
  1097. scvec = g_malloc (sizeof (*scvec) * cnt);
  1098. lenvec = g_malloc (sizeof (*lenvec) * cnt);
  1099. i = 0;
  1100. raw = FALSE;
  1101. kh_foreach_key (MESSAGE_FIELD (task, urls), url, {
  1102. if (!(url->protocol & PROTOCOL_MAILTO)) {
  1103. continue;
  1104. }
  1105. if (url->userlen == 0 || url->hostlen == 0) {
  1106. continue;
  1107. }
  1108. in = rspamd_url_user_unsafe (url);
  1109. len = url->userlen + 1 + url->hostlen;
  1110. scvec[i] = (guchar *) in;
  1111. lenvec[i++] = len;
  1112. });
  1113. ret = rspamd_re_cache_process_regexp_data (rt, re,
  1114. task, scvec, lenvec, i, raw, &processed_hyperscan);
  1115. msg_debug_re_task ("checked email regexp: %s -> %d",
  1116. rspamd_regexp_get_pattern (re), ret);
  1117. g_free (scvec);
  1118. g_free (lenvec);
  1119. }
  1120. break;
  1121. case RSPAMD_RE_BODY:
  1122. raw = TRUE;
  1123. in = task->msg.begin;
  1124. len = task->msg.len;
  1125. ret = rspamd_re_cache_process_regexp_data (rt, re, task,
  1126. (const guchar **)&in, &len, 1, raw, &processed_hyperscan);
  1127. msg_debug_re_task ("checked rawbody regexp: %s -> %d",
  1128. rspamd_regexp_get_pattern (re), ret);
  1129. break;
  1130. case RSPAMD_RE_SABODY:
  1131. /* According to SA docs:
  1132. * The 'body' in this case is the textual parts of the message body;
  1133. * any non-text MIME parts are stripped, and the message decoded from
  1134. * Quoted-Printable or Base-64-encoded format if necessary. The message
  1135. * Subject header is considered part of the body and becomes the first
  1136. * paragraph when running the rules. All HTML tags and line breaks will
  1137. * be removed before matching.
  1138. */
  1139. cnt = MESSAGE_FIELD (task, text_parts)->len + 1;
  1140. scvec = g_malloc (sizeof (*scvec) * cnt);
  1141. lenvec = g_malloc (sizeof (*lenvec) * cnt);
  1142. /*
  1143. * Body rules also include the Subject as the first line
  1144. * of the body content.
  1145. */
  1146. rh = rspamd_message_get_header_array(task, "Subject", FALSE);
  1147. if (rh) {
  1148. scvec[0] = (guchar *)rh->decoded;
  1149. lenvec[0] = strlen (rh->decoded);
  1150. }
  1151. else {
  1152. scvec[0] = (guchar *)"";
  1153. lenvec[0] = 0;
  1154. }
  1155. PTR_ARRAY_FOREACH (MESSAGE_FIELD (task, text_parts), i, text_part) {
  1156. if (text_part->utf_stripped_content) {
  1157. scvec[i + 1] = (guchar *)text_part->utf_stripped_content->data;
  1158. lenvec[i + 1] = text_part->utf_stripped_content->len;
  1159. if (!IS_TEXT_PART_UTF (text_part)) {
  1160. raw = TRUE;
  1161. }
  1162. }
  1163. else {
  1164. scvec[i + 1] = (guchar *)"";
  1165. lenvec[i + 1] = 0;
  1166. }
  1167. }
  1168. ret = rspamd_re_cache_process_regexp_data (rt, re,
  1169. task, scvec, lenvec, cnt, raw, &processed_hyperscan);
  1170. msg_debug_re_task ("checked sa body regexp: %s -> %d",
  1171. rspamd_regexp_get_pattern (re), ret);
  1172. g_free (scvec);
  1173. g_free (lenvec);
  1174. break;
  1175. case RSPAMD_RE_SARAWBODY:
  1176. /* According to SA docs:
  1177. * The 'raw body' of a message is the raw data inside all textual
  1178. * parts. The text will be decoded from base64 or quoted-printable
  1179. * encoding, but HTML tags and line breaks will still be present.
  1180. * Multiline expressions will need to be used to match strings that are
  1181. * broken by line breaks.
  1182. */
  1183. if (MESSAGE_FIELD (task, text_parts)->len > 0) {
  1184. cnt = MESSAGE_FIELD (task, text_parts)->len;
  1185. scvec = g_malloc (sizeof (*scvec) * cnt);
  1186. lenvec = g_malloc (sizeof (*lenvec) * cnt);
  1187. for (i = 0; i < cnt; i++) {
  1188. text_part = g_ptr_array_index (MESSAGE_FIELD (task, text_parts), i);
  1189. if (text_part->parsed.len > 0) {
  1190. scvec[i] = (guchar *)text_part->parsed.begin;
  1191. lenvec[i] = text_part->parsed.len;
  1192. if (!IS_TEXT_PART_UTF (text_part)) {
  1193. raw = TRUE;
  1194. }
  1195. }
  1196. else {
  1197. scvec[i] = (guchar *)"";
  1198. lenvec[i] = 0;
  1199. }
  1200. }
  1201. ret = rspamd_re_cache_process_regexp_data (rt, re,
  1202. task, scvec, lenvec, cnt, raw, &processed_hyperscan);
  1203. msg_debug_re_task ("checked sa rawbody regexp: %s -> %d",
  1204. rspamd_regexp_get_pattern (re), ret);
  1205. g_free (scvec);
  1206. g_free (lenvec);
  1207. }
  1208. break;
  1209. case RSPAMD_RE_WORDS:
  1210. case RSPAMD_RE_STEMWORDS:
  1211. case RSPAMD_RE_RAWWORDS:
  1212. if (MESSAGE_FIELD (task, text_parts)->len > 0) {
  1213. cnt = 0;
  1214. raw = FALSE;
  1215. PTR_ARRAY_FOREACH (MESSAGE_FIELD (task, text_parts), i, text_part) {
  1216. if (text_part->utf_words) {
  1217. cnt += text_part->utf_words->len;
  1218. }
  1219. }
  1220. if (task->meta_words && task->meta_words->len > 0) {
  1221. cnt += task->meta_words->len;
  1222. }
  1223. if (cnt > 0) {
  1224. scvec = g_malloc (sizeof (*scvec) * cnt);
  1225. lenvec = g_malloc (sizeof (*lenvec) * cnt);
  1226. cnt = 0;
  1227. PTR_ARRAY_FOREACH (MESSAGE_FIELD (task, text_parts), i, text_part) {
  1228. if (text_part->utf_words) {
  1229. cnt = rspamd_process_words_vector (text_part->utf_words,
  1230. scvec, lenvec, re_class, cnt, &raw);
  1231. }
  1232. }
  1233. if (task->meta_words) {
  1234. cnt = rspamd_process_words_vector (task->meta_words,
  1235. scvec, lenvec, re_class, cnt, &raw);
  1236. }
  1237. ret = rspamd_re_cache_process_regexp_data (rt, re,
  1238. task, scvec, lenvec, cnt, raw, &processed_hyperscan);
  1239. msg_debug_re_task ("checked sa words regexp: %s -> %d",
  1240. rspamd_regexp_get_pattern (re), ret);
  1241. g_free (scvec);
  1242. g_free (lenvec);
  1243. }
  1244. }
  1245. break;
  1246. case RSPAMD_RE_SELECTOR:
  1247. if (rspamd_re_cache_process_selector (task, rt,
  1248. re_class->type_data,
  1249. (guchar ***)&scvec,
  1250. &lenvec, &cnt)) {
  1251. ret = rspamd_re_cache_process_regexp_data (rt, re,
  1252. task, scvec, lenvec, cnt, raw, &processed_hyperscan);
  1253. msg_debug_re_task ("checked selector(%s) regexp: %s -> %d",
  1254. re_class->type_data,
  1255. rspamd_regexp_get_pattern (re), ret);
  1256. /* Do not free vectors as they are managed by rt->sel_cache */
  1257. }
  1258. break;
  1259. case RSPAMD_RE_MAX:
  1260. msg_err_task ("regexp of class invalid has been called: %s",
  1261. rspamd_regexp_get_pattern (re));
  1262. break;
  1263. }
  1264. #if WITH_HYPERSCAN
  1265. if (processed_hyperscan) {
  1266. rspamd_re_cache_finish_class (task, rt, re_class, class_name);
  1267. }
  1268. #endif
  1269. setbit (rt->checked, re_id);
  1270. return rt->results[re_id];
  1271. }
  1272. gint
  1273. rspamd_re_cache_process (struct rspamd_task *task,
  1274. rspamd_regexp_t *re,
  1275. enum rspamd_re_type type,
  1276. gconstpointer type_data,
  1277. gsize datalen,
  1278. gboolean is_strong)
  1279. {
  1280. guint64 re_id;
  1281. struct rspamd_re_class *re_class;
  1282. struct rspamd_re_cache *cache;
  1283. struct rspamd_re_runtime *rt;
  1284. g_assert (task != NULL);
  1285. rt = task->re_rt;
  1286. g_assert (rt != NULL);
  1287. g_assert (re != NULL);
  1288. cache = rt->cache;
  1289. re_id = rspamd_regexp_get_cache_id (re);
  1290. if (re_id == RSPAMD_INVALID_ID || re_id > cache->nre) {
  1291. msg_err_task ("re '%s' has no valid id for the cache",
  1292. rspamd_regexp_get_pattern (re));
  1293. return 0;
  1294. }
  1295. if (isset (rt->checked, re_id)) {
  1296. /* Fast path */
  1297. rt->stat.regexp_fast_cached ++;
  1298. return rt->results[re_id];
  1299. }
  1300. else {
  1301. /* Slow path */
  1302. re_class = rspamd_regexp_get_class (re);
  1303. if (re_class == NULL) {
  1304. msg_err_task ("cannot find re class for regexp '%s'",
  1305. rspamd_regexp_get_pattern (re));
  1306. return 0;
  1307. }
  1308. return rspamd_re_cache_exec_re (task, rt, re, re_class,
  1309. is_strong);
  1310. }
  1311. return 0;
  1312. }
  1313. int
  1314. rspamd_re_cache_process_ffi (void *ptask,
  1315. void *pre,
  1316. int type,
  1317. void *type_data,
  1318. int is_strong)
  1319. {
  1320. struct rspamd_lua_regexp **lua_re = pre;
  1321. struct rspamd_task **real_task = ptask;
  1322. gsize typelen = 0;
  1323. if (type_data) {
  1324. typelen = strlen (type_data);
  1325. }
  1326. return rspamd_re_cache_process (*real_task, (*lua_re)->re,
  1327. type, type_data, typelen, is_strong);
  1328. }
  1329. void
  1330. rspamd_re_cache_runtime_destroy (struct rspamd_re_runtime *rt)
  1331. {
  1332. g_assert (rt != NULL);
  1333. if (rt->sel_cache) {
  1334. struct rspamd_re_selector_result sr;
  1335. kh_foreach_value (rt->sel_cache, sr, {
  1336. for (guint i = 0; i < sr.cnt; i ++) {
  1337. g_free ((gpointer)sr.scvec[i]);
  1338. }
  1339. g_free (sr.scvec);
  1340. g_free (sr.lenvec);
  1341. });
  1342. kh_destroy (selectors_results_hash, rt->sel_cache);
  1343. }
  1344. REF_RELEASE (rt->cache);
  1345. g_free (rt);
  1346. }
  1347. void
  1348. rspamd_re_cache_unref (struct rspamd_re_cache *cache)
  1349. {
  1350. if (cache) {
  1351. REF_RELEASE (cache);
  1352. }
  1353. }
  1354. struct rspamd_re_cache *
  1355. rspamd_re_cache_ref (struct rspamd_re_cache *cache)
  1356. {
  1357. if (cache) {
  1358. REF_RETAIN (cache);
  1359. }
  1360. return cache;
  1361. }
  1362. guint
  1363. rspamd_re_cache_set_limit (struct rspamd_re_cache *cache, guint limit)
  1364. {
  1365. guint old;
  1366. g_assert (cache != NULL);
  1367. old = cache->max_re_data;
  1368. cache->max_re_data = limit;
  1369. return old;
  1370. }
  1371. const gchar *
  1372. rspamd_re_cache_type_to_string (enum rspamd_re_type type)
  1373. {
  1374. const gchar *ret = "unknown";
  1375. switch (type) {
  1376. case RSPAMD_RE_HEADER:
  1377. ret = "header";
  1378. break;
  1379. case RSPAMD_RE_RAWHEADER:
  1380. ret = "raw header";
  1381. break;
  1382. case RSPAMD_RE_MIMEHEADER:
  1383. ret = "mime header";
  1384. break;
  1385. case RSPAMD_RE_ALLHEADER:
  1386. ret = "all headers";
  1387. break;
  1388. case RSPAMD_RE_MIME:
  1389. ret = "part";
  1390. break;
  1391. case RSPAMD_RE_RAWMIME:
  1392. ret = "raw part";
  1393. break;
  1394. case RSPAMD_RE_BODY:
  1395. ret = "rawbody";
  1396. break;
  1397. case RSPAMD_RE_URL:
  1398. ret = "url";
  1399. break;
  1400. case RSPAMD_RE_EMAIL:
  1401. ret = "email";
  1402. break;
  1403. case RSPAMD_RE_SABODY:
  1404. ret = "sa body";
  1405. break;
  1406. case RSPAMD_RE_SARAWBODY:
  1407. ret = "sa raw body";
  1408. break;
  1409. case RSPAMD_RE_SELECTOR:
  1410. ret = "selector";
  1411. break;
  1412. case RSPAMD_RE_WORDS:
  1413. ret = "words";
  1414. break;
  1415. case RSPAMD_RE_RAWWORDS:
  1416. ret = "raw_words";
  1417. break;
  1418. case RSPAMD_RE_STEMWORDS:
  1419. ret = "stem_words";
  1420. break;
  1421. case RSPAMD_RE_MAX:
  1422. default:
  1423. ret = "invalid class";
  1424. break;
  1425. }
  1426. return ret;
  1427. }
  1428. enum rspamd_re_type
  1429. rspamd_re_cache_type_from_string (const char *str)
  1430. {
  1431. enum rspamd_re_type ret;
  1432. guint64 h;
  1433. /*
  1434. * To optimize this function, we apply hash to input string and
  1435. * pre-select it from the values
  1436. */
  1437. if (str != NULL) {
  1438. h = rspamd_cryptobox_fast_hash_specific (RSPAMD_CRYPTOBOX_XXHASH64,
  1439. str, strlen (str), 0xdeadbabe);
  1440. switch (h) {
  1441. case G_GUINT64_CONSTANT(0x298b9c8a58887d44): /* header */
  1442. ret = RSPAMD_RE_HEADER;
  1443. break;
  1444. case G_GUINT64_CONSTANT(0x467bfb5cd7ddf890): /* rawheader */
  1445. ret = RSPAMD_RE_RAWHEADER;
  1446. break;
  1447. case G_GUINT64_CONSTANT(0xda081341fb600389): /* mime */
  1448. ret = RSPAMD_RE_MIME;
  1449. break;
  1450. case G_GUINT64_CONSTANT(0xc35831e067a8221d): /* rawmime */
  1451. ret = RSPAMD_RE_RAWMIME;
  1452. break;
  1453. case G_GUINT64_CONSTANT(0xc625e13dbe636de2): /* body */
  1454. case G_GUINT64_CONSTANT(0xCCDEBA43518F721C): /* message */
  1455. ret = RSPAMD_RE_BODY;
  1456. break;
  1457. case G_GUINT64_CONSTANT(0x286edbe164c791d2): /* url */
  1458. case G_GUINT64_CONSTANT(0x7D9ACDF6685661A1): /* uri */
  1459. ret = RSPAMD_RE_URL;
  1460. break;
  1461. case G_GUINT64_CONSTANT (0x7e232b0f60b571be): /* email */
  1462. ret = RSPAMD_RE_EMAIL;
  1463. break;
  1464. case G_GUINT64_CONSTANT(0x796d62205a8778c7): /* allheader */
  1465. ret = RSPAMD_RE_ALLHEADER;
  1466. break;
  1467. case G_GUINT64_CONSTANT(0xa3c6c153b3b00a5e): /* mimeheader */
  1468. ret = RSPAMD_RE_MIMEHEADER;
  1469. break;
  1470. case G_GUINT64_CONSTANT(0x7794501506e604e9): /* sabody */
  1471. ret = RSPAMD_RE_SABODY;
  1472. break;
  1473. case G_GUINT64_CONSTANT(0x28828962E7D2A05F): /* sarawbody */
  1474. ret = RSPAMD_RE_SARAWBODY;
  1475. break;
  1476. default:
  1477. ret = RSPAMD_RE_MAX;
  1478. break;
  1479. }
  1480. }
  1481. else {
  1482. ret = RSPAMD_RE_MAX;
  1483. }
  1484. return ret;
  1485. }
  1486. #ifdef WITH_HYPERSCAN
  1487. static gchar *
  1488. rspamd_re_cache_hs_pattern_from_pcre (rspamd_regexp_t *re)
  1489. {
  1490. /*
  1491. * Workaroung for bug in ragel 7.0.0.11
  1492. * https://github.com/intel/hyperscan/issues/133
  1493. */
  1494. const gchar *pat = rspamd_regexp_get_pattern (re);
  1495. guint flags = rspamd_regexp_get_flags (re), esc_flags = RSPAMD_REGEXP_ESCAPE_RE;
  1496. gchar *escaped;
  1497. gsize esc_len;
  1498. if (flags & RSPAMD_REGEXP_FLAG_UTF) {
  1499. esc_flags |= RSPAMD_REGEXP_ESCAPE_UTF;
  1500. }
  1501. escaped = rspamd_str_regexp_escape (pat, strlen (pat), &esc_len,esc_flags);
  1502. return escaped;
  1503. }
  1504. static gboolean
  1505. rspamd_re_cache_is_finite (struct rspamd_re_cache *cache,
  1506. rspamd_regexp_t *re, gint flags, gdouble max_time)
  1507. {
  1508. pid_t cld;
  1509. gint status;
  1510. struct timespec ts;
  1511. hs_compile_error_t *hs_errors;
  1512. hs_database_t *test_db;
  1513. gdouble wait_time;
  1514. const gint max_tries = 10;
  1515. gint tries = 0, rc;
  1516. void (*old_hdl)(int);
  1517. wait_time = max_time / max_tries;
  1518. /* We need to restore SIGCHLD processing */
  1519. old_hdl = signal (SIGCHLD, SIG_DFL);
  1520. cld = fork ();
  1521. if (cld == 0) {
  1522. /* Try to compile pattern */
  1523. gchar *pat = rspamd_re_cache_hs_pattern_from_pcre (re);
  1524. if (hs_compile (pat,
  1525. flags | HS_FLAG_PREFILTER,
  1526. cache->vectorized_hyperscan ? HS_MODE_VECTORED : HS_MODE_BLOCK,
  1527. &cache->plt,
  1528. &test_db,
  1529. &hs_errors) != HS_SUCCESS) {
  1530. msg_info_re_cache ("cannot compile (prefilter mode) '%s' to hyperscan: '%s'",
  1531. pat,
  1532. hs_errors != NULL ? hs_errors->message : "unknown error");
  1533. hs_free_compile_error (hs_errors);
  1534. g_free (pat);
  1535. exit (EXIT_FAILURE);
  1536. }
  1537. g_free (pat);
  1538. exit (EXIT_SUCCESS);
  1539. }
  1540. else if (cld > 0) {
  1541. double_to_ts (wait_time, &ts);
  1542. while ((rc = waitpid (cld, &status, WNOHANG)) == 0 && tries ++ < max_tries) {
  1543. (void)nanosleep (&ts, NULL);
  1544. }
  1545. /* Child has been terminated */
  1546. if (rc > 0) {
  1547. /* Forget about SIGCHLD after this point */
  1548. signal (SIGCHLD, old_hdl);
  1549. if (WIFEXITED (status) && WEXITSTATUS (status) == EXIT_SUCCESS) {
  1550. return TRUE;
  1551. }
  1552. else {
  1553. msg_err_re_cache (
  1554. "cannot approximate %s to hyperscan",
  1555. rspamd_regexp_get_pattern (re));
  1556. return FALSE;
  1557. }
  1558. }
  1559. else {
  1560. /* We consider that as timeout */
  1561. kill (cld, SIGKILL);
  1562. g_assert (waitpid (cld, &status, 0) != -1);
  1563. msg_err_re_cache (
  1564. "cannot approximate %s to hyperscan: timeout waiting",
  1565. rspamd_regexp_get_pattern (re));
  1566. signal (SIGCHLD, old_hdl);
  1567. }
  1568. }
  1569. else {
  1570. msg_err_re_cache (
  1571. "cannot approximate %s to hyperscan: fork failed: %s",
  1572. rspamd_regexp_get_pattern (re), strerror (errno));
  1573. signal (SIGCHLD, old_hdl);
  1574. }
  1575. return FALSE;
  1576. }
  1577. #endif
  1578. #ifdef WITH_HYPERSCAN
  1579. struct rspamd_re_cache_hs_compile_cbdata {
  1580. GHashTableIter it;
  1581. struct rspamd_re_cache *cache;
  1582. const char *cache_dir;
  1583. gdouble max_time;
  1584. gboolean silent;
  1585. guint total;
  1586. void (*cb)(guint ncompiled, GError *err, void *cbd);
  1587. void *cbd;
  1588. };
  1589. static void
  1590. rspamd_re_cache_compile_err (EV_P_ ev_timer *w, GError *err,
  1591. struct rspamd_re_cache_hs_compile_cbdata *cbdata, bool is_fatal)
  1592. {
  1593. cbdata->cb (cbdata->total, err, cbdata->cbd);
  1594. if (is_fatal) {
  1595. ev_timer_stop(EV_A_ w);
  1596. g_free(w);
  1597. g_free(cbdata);
  1598. }
  1599. else {
  1600. /* Continue compilation */
  1601. ev_timer_again(EV_A_ w);
  1602. }
  1603. g_error_free (err);
  1604. }
  1605. static void
  1606. rspamd_re_cache_compile_timer_cb (EV_P_ ev_timer *w, int revents )
  1607. {
  1608. struct rspamd_re_cache_hs_compile_cbdata *cbdata =
  1609. (struct rspamd_re_cache_hs_compile_cbdata *)w->data;
  1610. GHashTableIter cit;
  1611. gpointer k, v;
  1612. struct rspamd_re_class *re_class;
  1613. gchar path[PATH_MAX], npath[PATH_MAX];
  1614. hs_database_t *test_db;
  1615. gint fd, i, n, *hs_ids = NULL, pcre_flags, re_flags;
  1616. rspamd_cryptobox_fast_hash_state_t crc_st;
  1617. guint64 crc;
  1618. rspamd_regexp_t *re;
  1619. hs_compile_error_t *hs_errors = NULL;
  1620. guint *hs_flags = NULL;
  1621. const hs_expr_ext_t **hs_exts = NULL;
  1622. gchar **hs_pats = NULL;
  1623. gchar *hs_serialized = NULL;
  1624. gsize serialized_len;
  1625. struct iovec iov[7];
  1626. struct rspamd_re_cache *cache;
  1627. GError *err;
  1628. pid_t our_pid = getpid ();
  1629. cache = cbdata->cache;
  1630. if (!g_hash_table_iter_next (&cbdata->it, &k, &v)) {
  1631. /* All done */
  1632. ev_timer_stop (EV_A_ w);
  1633. cbdata->cb (cbdata->total, NULL, cbdata->cbd);
  1634. g_free (w);
  1635. g_free (cbdata);
  1636. return;
  1637. }
  1638. re_class = v;
  1639. rspamd_snprintf (path, sizeof (path), "%s%c%s.hs", cbdata->cache_dir,
  1640. G_DIR_SEPARATOR, re_class->hash);
  1641. if (rspamd_re_cache_is_valid_hyperscan_file (cache, path, TRUE, TRUE)) {
  1642. fd = open (path, O_RDONLY, 00600);
  1643. /* Read number of regexps */
  1644. g_assert (fd != -1);
  1645. g_assert (lseek (fd, RSPAMD_HS_MAGIC_LEN + sizeof (cache->plt), SEEK_SET) != -1);
  1646. g_assert (read (fd, &n, sizeof (n)) == sizeof (n));
  1647. close (fd);
  1648. if (re_class->type_len > 0) {
  1649. if (!cbdata->silent) {
  1650. msg_info_re_cache (
  1651. "skip already valid class %s(%*s) to cache %6s, %d regexps",
  1652. rspamd_re_cache_type_to_string (re_class->type),
  1653. (gint) re_class->type_len - 1,
  1654. re_class->type_data,
  1655. re_class->hash,
  1656. n);
  1657. }
  1658. }
  1659. else {
  1660. if (!cbdata->silent) {
  1661. msg_info_re_cache (
  1662. "skip already valid class %s to cache %6s, %d regexps",
  1663. rspamd_re_cache_type_to_string (re_class->type),
  1664. re_class->hash,
  1665. n);
  1666. }
  1667. }
  1668. ev_timer_again (EV_A_ w);
  1669. return;
  1670. }
  1671. rspamd_snprintf (path, sizeof (path), "%s%c%s.%P.hs.new", cbdata->cache_dir,
  1672. G_DIR_SEPARATOR, re_class->hash, our_pid);
  1673. fd = open (path, O_CREAT|O_TRUNC|O_EXCL|O_WRONLY, 00600);
  1674. if (fd == -1) {
  1675. err = g_error_new (rspamd_re_cache_quark (), errno,
  1676. "cannot open file %s: %s", path, strerror (errno));
  1677. rspamd_re_cache_compile_err (EV_A_ w, err, cbdata, false);
  1678. return;
  1679. }
  1680. g_hash_table_iter_init (&cit, re_class->re);
  1681. n = g_hash_table_size (re_class->re);
  1682. hs_flags = g_malloc0 (sizeof (*hs_flags) * n);
  1683. hs_ids = g_malloc (sizeof (*hs_ids) * n);
  1684. hs_pats = g_malloc (sizeof (*hs_pats) * n);
  1685. hs_exts = g_malloc0 (sizeof (*hs_exts) * n);
  1686. i = 0;
  1687. while (g_hash_table_iter_next (&cit, &k, &v)) {
  1688. re = v;
  1689. pcre_flags = rspamd_regexp_get_pcre_flags (re);
  1690. re_flags = rspamd_regexp_get_flags (re);
  1691. if (re_flags & RSPAMD_REGEXP_FLAG_PCRE_ONLY) {
  1692. /* Do not try to compile bad regexp */
  1693. msg_info_re_cache (
  1694. "do not try compile %s to hyperscan as it is PCRE only",
  1695. rspamd_regexp_get_pattern (re));
  1696. continue;
  1697. }
  1698. hs_flags[i] = 0;
  1699. hs_exts[i] = NULL;
  1700. #ifndef WITH_PCRE2
  1701. if (pcre_flags & PCRE_FLAG(UTF8)) {
  1702. hs_flags[i] |= HS_FLAG_UTF8;
  1703. }
  1704. #else
  1705. if (pcre_flags & PCRE_FLAG(UTF)) {
  1706. hs_flags[i] |= HS_FLAG_UTF8;
  1707. }
  1708. #endif
  1709. if (pcre_flags & PCRE_FLAG(CASELESS)) {
  1710. hs_flags[i] |= HS_FLAG_CASELESS;
  1711. }
  1712. if (pcre_flags & PCRE_FLAG(MULTILINE)) {
  1713. hs_flags[i] |= HS_FLAG_MULTILINE;
  1714. }
  1715. if (pcre_flags & PCRE_FLAG(DOTALL)) {
  1716. hs_flags[i] |= HS_FLAG_DOTALL;
  1717. }
  1718. if (re_flags & RSPAMD_REGEXP_FLAG_LEFTMOST) {
  1719. hs_flags[i] |= HS_FLAG_SOM_LEFTMOST;
  1720. }
  1721. else if (rspamd_regexp_get_maxhits (re) == 1) {
  1722. hs_flags[i] |= HS_FLAG_SINGLEMATCH;
  1723. }
  1724. gchar *pat = rspamd_re_cache_hs_pattern_from_pcre (re);
  1725. if (hs_compile (pat,
  1726. hs_flags[i],
  1727. cache->vectorized_hyperscan ? HS_MODE_VECTORED : HS_MODE_BLOCK,
  1728. &cache->plt,
  1729. &test_db,
  1730. &hs_errors) != HS_SUCCESS) {
  1731. msg_info_re_cache ("cannot compile '%s' to hyperscan: '%s', try prefilter match",
  1732. pat,
  1733. hs_errors != NULL ? hs_errors->message : "unknown error");
  1734. hs_free_compile_error (hs_errors);
  1735. /* The approximation operation might take a significant
  1736. * amount of time, so we need to check if it's finite
  1737. */
  1738. if (rspamd_re_cache_is_finite (cache, re, hs_flags[i], cbdata->max_time)) {
  1739. hs_flags[i] |= HS_FLAG_PREFILTER;
  1740. hs_ids[i] = rspamd_regexp_get_cache_id (re);
  1741. hs_pats[i] = pat;
  1742. i++;
  1743. }
  1744. else {
  1745. g_free (pat); /* Avoid leak */
  1746. }
  1747. }
  1748. else {
  1749. hs_ids[i] = rspamd_regexp_get_cache_id (re);
  1750. hs_pats[i] = pat;
  1751. i ++;
  1752. hs_free_database (test_db);
  1753. }
  1754. }
  1755. /* Adjust real re number */
  1756. n = i;
  1757. #define CLEANUP_ALLOCATED(is_err) do { \
  1758. g_free (hs_flags); \
  1759. g_free (hs_ids); \
  1760. for (guint j = 0; j < i; j ++) { \
  1761. g_free (hs_pats[j]); \
  1762. } \
  1763. g_free (hs_pats); \
  1764. g_free (hs_exts); \
  1765. if (is_err) { \
  1766. close (fd); \
  1767. unlink (path); \
  1768. if (hs_errors) hs_free_compile_error (hs_errors); \
  1769. } \
  1770. } while(0)
  1771. if (n > 0) {
  1772. /* Create the hs tree */
  1773. hs_errors = NULL;
  1774. if (hs_compile_ext_multi ((const char **)hs_pats,
  1775. hs_flags,
  1776. hs_ids,
  1777. hs_exts,
  1778. n,
  1779. cache->vectorized_hyperscan ? HS_MODE_VECTORED : HS_MODE_BLOCK,
  1780. &cache->plt,
  1781. &test_db,
  1782. &hs_errors) != HS_SUCCESS) {
  1783. err = g_error_new (rspamd_re_cache_quark (), EINVAL,
  1784. "cannot create tree of regexp when processing '%s': %s",
  1785. hs_pats[hs_errors->expression], hs_errors->message);
  1786. CLEANUP_ALLOCATED(true);
  1787. rspamd_re_cache_compile_err (EV_A_ w, err, cbdata, false);
  1788. return;
  1789. }
  1790. if (hs_serialize_database (test_db, &hs_serialized,
  1791. &serialized_len) != HS_SUCCESS) {
  1792. err = g_error_new (rspamd_re_cache_quark (),
  1793. errno,
  1794. "cannot serialize tree of regexp for %s",
  1795. re_class->hash);
  1796. CLEANUP_ALLOCATED(true);
  1797. hs_free_database (test_db);
  1798. rspamd_re_cache_compile_err (EV_A_ w, err, cbdata, false);
  1799. return;
  1800. }
  1801. hs_free_database (test_db);
  1802. /*
  1803. * Magic - 8 bytes
  1804. * Platform - sizeof (platform)
  1805. * n - number of regexps
  1806. * n * <regexp ids>
  1807. * n * <regexp flags>
  1808. * crc - 8 bytes checksum
  1809. * <hyperscan blob>
  1810. */
  1811. rspamd_cryptobox_fast_hash_init (&crc_st, 0xdeadbabe);
  1812. /* IDs -> Flags -> Hs blob */
  1813. rspamd_cryptobox_fast_hash_update (&crc_st,
  1814. hs_ids, sizeof (*hs_ids) * n);
  1815. rspamd_cryptobox_fast_hash_update (&crc_st,
  1816. hs_flags, sizeof (*hs_flags) * n);
  1817. rspamd_cryptobox_fast_hash_update (&crc_st,
  1818. hs_serialized, serialized_len);
  1819. crc = rspamd_cryptobox_fast_hash_final (&crc_st);
  1820. if (cache->vectorized_hyperscan) {
  1821. iov[0].iov_base = (void *) rspamd_hs_magic_vector;
  1822. }
  1823. else {
  1824. iov[0].iov_base = (void *) rspamd_hs_magic;
  1825. }
  1826. iov[0].iov_len = RSPAMD_HS_MAGIC_LEN;
  1827. iov[1].iov_base = &cache->plt;
  1828. iov[1].iov_len = sizeof (cache->plt);
  1829. iov[2].iov_base = &n;
  1830. iov[2].iov_len = sizeof (n);
  1831. iov[3].iov_base = hs_ids;
  1832. iov[3].iov_len = sizeof (*hs_ids) * n;
  1833. iov[4].iov_base = hs_flags;
  1834. iov[4].iov_len = sizeof (*hs_flags) * n;
  1835. iov[5].iov_base = &crc;
  1836. iov[5].iov_len = sizeof (crc);
  1837. iov[6].iov_base = hs_serialized;
  1838. iov[6].iov_len = serialized_len;
  1839. if (writev (fd, iov, G_N_ELEMENTS (iov)) == -1) {
  1840. err = g_error_new (rspamd_re_cache_quark (),
  1841. errno,
  1842. "cannot serialize tree of regexp to %s: %s",
  1843. path, strerror (errno));
  1844. CLEANUP_ALLOCATED(true);
  1845. g_free (hs_serialized);
  1846. rspamd_re_cache_compile_err (EV_A_ w, err, cbdata, false);
  1847. return;
  1848. }
  1849. if (re_class->type_len > 0) {
  1850. msg_info_re_cache (
  1851. "compiled class %s(%*s) to cache %6s, %d/%d regexps",
  1852. rspamd_re_cache_type_to_string (re_class->type),
  1853. (gint) re_class->type_len - 1,
  1854. re_class->type_data,
  1855. re_class->hash,
  1856. n,
  1857. (gint)g_hash_table_size (re_class->re));
  1858. }
  1859. else {
  1860. msg_info_re_cache (
  1861. "compiled class %s to cache %6s, %d/%d regexps",
  1862. rspamd_re_cache_type_to_string (re_class->type),
  1863. re_class->hash,
  1864. n,
  1865. (gint)g_hash_table_size (re_class->re));
  1866. }
  1867. cbdata->total += n;
  1868. CLEANUP_ALLOCATED(false);
  1869. /* Now rename temporary file to the new .hs file */
  1870. rspamd_snprintf (npath, sizeof (npath), "%s%c%s.hs", cbdata->cache_dir,
  1871. G_DIR_SEPARATOR, re_class->hash);
  1872. if (rename (path, npath) == -1) {
  1873. err = g_error_new (rspamd_re_cache_quark (),
  1874. errno,
  1875. "cannot rename %s to %s: %s",
  1876. path, npath, strerror (errno));
  1877. unlink (path);
  1878. close (fd);
  1879. rspamd_re_cache_compile_err (EV_A_ w, err, cbdata, false);
  1880. return;
  1881. }
  1882. close (fd);
  1883. }
  1884. else {
  1885. err = g_error_new (rspamd_re_cache_quark (),
  1886. errno,
  1887. "no suitable regular expressions %s (%d original): "
  1888. "remove temporary file %s",
  1889. rspamd_re_cache_type_to_string (re_class->type),
  1890. (gint)g_hash_table_size (re_class->re),
  1891. path);
  1892. CLEANUP_ALLOCATED(true);
  1893. rspamd_re_cache_compile_err (EV_A_ w, err, cbdata, false);
  1894. return;
  1895. }
  1896. /* Continue process */
  1897. ev_timer_again (EV_A_ w);
  1898. }
  1899. #endif
  1900. gint
  1901. rspamd_re_cache_compile_hyperscan (struct rspamd_re_cache *cache,
  1902. const char *cache_dir,
  1903. gdouble max_time,
  1904. gboolean silent,
  1905. struct ev_loop *event_loop,
  1906. void (*cb)(guint ncompiled, GError *err, void *cbd),
  1907. void *cbd)
  1908. {
  1909. g_assert (cache != NULL);
  1910. g_assert (cache_dir != NULL);
  1911. #ifndef WITH_HYPERSCAN
  1912. return -1;
  1913. #else
  1914. static ev_timer *timer;
  1915. static const ev_tstamp timer_interval = 0.1;
  1916. struct rspamd_re_cache_hs_compile_cbdata *cbdata;
  1917. cbdata = g_malloc0 (sizeof (*cbdata));
  1918. g_hash_table_iter_init (&cbdata->it, cache->re_classes);
  1919. cbdata->cache = cache;
  1920. cbdata->cache_dir = cache_dir;
  1921. cbdata->cb = cb;
  1922. cbdata->cbd = cbd;
  1923. cbdata->max_time = max_time;
  1924. cbdata->silent = silent;
  1925. cbdata->total = 0;
  1926. timer = g_malloc0 (sizeof (*timer));
  1927. timer->data = (void *)cbdata; /* static */
  1928. ev_timer_init (timer, rspamd_re_cache_compile_timer_cb,
  1929. timer_interval, timer_interval);
  1930. ev_timer_start (event_loop, timer);
  1931. return 0;
  1932. #endif
  1933. }
  1934. gboolean
  1935. rspamd_re_cache_is_valid_hyperscan_file (struct rspamd_re_cache *cache,
  1936. const char *path, gboolean silent, gboolean try_load)
  1937. {
  1938. g_assert (cache != NULL);
  1939. g_assert (path != NULL);
  1940. #ifndef WITH_HYPERSCAN
  1941. return FALSE;
  1942. #else
  1943. gint fd, n, ret;
  1944. guchar magicbuf[RSPAMD_HS_MAGIC_LEN];
  1945. const guchar *mb;
  1946. GHashTableIter it;
  1947. gpointer k, v;
  1948. struct rspamd_re_class *re_class;
  1949. gsize len;
  1950. const gchar *hash_pos;
  1951. hs_platform_info_t test_plt;
  1952. hs_database_t *test_db = NULL;
  1953. guchar *map, *p, *end;
  1954. rspamd_cryptobox_fast_hash_state_t crc_st;
  1955. guint64 crc, valid_crc;
  1956. len = strlen (path);
  1957. if (len < sizeof (rspamd_cryptobox_HASHBYTES + 3)) {
  1958. if (!silent) {
  1959. msg_err_re_cache ("cannot open hyperscan cache file %s: too short filename",
  1960. path);
  1961. }
  1962. return FALSE;
  1963. }
  1964. if (memcmp (path + len - 3, ".hs", 3) != 0) {
  1965. if (!silent) {
  1966. msg_err_re_cache ("cannot open hyperscan cache file %s: not ending with .hs",
  1967. path);
  1968. }
  1969. return FALSE;
  1970. }
  1971. hash_pos = path + len - 3 - (sizeof (re_class->hash) - 1);
  1972. g_hash_table_iter_init (&it, cache->re_classes);
  1973. while (g_hash_table_iter_next (&it, &k, &v)) {
  1974. re_class = v;
  1975. if (memcmp (hash_pos, re_class->hash, sizeof (re_class->hash) - 1) == 0) {
  1976. /* Open file and check magic */
  1977. gssize r;
  1978. fd = open (path, O_RDONLY);
  1979. if (fd == -1) {
  1980. if (errno != ENOENT || !silent) {
  1981. msg_err_re_cache ("cannot open hyperscan cache file %s: %s",
  1982. path, strerror (errno));
  1983. }
  1984. return FALSE;
  1985. }
  1986. if ((r = read (fd, magicbuf, sizeof (magicbuf))) != sizeof (magicbuf)) {
  1987. if (r == -1) {
  1988. msg_err_re_cache ("cannot read magic from hyperscan "
  1989. "cache file %s: %s",
  1990. path, strerror (errno));
  1991. }
  1992. else {
  1993. msg_err_re_cache ("truncated read magic from hyperscan "
  1994. "cache file %s: %z, %z wanted",
  1995. path, r, (gsize)sizeof (magicbuf));
  1996. }
  1997. close (fd);
  1998. return FALSE;
  1999. }
  2000. if (cache->vectorized_hyperscan) {
  2001. mb = rspamd_hs_magic_vector;
  2002. }
  2003. else {
  2004. mb = rspamd_hs_magic;
  2005. }
  2006. if (memcmp (magicbuf, mb, sizeof (magicbuf)) != 0) {
  2007. msg_err_re_cache ("cannot open hyperscan cache file %s: "
  2008. "bad magic ('%*xs', '%*xs' expected)",
  2009. path, (int) RSPAMD_HS_MAGIC_LEN, magicbuf,
  2010. (int) RSPAMD_HS_MAGIC_LEN, mb);
  2011. close (fd);
  2012. return FALSE;
  2013. }
  2014. if ((r = read (fd, &test_plt, sizeof (test_plt))) != sizeof (test_plt)) {
  2015. if (r == -1) {
  2016. msg_err_re_cache ("cannot read platform data from hyperscan "
  2017. "cache file %s: %s",
  2018. path, strerror (errno));
  2019. }
  2020. else {
  2021. msg_err_re_cache ("truncated read platform data from hyperscan "
  2022. "cache file %s: %z, %z wanted",
  2023. path, r, (gsize)sizeof (magicbuf));
  2024. }
  2025. close (fd);
  2026. return FALSE;
  2027. }
  2028. if (memcmp (&test_plt, &cache->plt, sizeof (test_plt)) != 0) {
  2029. msg_err_re_cache ("cannot open hyperscan cache file %s: "
  2030. "compiled for a different platform",
  2031. path);
  2032. close (fd);
  2033. return FALSE;
  2034. }
  2035. close (fd);
  2036. if (try_load) {
  2037. map = rspamd_file_xmap (path, PROT_READ, &len, TRUE);
  2038. if (map == NULL) {
  2039. msg_err_re_cache ("cannot mmap hyperscan cache file %s: "
  2040. "%s",
  2041. path, strerror (errno));
  2042. return FALSE;
  2043. }
  2044. p = map + RSPAMD_HS_MAGIC_LEN + sizeof (test_plt);
  2045. end = map + len;
  2046. n = *(gint *)p;
  2047. p += sizeof (gint);
  2048. if (n <= 0 || 2 * n * sizeof (gint) + /* IDs + flags */
  2049. sizeof (guint64) + /* crc */
  2050. RSPAMD_HS_MAGIC_LEN + /* header */
  2051. sizeof (cache->plt) > len) {
  2052. /* Some wrong amount of regexps */
  2053. msg_err_re_cache ("bad number of expressions in %s: %d",
  2054. path, n);
  2055. munmap (map, len);
  2056. return FALSE;
  2057. }
  2058. /*
  2059. * Magic - 8 bytes
  2060. * Platform - sizeof (platform)
  2061. * n - number of regexps
  2062. * n * <regexp ids>
  2063. * n * <regexp flags>
  2064. * crc - 8 bytes checksum
  2065. * <hyperscan blob>
  2066. */
  2067. memcpy (&crc, p + n * 2 * sizeof (gint), sizeof (crc));
  2068. rspamd_cryptobox_fast_hash_init (&crc_st, 0xdeadbabe);
  2069. /* IDs */
  2070. rspamd_cryptobox_fast_hash_update (&crc_st, p, n * sizeof (gint));
  2071. /* Flags */
  2072. rspamd_cryptobox_fast_hash_update (&crc_st, p + n * sizeof (gint),
  2073. n * sizeof (gint));
  2074. /* HS database */
  2075. p += n * sizeof (gint) * 2 + sizeof (guint64);
  2076. rspamd_cryptobox_fast_hash_update (&crc_st, p, end - p);
  2077. valid_crc = rspamd_cryptobox_fast_hash_final (&crc_st);
  2078. if (crc != valid_crc) {
  2079. msg_warn_re_cache ("outdated or invalid hs database in %s: "
  2080. "crc read %xL, crc expected %xL", path, crc, valid_crc);
  2081. munmap (map, len);
  2082. return FALSE;
  2083. }
  2084. if ((ret = hs_deserialize_database (p, end - p, &test_db))
  2085. != HS_SUCCESS) {
  2086. msg_err_re_cache ("bad hs database in %s: %d", path, ret);
  2087. munmap (map, len);
  2088. return FALSE;
  2089. }
  2090. hs_free_database (test_db);
  2091. munmap (map, len);
  2092. }
  2093. /* XXX: add crc check */
  2094. return TRUE;
  2095. }
  2096. }
  2097. if (!silent) {
  2098. msg_warn_re_cache ("unknown hyperscan cache file %s", path);
  2099. }
  2100. return FALSE;
  2101. #endif
  2102. }
  2103. enum rspamd_hyperscan_status
  2104. rspamd_re_cache_load_hyperscan (struct rspamd_re_cache *cache,
  2105. const char *cache_dir, bool try_load)
  2106. {
  2107. g_assert (cache != NULL);
  2108. g_assert (cache_dir != NULL);
  2109. #ifndef WITH_HYPERSCAN
  2110. return RSPAMD_HYPERSCAN_UNSUPPORTED;
  2111. #else
  2112. gchar path[PATH_MAX];
  2113. gint fd, i, n, *hs_ids = NULL, *hs_flags = NULL, total = 0, ret;
  2114. GHashTableIter it;
  2115. gpointer k, v;
  2116. guint8 *map, *p, *end;
  2117. struct rspamd_re_class *re_class;
  2118. struct rspamd_re_cache_elt *elt;
  2119. struct stat st;
  2120. gboolean has_valid = FALSE, all_valid = FALSE;
  2121. g_hash_table_iter_init (&it, cache->re_classes);
  2122. while (g_hash_table_iter_next (&it, &k, &v)) {
  2123. re_class = v;
  2124. rspamd_snprintf (path, sizeof (path), "%s%c%s.hs", cache_dir,
  2125. G_DIR_SEPARATOR, re_class->hash);
  2126. if (rspamd_re_cache_is_valid_hyperscan_file (cache, path, try_load, FALSE)) {
  2127. msg_debug_re_cache ("load hyperscan database from '%s'",
  2128. re_class->hash);
  2129. fd = open (path, O_RDONLY);
  2130. /* Read number of regexps */
  2131. g_assert (fd != -1);
  2132. fstat (fd, &st);
  2133. map = mmap (NULL, st.st_size, PROT_READ, MAP_SHARED, fd, 0);
  2134. if (map == MAP_FAILED) {
  2135. if (!try_load) {
  2136. msg_err_re_cache ("cannot mmap %s: %s", path, strerror (errno));
  2137. }
  2138. else {
  2139. msg_debug_re_cache ("cannot mmap %s: %s", path, strerror (errno));
  2140. }
  2141. close (fd);
  2142. all_valid = FALSE;
  2143. continue;
  2144. }
  2145. close (fd);
  2146. end = map + st.st_size;
  2147. p = map + RSPAMD_HS_MAGIC_LEN + sizeof (cache->plt);
  2148. n = *(gint *)p;
  2149. if (n <= 0 || 2 * n * sizeof (gint) + /* IDs + flags */
  2150. sizeof (guint64) + /* crc */
  2151. RSPAMD_HS_MAGIC_LEN + /* header */
  2152. sizeof (cache->plt) > (gsize)st.st_size) {
  2153. /* Some wrong amount of regexps */
  2154. if (!try_load) {
  2155. msg_err_re_cache ("bad number of expressions in %s: %d",
  2156. path, n);
  2157. }
  2158. else {
  2159. msg_debug_re_cache ("bad number of expressions in %s: %d",
  2160. path, n);
  2161. }
  2162. munmap (map, st.st_size);
  2163. all_valid = FALSE;
  2164. continue;
  2165. }
  2166. total += n;
  2167. p += sizeof (n);
  2168. hs_ids = g_malloc (n * sizeof (*hs_ids));
  2169. memcpy (hs_ids, p, n * sizeof (*hs_ids));
  2170. p += n * sizeof (*hs_ids);
  2171. hs_flags = g_malloc (n * sizeof (*hs_flags));
  2172. memcpy (hs_flags, p, n * sizeof (*hs_flags));
  2173. /* Skip crc */
  2174. p += n * sizeof (*hs_ids) + sizeof (guint64);
  2175. /* Cleanup */
  2176. if (re_class->hs_scratch != NULL) {
  2177. hs_free_scratch (re_class->hs_scratch);
  2178. }
  2179. if (re_class->hs_db != NULL) {
  2180. hs_free_database (re_class->hs_db);
  2181. }
  2182. if (re_class->hs_ids) {
  2183. g_free (re_class->hs_ids);
  2184. }
  2185. re_class->hs_ids = NULL;
  2186. re_class->hs_scratch = NULL;
  2187. re_class->hs_db = NULL;
  2188. if ((ret = hs_deserialize_database (p, end - p, &re_class->hs_db))
  2189. != HS_SUCCESS) {
  2190. if (!try_load) {
  2191. msg_err_re_cache ("bad hs database in %s: %d", path, ret);
  2192. }
  2193. else {
  2194. msg_debug_re_cache ("bad hs database in %s: %d", path, ret);
  2195. }
  2196. munmap (map, st.st_size);
  2197. g_free (hs_ids);
  2198. g_free (hs_flags);
  2199. re_class->hs_ids = NULL;
  2200. re_class->hs_scratch = NULL;
  2201. re_class->hs_db = NULL;
  2202. all_valid = FALSE;
  2203. continue;
  2204. }
  2205. munmap (map, st.st_size);
  2206. g_assert (hs_alloc_scratch (re_class->hs_db,
  2207. &re_class->hs_scratch) == HS_SUCCESS);
  2208. /*
  2209. * Now find hyperscan elts that are successfully compiled and
  2210. * specify that they should be matched using hyperscan
  2211. */
  2212. for (i = 0; i < n; i ++) {
  2213. g_assert ((gint)cache->re->len > hs_ids[i] && hs_ids[i] >= 0);
  2214. elt = g_ptr_array_index (cache->re, hs_ids[i]);
  2215. if (hs_flags[i] & HS_FLAG_PREFILTER) {
  2216. elt->match_type = RSPAMD_RE_CACHE_HYPERSCAN_PRE;
  2217. }
  2218. else {
  2219. elt->match_type = RSPAMD_RE_CACHE_HYPERSCAN;
  2220. }
  2221. }
  2222. re_class->hs_ids = hs_ids;
  2223. g_free (hs_flags);
  2224. re_class->nhs = n;
  2225. if (!has_valid) {
  2226. has_valid = TRUE;
  2227. all_valid = TRUE;
  2228. }
  2229. }
  2230. else {
  2231. if (!try_load) {
  2232. msg_err_re_cache ("invalid hyperscan hash file '%s'",
  2233. path);
  2234. }
  2235. else {
  2236. msg_debug_re_cache ("invalid hyperscan hash file '%s'",
  2237. path);
  2238. }
  2239. all_valid = FALSE;
  2240. continue;
  2241. }
  2242. }
  2243. if (has_valid) {
  2244. if (all_valid) {
  2245. msg_info_re_cache ("full hyperscan database of %d regexps has been loaded", total);
  2246. cache->hyperscan_loaded = RSPAMD_HYPERSCAN_LOADED_FULL;
  2247. }
  2248. else {
  2249. msg_info_re_cache ("partial hyperscan database of %d regexps has been loaded", total);
  2250. cache->hyperscan_loaded = RSPAMD_HYPERSCAN_LOADED_PARTIAL;
  2251. }
  2252. }
  2253. else {
  2254. msg_info_re_cache ("hyperscan database has NOT been loaded; no valid expressions");
  2255. cache->hyperscan_loaded = RSPAMD_HYPERSCAN_LOAD_ERROR;
  2256. }
  2257. return cache->hyperscan_loaded;
  2258. #endif
  2259. }
  2260. void rspamd_re_cache_add_selector (struct rspamd_re_cache *cache,
  2261. const gchar *sname,
  2262. gint ref)
  2263. {
  2264. khiter_t k;
  2265. k = kh_get (lua_selectors_hash, cache->selectors, (gchar *)sname);
  2266. if (k == kh_end (cache->selectors)) {
  2267. gchar *cpy = g_strdup (sname);
  2268. gint res;
  2269. k = kh_put (lua_selectors_hash, cache->selectors, cpy, &res);
  2270. kh_value (cache->selectors, k) = ref;
  2271. }
  2272. else {
  2273. msg_warn_re_cache ("replacing selector with name %s", sname);
  2274. if (cache->L) {
  2275. luaL_unref (cache->L, LUA_REGISTRYINDEX, kh_value (cache->selectors, k));
  2276. }
  2277. kh_value (cache->selectors, k) = ref;
  2278. }
  2279. }