/**
 *  @brief  StringZilla is a collection of simple string algorithms, designed to be used in Big Data applications.
 *          It may be slower than LibC, but has a broader & cleaner interface, and a very short implementation
 *          targeting modern x86 CPUs with AVX-512 and Arm NEON and older CPUs with SWAR and auto-vectorization.
 *
 *  Consider overriding the following macros to customize the library:
 *
 *  - `SZ_DEBUG=0` - whether to enable debug assertions and logging.
 *  - `SZ_DYNAMIC_DISPATCH=0` - whether to use runtime dispatching of the most advanced SIMD backend.
 *  - `SZ_USE_MISALIGNED_LOADS=0` - whether to use misaligned loads on platforms that support them.
 *  - `SZ_SWAR_THRESHOLD=24` - threshold for switching to SWAR backend over serial byte-level for-loops.
 *  - `SZ_USE_X86_AVX512=?` - whether to use AVX-512 instructions on x86_64.
 *  - `SZ_USE_X86_AVX2=?` - whether to use AVX2 instructions on x86_64.
 *  - `SZ_USE_ARM_NEON=?` - whether to use NEON instructions on ARM.
 *  - `SZ_USE_ARM_SVE=?` - whether to use SVE instructions on ARM.
 *
 *  @see    StringZilla: https://github.com/ashvardanian/StringZilla/blob/main/README.md
 *  @see    LibC String: https://pubs.opengroup.org/onlinepubs/009695399/basedefs/string.h.html
 *
 *  @file   stringzilla.h
 *  @author Ash Vardanian
 */
#ifndef STRINGZILLA_H_
#define STRINGZILLA_H_

#define STRINGZILLA_VERSION_MAJOR 3
#define STRINGZILLA_VERSION_MINOR 5
#define STRINGZILLA_VERSION_PATCH 0
/**
 *  @brief  When set to 1, the library will include the following LibC headers: <stddef.h> and <stdint.h>.
 *          In debug builds (SZ_DEBUG=1), the library will also include <stdio.h> and <stdlib.h>.
 *
 *  You may want to disable this when compiling for use in the kernel, or in embedded systems.
 *  You may also want to avoid these headers if you are sensitive to compilation times and avoid pre-compiled headers.
 *  https://artificial-mind.net/projects/compile-health/
 */
#ifndef SZ_AVOID_LIBC
#define SZ_AVOID_LIBC (0) // true or false
#endif
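
/*
 *  Example of overriding the configuration macros before including the header.
 *  An illustrative sketch; any of the macros listed at the top of this file can be set the same way.
 *
 *      #define SZ_AVOID_LIBC 1       // Building for a freestanding environment.
 *      #define SZ_DYNAMIC_DISPATCH 0 // Resolve SIMD backends at compile time.
 *      #include "stringzilla.h"
 */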
/**
 *  @brief  A misaligned load is an attempt to fetch, say, eight consecutive bytes from an address
 *          that is not divisible by eight. Enabled by default on x86, but not on ARM.
 *
 *  Most platforms support misaligned loads, but there is no industry-standard way to check for that.
 *  This value will mostly affect the performance of the serial (SWAR) backend.
 */
#ifndef SZ_USE_MISALIGNED_LOADS
#if defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || defined(_M_IX86)
#define SZ_USE_MISALIGNED_LOADS (1) // true or false
#else
#define SZ_USE_MISALIGNED_LOADS (0) // true or false
#endif
#endif
/**
 *  @brief  Removes compile-time dispatching and replaces it with runtime dispatching.
 *          With this, the `sz_find` function will invoke the most advanced backend supported by the CPU
 *          that runs the program, rather than the most advanced backend supported by the CPU
 *          used to compile the library or the downstream application.
 */
#ifndef SZ_DYNAMIC_DISPATCH
#define SZ_DYNAMIC_DISPATCH (0) // true or false
#endif

/**
 *  @brief  Analogous to `size_t` and `std::size_t`: an unsigned integer identical in size to a pointer.
 *          64-bit on most platforms where pointers are 64-bit.
 *          32-bit on platforms where pointers are 32-bit.
 */
#if defined(__LP64__) || defined(_LP64) || defined(__x86_64__) || defined(_WIN64)
#define SZ_DETECT_64_BIT (1)
#define SZ_SIZE_MAX (0xFFFFFFFFFFFFFFFFull)  // Largest unsigned integer that fits into 64 bits.
#define SZ_SSIZE_MAX (0x7FFFFFFFFFFFFFFFull) // Largest signed integer that fits into 64 bits.
#else
#define SZ_DETECT_64_BIT (0)
#define SZ_SIZE_MAX (0xFFFFFFFFu)  // Largest unsigned integer that fits into 32 bits.
#define SZ_SSIZE_MAX (0x7FFFFFFFu) // Largest signed integer that fits into 32 bits.
#endif
/**
 *  @brief  On Big-Endian machines StringZilla will work in compatibility mode.
 *          This disables SWAR hacks to minimize code duplication, assuming practically
 *          all modern popular platforms are Little-Endian.
 *
 *  This variable is hard to infer from macros reliably. It's best to set it manually.
 *  For that, CMake provides the `TestBigEndian` module and `CMAKE_<LANG>_BYTE_ORDER` (from 3.20 onwards).
 *  In Python one can check `sys.byteorder == 'big'` in the `setup.py` script and pass the appropriate macro.
 *  https://stackoverflow.com/a/27054190
 */
#ifndef SZ_DETECT_BIG_ENDIAN
#if defined(__BYTE_ORDER) && __BYTE_ORDER == __BIG_ENDIAN || defined(__BIG_ENDIAN__) || defined(__ARMEB__) || \
    defined(__THUMBEB__) || defined(__AARCH64EB__) || defined(_MIBSEB) || defined(__MIBSEB) || defined(__MIBSEB__)
#define SZ_DETECT_BIG_ENDIAN (1) //< It's a big-endian target architecture
#else
#define SZ_DETECT_BIG_ENDIAN (0) //< It's a little-endian target architecture
#endif
#endif

/*
 *  Debugging and testing.
 */
#ifndef SZ_DEBUG
#if defined(DEBUG) || defined(_DEBUG) // Most build systems define one of these in debug builds.
#define SZ_DEBUG (1)
#else
#define SZ_DEBUG (0)
#endif
#endif
/**
 *  @brief  Threshold for switching to SWAR (8 bytes at a time) backend over serial byte-level for-loops.
 *          On very short strings, under 16 bytes long, at most a single word will be processed with SWAR.
 *          Assuming potentially misaligned loads, SWAR makes sense only after ~24 bytes.
 */
#ifndef SZ_SWAR_THRESHOLD
#if SZ_DEBUG
#define SZ_SWAR_THRESHOLD (8u) // 8 bytes in debug builds
#else
#define SZ_SWAR_THRESHOLD (24u) // 24 bytes in release builds
#endif
#endif

/*  Annotation for the public API symbols:
 *
 *  - `SZ_PUBLIC` is used for functions that are part of the public API.
 *  - `SZ_INTERNAL` is used for internal helper functions with unstable APIs.
 *  - `SZ_DYNAMIC` is used for functions that are part of the public API, but are dispatched at runtime.
 */
#ifndef SZ_DYNAMIC
#if SZ_DYNAMIC_DISPATCH
#if defined(_WIN32) || defined(__CYGWIN__)
#define SZ_DYNAMIC __declspec(dllexport)
#define SZ_PUBLIC inline static
#define SZ_INTERNAL inline static
#else
#define SZ_DYNAMIC __attribute__((visibility("default")))
#define SZ_PUBLIC __attribute__((unused)) inline static
#define SZ_INTERNAL __attribute__((always_inline)) inline static
#endif // _WIN32 || __CYGWIN__
#else
#define SZ_DYNAMIC inline static
#define SZ_PUBLIC inline static
#define SZ_INTERNAL inline static
#endif // SZ_DYNAMIC_DISPATCH
#endif // SZ_DYNAMIC

#ifdef __cplusplus
extern "C" {
#endif

/*
 *  Let's infer the integer types or pull them from LibC,
 *  if that is allowed by the user.
 */
#if !SZ_AVOID_LIBC
#include <stddef.h> // `size_t`
#include <stdint.h> // `uint8_t`
typedef int8_t sz_i8_t;       // Always 8 bits
typedef uint8_t sz_u8_t;      // Always 8 bits
typedef uint16_t sz_u16_t;    // Always 16 bits
typedef int32_t sz_i32_t;     // Always 32 bits
typedef uint32_t sz_u32_t;    // Always 32 bits
typedef uint64_t sz_u64_t;    // Always 64 bits
typedef int64_t sz_i64_t;     // Always 64 bits
typedef size_t sz_size_t;     // Pointer-sized unsigned integer, 32 or 64 bits
typedef ptrdiff_t sz_ssize_t; // Signed version of `sz_size_t`, 32 or 64 bits
#else // if SZ_AVOID_LIBC:
typedef signed char sz_i8_t;         // Always 8 bits
typedef unsigned char sz_u8_t;       // Always 8 bits
typedef unsigned short sz_u16_t;     // Always 16 bits
typedef int sz_i32_t;                // Always 32 bits
typedef unsigned int sz_u32_t;       // Always 32 bits
typedef long long sz_i64_t;          // Always 64 bits
typedef unsigned long long sz_u64_t; // Always 64 bits
#if SZ_DETECT_64_BIT
typedef unsigned long long sz_size_t; // 64-bit.
typedef long long sz_ssize_t;         // 64-bit.
#else
typedef unsigned sz_size_t;  // 32-bit.
typedef unsigned sz_ssize_t; // 32-bit.
#endif // SZ_DETECT_64_BIT
#endif // SZ_AVOID_LIBC
/**
 *  @brief  Compile-time assert macro similar to `static_assert` in C++.
 */
#define sz_static_assert(condition, name)                \
    typedef struct {                                     \
        int static_assert_##name : (condition) ? 1 : -1; \
    } sz_static_assert_##name##_t

sz_static_assert(sizeof(sz_size_t) == sizeof(void *), sz_size_t_must_be_pointer_size);
sz_static_assert(sizeof(sz_ssize_t) == sizeof(void *), sz_ssize_t_must_be_pointer_size);

#pragma region Public API

typedef char *sz_ptr_t;           // A type alias for `char *`
typedef char const *sz_cptr_t;    // A type alias for `char const *`
typedef sz_i8_t sz_error_cost_t;  // Character mismatch cost for fuzzy matching functions
typedef sz_u64_t sz_sorted_idx_t; // Index of a sorted string in a list of strings

typedef enum { sz_false_k = 0, sz_true_k = 1 } sz_bool_t;                        // Only one relevant bit
typedef enum { sz_less_k = -1, sz_equal_k = 0, sz_greater_k = 1 } sz_ordering_t; // Only three possible states: <=>
/**
 *  @brief  Tiny string-view structure. It's a POD type, unlike `std::string_view`.
 */
typedef struct sz_string_view_t {
    sz_cptr_t start;
    sz_size_t length;
} sz_string_view_t;

/**
 *  @brief  Enumeration of SIMD capabilities of the target architecture.
 *          Used to introspect the supported functionality of the dynamic library.
 */
typedef enum sz_capability_t {
    sz_cap_serial_k = 1,       /// Serial (non-SIMD) capability
    sz_cap_any_k = 0x7FFFFFFF, /// Mask representing any capability

    sz_cap_arm_neon_k = 1 << 10, /// ARM NEON capability
    sz_cap_arm_sve_k = 1 << 11,  /// ARM SVE capability TODO: Not yet supported or used

    sz_cap_x86_avx2_k = 1 << 20,       /// x86 AVX2 capability
    sz_cap_x86_avx512f_k = 1 << 21,    /// x86 AVX512 F capability
    sz_cap_x86_avx512bw_k = 1 << 22,   /// x86 AVX512 BW instruction capability
    sz_cap_x86_avx512vl_k = 1 << 23,   /// x86 AVX512 VL instruction capability
    sz_cap_x86_avx512vbmi_k = 1 << 24, /// x86 AVX512 VBMI instruction capability
    sz_cap_x86_gfni_k = 1 << 25,       /// x86 AVX512 GFNI instruction capability
} sz_capability_t;

/**
 *  @brief  Function to determine the SIMD capabilities of the current machine @b only at @b runtime.
 *  @return A bitmask of the SIMD capabilities represented as a `sz_capability_t` enum value.
 */
SZ_DYNAMIC sz_capability_t sz_capabilities(void);
/**
 *  @brief  Bit-set structure for 256 possible byte values. Useful for filtering and search.
 *  @see    sz_charset_init, sz_charset_add, sz_charset_contains, sz_charset_invert
 */
typedef union sz_charset_t {
    sz_u64_t _u64s[4];
    sz_u32_t _u32s[8];
    sz_u16_t _u16s[16];
    sz_u8_t _u8s[32];
} sz_charset_t;

/** @brief  Initializes a bit-set to an empty collection, meaning - all characters are banned. */
SZ_PUBLIC void sz_charset_init(sz_charset_t *s) { s->_u64s[0] = s->_u64s[1] = s->_u64s[2] = s->_u64s[3] = 0; }

/** @brief  Adds a character to the set and accepts @b unsigned integers. */
SZ_PUBLIC void sz_charset_add_u8(sz_charset_t *s, sz_u8_t c) { s->_u64s[c >> 6] |= (1ull << (c & 63u)); }

/** @brief  Adds a character to the set. Consider @b sz_charset_add_u8. */
SZ_PUBLIC void sz_charset_add(sz_charset_t *s, char c) { sz_charset_add_u8(s, *(sz_u8_t *)(&c)); } // bitcast

/** @brief  Checks if the set contains a given character and accepts @b unsigned integers. */
SZ_PUBLIC sz_bool_t sz_charset_contains_u8(sz_charset_t const *s, sz_u8_t c) {
    // Checking the bit can be done in different ways:
    // - (s->_u64s[c >> 6] & (1ull << (c & 63u))) != 0
    // - (s->_u32s[c >> 5] & (1u << (c & 31u))) != 0
    // - (s->_u16s[c >> 4] & (1u << (c & 15u))) != 0
    // - (s->_u8s[c >> 3] & (1u << (c & 7u))) != 0
    return (sz_bool_t)((s->_u64s[c >> 6] & (1ull << (c & 63u))) != 0);
}

/** @brief  Checks if the set contains a given character. Consider @b sz_charset_contains_u8. */
SZ_PUBLIC sz_bool_t sz_charset_contains(sz_charset_t const *s, char c) {
    return sz_charset_contains_u8(s, *(sz_u8_t *)(&c)); // bitcast
}

/** @brief  Inverts the contents of the set, so allowed characters get disallowed, and vice versa. */
SZ_PUBLIC void sz_charset_invert(sz_charset_t *s) {
    s->_u64s[0] ^= 0xFFFFFFFFFFFFFFFFull, s->_u64s[1] ^= 0xFFFFFFFFFFFFFFFFull, //
        s->_u64s[2] ^= 0xFFFFFFFFFFFFFFFFull, s->_u64s[3] ^= 0xFFFFFFFFFFFFFFFFull;
}
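
/*
 *  Example of composing and probing a byte-set with the functions above.
 *  A minimal sketch; the chosen characters are arbitrary.
 *
 *      sz_charset_t digits;
 *      sz_charset_init(&digits); // Empty set: everything is banned.
 *      for (char c = '0'; c <= '9'; ++c) sz_charset_add(&digits, c);
 *      sz_bool_t has_five = sz_charset_contains(&digits, '5'); // sz_true_k
 *      sz_charset_invert(&digits); // Now matches any non-digit byte.
 */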
typedef void *(*sz_memory_allocate_t)(sz_size_t, void *);
typedef void (*sz_memory_free_t)(void *, sz_size_t, void *);
typedef sz_u64_t (*sz_random_generator_t)(void *);

/**
 *  @brief  Some complex pattern matching algorithms may require memory allocations.
 *          This structure is used to pass the memory allocator to those functions.
 *  @see    sz_memory_allocator_init_fixed
 */
typedef struct sz_memory_allocator_t {
    sz_memory_allocate_t allocate;
    sz_memory_free_t free;
    void *handle;
} sz_memory_allocator_t;

/**
 *  @brief  Initializes a memory allocator to use the system default `malloc` and `free`.
 *          ! The function is not available if the library was compiled with `SZ_AVOID_LIBC`.
 *
 *  @param alloc    Memory allocator to initialize.
 */
SZ_PUBLIC void sz_memory_allocator_init_default(sz_memory_allocator_t *alloc);

/**
 *  @brief  Initializes a memory allocator to use a static-capacity buffer.
 *          No dynamic allocations will be performed.
 *
 *  @param alloc    Memory allocator to initialize.
 *  @param buffer   Buffer to use for allocations.
 *  @param length   Length of the buffer. @b Must be greater than 8 bytes. Different values would be optimal for
 *                  different algorithms and input lengths, but 4096 bytes (one RAM page) is a good default.
 */
SZ_PUBLIC void sz_memory_allocator_init_fixed(sz_memory_allocator_t *alloc, void *buffer, sz_size_t length);
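
/*
 *  Example of wiring a fixed-capacity allocator to a stack buffer, following the
 *  4096-byte recommendation above. A sketch; the buffer must outlive all its users.
 *
 *      char buffer[4096];
 *      sz_memory_allocator_t alloc;
 *      sz_memory_allocator_init_fixed(&alloc, buffer, sizeof(buffer));
 *      // Pass `&alloc` to allocating APIs, like `sz_edit_distance`.
 */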
/**
 *  @brief  The number of bytes a stack-allocated string can hold, including the SZ_NULL termination character.
 *          ! This can't be changed from outside. Don't use the `#error` as it may already be included and set.
 */
#ifdef SZ_STRING_INTERNAL_SPACE
#undef SZ_STRING_INTERNAL_SPACE
#endif
#define SZ_STRING_INTERNAL_SPACE (sizeof(sz_size_t) * 3 - 1) // 3 pointers minus one byte for an 8-bit length

/**
 *  @brief  Tiny memory-owning string structure with a Small String Optimization (SSO).
 *          Differs in layout from Folly, Clang, GCC, and probably most other implementations.
 *          It's designed to avoid any branches on read-only operations, and can store up
 *          to 22 characters on the stack on 64-bit machines, followed by the SZ_NULL-termination character.
 *
 *  @section Changing Length
 *
 *  One nice thing about this design is that you can, in many cases, change the length of the string
 *  without any branches, invoking a `+=` or `-=` on the 64-bit `length` field. If the string is on the heap,
 *  the solution is obvious. If it's on the stack, an in-place decrement wouldn't affect the top bytes of the
 *  string, only changing the last byte containing the length.
 */
typedef union sz_string_t {

#if !SZ_DETECT_BIG_ENDIAN

    struct external {
        sz_ptr_t start;
        sz_size_t length;
        sz_size_t space;
        sz_size_t padding;
    } external;

    struct internal {
        sz_ptr_t start;
        sz_u8_t length;
        char chars[SZ_STRING_INTERNAL_SPACE];
    } internal;

#else

    struct external {
        sz_ptr_t start;
        sz_size_t space;
        sz_size_t padding;
        sz_size_t length;
    } external;

    struct internal {
        sz_ptr_t start;
        char chars[SZ_STRING_INTERNAL_SPACE];
        sz_u8_t length;
    } internal;

#endif

    sz_size_t words[4];

} sz_string_t;
typedef sz_u64_t (*sz_hash_t)(sz_cptr_t, sz_size_t);
typedef sz_bool_t (*sz_equal_t)(sz_cptr_t, sz_cptr_t, sz_size_t);
typedef sz_ordering_t (*sz_order_t)(sz_cptr_t, sz_size_t, sz_cptr_t, sz_size_t);
typedef void (*sz_to_converter_t)(sz_cptr_t, sz_size_t, sz_ptr_t);

/**
 *  @brief  Computes the 64-bit unsigned hash of a string. Fairly fast for short strings,
 *          simple implementation, and supports rolling computation, reused in other APIs.
 *          Similar to `std::hash` in C++.
 *
 *  @param text     String to hash.
 *  @param length   Number of bytes in the text.
 *  @return         64-bit hash value.
 *
 *  @see    sz_hashes, sz_hashes_fingerprint, sz_hashes_intersection
 */
SZ_PUBLIC sz_u64_t sz_hash(sz_cptr_t text, sz_size_t length);

/** @copydoc sz_hash */
SZ_PUBLIC sz_u64_t sz_hash_serial(sz_cptr_t text, sz_size_t length);
/**
 *  @brief  Checks if two strings are equal.
 *          Similar to `memcmp(a, b, length) == 0` in LibC and `a == b` in STL.
 *
 *  The implementation of this function is very similar to `sz_order`, but the usage patterns are different.
 *  This function is more often used in parsing, while `sz_order` is often used in sorting.
 *  It works best on platforms with cheap unaligned loads.
 *
 *  @param a        First string to compare.
 *  @param b        Second string to compare.
 *  @param length   Number of bytes in both strings.
 *  @return         1 if strings match, 0 otherwise.
 */
SZ_DYNAMIC sz_bool_t sz_equal(sz_cptr_t a, sz_cptr_t b, sz_size_t length);

/** @copydoc sz_equal */
SZ_PUBLIC sz_bool_t sz_equal_serial(sz_cptr_t a, sz_cptr_t b, sz_size_t length);
/**
 *  @brief  Estimates the relative order of two strings. Equivalent to `memcmp(a, b, length)` in LibC.
 *          Can be used on different length strings.
 *
 *  @param a        First string to compare.
 *  @param a_length Number of bytes in the first string.
 *  @param b        Second string to compare.
 *  @param b_length Number of bytes in the second string.
 *  @return         Negative if (a < b), positive if (a > b), zero if they are equal.
 */
SZ_DYNAMIC sz_ordering_t sz_order(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length);

/** @copydoc sz_order */
SZ_PUBLIC sz_ordering_t sz_order_serial(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length);
/**
 *  @brief  Equivalent to `for (char & c : text) c = tolower(c)`.
 *
 *  ASCII characters [A, Z] map to decimals [65, 90], and [a, z] map to [97, 122].
 *  So there are 26 English letters, shifted by 32 values, meaning that a conversion
 *  can be done by flipping the 5th bit of each inappropriate character byte. This, however,
 *  breaks for extended ASCII, so a different solution is needed.
 *  http://0x80.pl/notesen/2016-01-06-swar-swap-case.html
 *
 *  @param text     String to be normalized.
 *  @param length   Number of bytes in the string.
 *  @param result   Output string, can point to the same address as ::text.
 */
SZ_PUBLIC void sz_tolower(sz_cptr_t text, sz_size_t length, sz_ptr_t result);
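
/*
 *  The bit trick above, spelled out for a single known-ASCII letter: bit 5 (value 32)
 *  is what separates the two cases, so forcing it on lowers an uppercase byte.
 *  An illustrative sketch, not the extended-ASCII-safe routine used internally.
 *
 *      char upper = 'A';                  // 65 == 0x41
 *      char lower = (char)(upper | 0x20); // 97 == 0x61 == 'a'
 */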
/**
 *  @brief  Equivalent to `for (char & c : text) c = toupper(c)`.
 *
 *  ASCII characters [A, Z] map to decimals [65, 90], and [a, z] map to [97, 122].
 *  So there are 26 English letters, shifted by 32 values, meaning that a conversion
 *  can be done by flipping the 5th bit of each inappropriate character byte. This, however,
 *  breaks for extended ASCII, so a different solution is needed.
 *  http://0x80.pl/notesen/2016-01-06-swar-swap-case.html
 *
 *  @param text     String to be normalized.
 *  @param length   Number of bytes in the string.
 *  @param result   Output string, can point to the same address as ::text.
 */
SZ_PUBLIC void sz_toupper(sz_cptr_t text, sz_size_t length, sz_ptr_t result);

/**
 *  @brief  Equivalent to `for (char & c : text) c = toascii(c)`.
 *
 *  @param text     String to be normalized.
 *  @param length   Number of bytes in the string.
 *  @param result   Output string, can point to the same address as ::text.
 */
SZ_PUBLIC void sz_toascii(sz_cptr_t text, sz_size_t length, sz_ptr_t result);
/**
 *  @brief  Checks if all characters in the range are valid ASCII characters.
 *
 *  @param text     String to be analyzed.
 *  @param length   Number of bytes in the string.
 *  @return         Whether all characters are valid ASCII characters.
 */
SZ_PUBLIC sz_bool_t sz_isascii(sz_cptr_t text, sz_size_t length);
/**
 *  @brief  Generates a random string for a given alphabet, avoiding integer division and modulo operations.
 *          Similar to `text[i] = alphabet[rand() % cardinality]`.
 *
 *  The modulo operation is expensive, and should be avoided in performance-critical code.
 *  We avoid it by using small lookup tables and replacing it with a multiplication and shifts, similar to `libdivide`.
 *  Alternative algorithms would include:
 *      - Montgomery form: https://en.algorithmica.org/hpc/number-theory/montgomery/
 *      - Barrett reduction: https://www.nayuki.io/page/barrett-reduction-algorithm
 *      - Lemire's trick: https://lemire.me/blog/2016/06/27/a-fast-alternative-to-the-modulo-reduction/
 *
 *  @param alphabet     Set of characters to sample from.
 *  @param cardinality  Number of characters to sample from.
 *  @param text         Output string to populate.
 *  @param length       Number of bytes in the output string.
 *  @param generate     Callback producing random numbers given the generator state.
 *  @param generator    Generator state, can be a pointer to a seed, or a pointer to a random number generator.
 */
SZ_DYNAMIC void sz_generate(sz_cptr_t alphabet, sz_size_t cardinality, sz_ptr_t text, sz_size_t length,
                            sz_random_generator_t generate, void *generator);

/** @copydoc sz_generate */
SZ_PUBLIC void sz_generate_serial(sz_cptr_t alphabet, sz_size_t cardinality, sz_ptr_t text, sz_size_t length,
                                  sz_random_generator_t generate, void *generator);
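
/*
 *  The multiplication-and-shift reduction mentioned above, in its simplest form
 *  (Lemire's trick): maps a 32-bit random value into [0, cardinality) without a
 *  division. An illustrative sketch, not necessarily the exact scheme used internally.
 *
 *      sz_u32_t random32 = (sz_u32_t)generate(generator);
 *      sz_size_t index = (sz_size_t)(((sz_u64_t)random32 * (sz_u64_t)cardinality) >> 32);
 *      text[i] = alphabet[index];
 */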
/**
 *  @brief  Similar to `memcpy`, copies contents of one string into another.
 *          The behavior is undefined if the strings overlap.
 *
 *  @param target   String to copy into.
 *  @param source   String to copy from.
 *  @param length   Number of bytes to copy.
 */
SZ_DYNAMIC void sz_copy(sz_ptr_t target, sz_cptr_t source, sz_size_t length);

/** @copydoc sz_copy */
SZ_PUBLIC void sz_copy_serial(sz_ptr_t target, sz_cptr_t source, sz_size_t length);

/**
 *  @brief  Similar to `memmove`, copies (moves) contents of one string into another.
 *          Unlike `sz_copy`, allows overlapping strings as arguments.
 *
 *  @param target   String to copy into.
 *  @param source   String to copy from.
 *  @param length   Number of bytes to copy.
 */
SZ_DYNAMIC void sz_move(sz_ptr_t target, sz_cptr_t source, sz_size_t length);

/** @copydoc sz_move */
SZ_PUBLIC void sz_move_serial(sz_ptr_t target, sz_cptr_t source, sz_size_t length);

typedef void (*sz_move_t)(sz_ptr_t, sz_cptr_t, sz_size_t);
/**
 *  @brief  Similar to `memset`, fills a string with a given value.
 *
 *  @param target   String to fill.
 *  @param length   Number of bytes to fill.
 *  @param value    Value to fill with.
 */
SZ_DYNAMIC void sz_fill(sz_ptr_t target, sz_size_t length, sz_u8_t value);

/** @copydoc sz_fill */
SZ_PUBLIC void sz_fill_serial(sz_ptr_t target, sz_size_t length, sz_u8_t value);

typedef void (*sz_fill_t)(sz_ptr_t, sz_size_t, sz_u8_t);
/**
 *  @brief  Initializes a string class instance to an empty value.
 */
SZ_PUBLIC void sz_string_init(sz_string_t *string);

/**
 *  @brief  Convenience function checking if the provided string is stored inside of the ::string instance itself,
 *          the alternative being - allocated in a remote region of the heap.
 */
SZ_PUBLIC sz_bool_t sz_string_is_on_stack(sz_string_t const *string);

/**
 *  @brief  Unpacks the opaque instance of a string class into its components.
 *          Recommended to use only in read-only operations.
 *
 *  @param string       String to unpack.
 *  @param start        Pointer to the start of the string.
 *  @param length       Number of bytes in the string, before the SZ_NULL character.
 *  @param space        Number of bytes allocated for the string (heap or stack), including the SZ_NULL character.
 *  @param is_external  Whether the string is allocated on the heap externally, or fits within the ::string instance.
 */
SZ_PUBLIC void sz_string_unpack(sz_string_t const *string, sz_ptr_t *start, sz_size_t *length, sz_size_t *space,
                                sz_bool_t *is_external);

/**
 *  @brief  Unpacks only the start and length of the string.
 *          Recommended to use only in read-only operations.
 *
 *  @param string   String to unpack.
 *  @param start    Pointer to the start of the string.
 *  @param length   Number of bytes in the string, before the SZ_NULL character.
 */
SZ_PUBLIC void sz_string_range(sz_string_t const *string, sz_ptr_t *start, sz_size_t *length);
/**
 *  @brief  Constructs a string of a given ::length with noisy contents.
 *          Use the returned character pointer to populate the string.
 *
 *  @param string       String to initialize.
 *  @param length       Number of bytes in the string, before the SZ_NULL character.
 *  @param allocator    Memory allocator to use for the allocation.
 *  @return             SZ_NULL if the operation failed, pointer to the start of the string otherwise.
 */
SZ_PUBLIC sz_ptr_t sz_string_init_length(sz_string_t *string, sz_size_t length, sz_memory_allocator_t *allocator);

/**
 *  @brief  Doesn't change the contents or the length of the string, but grows the available memory capacity.
 *          This is beneficial, if several insertions are expected, and we want to minimize allocations.
 *
 *  @param string       String to grow.
 *  @param new_capacity The number of characters to reserve space for, including existing ones.
 *  @param allocator    Memory allocator to use for the allocation.
 *  @return             SZ_NULL if the operation failed, pointer to the new start of the string otherwise.
 */
SZ_PUBLIC sz_ptr_t sz_string_reserve(sz_string_t *string, sz_size_t new_capacity, sz_memory_allocator_t *allocator);

/**
 *  @brief  Grows the string by adding an uninitialized region of ::added_length at the given ::offset.
 *          Would often be used in conjunction with one or more `sz_copy` calls to populate the allocated region.
 *          Similar to `sz_string_reserve`, but changes the length of the ::string.
 *
 *  @param string       String to grow.
 *  @param offset       Offset of the first byte to reserve space for.
 *                      If the provided offset is larger than the length, it will be capped.
 *  @param added_length The number of new characters to reserve space for.
 *  @param allocator    Memory allocator to use for the allocation.
 *  @return             SZ_NULL if the operation failed, pointer to the new start of the string otherwise.
 */
SZ_PUBLIC sz_ptr_t sz_string_expand(sz_string_t *string, sz_size_t offset, sz_size_t added_length,
                                    sz_memory_allocator_t *allocator);

/**
 *  @brief  Removes a range from a string. Changes the length, but not the capacity.
 *          Performs no allocations or deallocations and can't fail.
 *
 *  @param string   String to clean.
 *  @param offset   Offset of the first byte to remove.
 *  @param length   Number of bytes to remove. Out-of-bound ranges will be capped.
 *  @return         Number of bytes removed.
 */
SZ_PUBLIC sz_size_t sz_string_erase(sz_string_t *string, sz_size_t offset, sz_size_t length);
/**
 *  @brief  Shrinks the string to fit the current length, if it's allocated on the heap.
 *          The reverse operation of ::sz_string_reserve.
 *
 *  @param string       String to shrink.
 *  @param allocator    Memory allocator to use for the allocation.
 *  @return             Whether the operation was successful. The only failures can come from the allocator.
 */
SZ_PUBLIC sz_ptr_t sz_string_shrink_to_fit(sz_string_t *string, sz_memory_allocator_t *allocator);

/**
 *  @brief  Frees the string, if it's allocated on the heap.
 *          If the string is on the stack, the function clears/resets the state.
 */
SZ_PUBLIC void sz_string_free(sz_string_t *string, sz_memory_allocator_t *allocator);
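
/*
 *  Example of a full lifecycle of an owning string. A sketch assuming the default
 *  LibC-backed allocator; every allocating call may return SZ_NULL on failure.
 *
 *      sz_memory_allocator_t alloc;
 *      sz_memory_allocator_init_default(&alloc);
 *      sz_string_t string;
 *      sz_ptr_t start = sz_string_init_length(&string, 5, &alloc);
 *      if (start) {
 *          sz_copy(start, "hello", 5);      // Populate the noisy contents.
 *          sz_string_erase(&string, 4, 1);  // Now "hell"; capacity is unchanged.
 *          sz_string_free(&string, &alloc); // Clears the state for on-stack strings.
 *      }
 */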
#pragma endregion

#pragma region Fast Substring Search API

typedef sz_cptr_t (*sz_find_byte_t)(sz_cptr_t, sz_size_t, sz_cptr_t);
typedef sz_cptr_t (*sz_find_t)(sz_cptr_t, sz_size_t, sz_cptr_t, sz_size_t);
typedef sz_cptr_t (*sz_find_set_t)(sz_cptr_t, sz_size_t, sz_charset_t const *);

/**
 *  @brief  Locates the first matching byte in a string. Equivalent to `memchr(haystack, *needle, h_length)` in LibC.
 *
 *  X86_64 implementation: https://github.com/lattera/glibc/blob/master/sysdeps/x86_64/memchr.S
 *  Aarch64 implementation: https://github.com/lattera/glibc/blob/master/sysdeps/aarch64/memchr.S
 *
 *  @param haystack Haystack - the string to search in.
 *  @param h_length Number of bytes in the haystack.
 *  @param needle   Needle - single-byte substring to find.
 *  @return         Address of the first match.
 */
SZ_DYNAMIC sz_cptr_t sz_find_byte(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle);

/** @copydoc sz_find_byte */
SZ_PUBLIC sz_cptr_t sz_find_byte_serial(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle);

/**
 *  @brief  Locates the last matching byte in a string. Equivalent to `memrchr(haystack, *needle, h_length)` in LibC.
 *
 *  X86_64 implementation: https://github.com/lattera/glibc/blob/master/sysdeps/x86_64/memrchr.S
 *  Aarch64 implementation: missing
 *
 *  @param haystack Haystack - the string to search in.
 *  @param h_length Number of bytes in the haystack.
 *  @param needle   Needle - single-byte substring to find.
 *  @return         Address of the last match.
 */
SZ_DYNAMIC sz_cptr_t sz_rfind_byte(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle);

/** @copydoc sz_rfind_byte */
SZ_PUBLIC sz_cptr_t sz_rfind_byte_serial(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle);
/**
 *  @brief  Locates the first matching substring.
 *          Equivalent to `memmem(haystack, h_length, needle, n_length)` in LibC.
 *          Similar to `strstr(haystack, needle)` in LibC, but requires a known length.
 *
 *  @param haystack Haystack - the string to search in.
 *  @param h_length Number of bytes in the haystack.
 *  @param needle   Needle - substring to find.
 *  @param n_length Number of bytes in the needle.
 *  @return         Address of the first match.
 */
SZ_DYNAMIC sz_cptr_t sz_find(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle, sz_size_t n_length);

/** @copydoc sz_find */
SZ_PUBLIC sz_cptr_t sz_find_serial(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle, sz_size_t n_length);
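
/*
 *  Example of enumerating all occurrences of a needle by resuming the search one byte
 *  past each match. A sketch assuming an SZ_NULL result marks "not found", as with `memchr`.
 *
 *      sz_cptr_t haystack = "abracadabra";
 *      sz_size_t h_length = 11;
 *      sz_cptr_t match = sz_find(haystack, h_length, "abra", 4);
 *      while (match) {
 *          sz_size_t skip = (sz_size_t)(match - haystack) + 1;
 *          match = sz_find(haystack + skip, h_length - skip, "abra", 4);
 *      }
 */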
/**
 *  @brief  Locates the last matching substring.
 *
 *  @param haystack Haystack - the string to search in.
 *  @param h_length Number of bytes in the haystack.
 *  @param needle   Needle - substring to find.
 *  @param n_length Number of bytes in the needle.
 *  @return         Address of the last match.
 */
SZ_DYNAMIC sz_cptr_t sz_rfind(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle, sz_size_t n_length);

/** @copydoc sz_rfind */
SZ_PUBLIC sz_cptr_t sz_rfind_serial(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle, sz_size_t n_length);
/**
 *  @brief  Finds the first character from the ::set present in ::text.
 *          Similar in use to `strspn(text, accepted)` and `strcspn(text, rejected)` in LibC,
 *          but returns an address rather than a length.
 *          May have identical implementation and performance to ::sz_rfind_charset.
 *
 *  @param text     String to be scanned.
 *  @param length   Number of bytes in the text.
 *  @param set      Set of relevant characters.
 *  @return         Address of the first match.
 */
SZ_DYNAMIC sz_cptr_t sz_find_charset(sz_cptr_t text, sz_size_t length, sz_charset_t const *set);

/** @copydoc sz_find_charset */
SZ_PUBLIC sz_cptr_t sz_find_charset_serial(sz_cptr_t text, sz_size_t length, sz_charset_t const *set);

/**
 *  @brief  Finds the last character from the ::set present in ::text.
 *          Similar in use to `strspn(text, accepted)` and `strcspn(text, rejected)` in LibC,
 *          but returns an address rather than a length.
 *          May have identical implementation and performance to ::sz_find_charset.
 *
 *  Useful for parsing, when we want to skip a set of characters. Examples:
 *  * 6 whitespaces: " \t\n\r\v\f".
 *  * 16 digits forming a float number: "0123456789,.eE+-".
 *  * 5 HTML reserved characters: "\"'&<>", of which "<>" can be useful for parsing.
 *  * 2 JSON string special characters useful to locate the end of the string: "\"\\" (sketched below).
 *
 *  @param text     String to be scanned.
 *  @param length   Number of bytes in the text.
 *  @param set      Set of relevant characters.
 *  @return         Address of the last match.
 */
SZ_DYNAMIC sz_cptr_t sz_rfind_charset(sz_cptr_t text, sz_size_t length, sz_charset_t const *set);

/** @copydoc sz_rfind_charset */
SZ_PUBLIC sz_cptr_t sz_rfind_charset_serial(sz_cptr_t text, sz_size_t length, sz_charset_t const *set);
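
/*
 *  Example of the JSON use-case from the list above: locating the end of a string body
 *  by searching for both special characters at once. A sketch; `json` and `json_length`
 *  are assumed to describe the input buffer, and escape handling is left out.
 *
 *      sz_charset_t specials;
 *      sz_charset_init(&specials);
 *      sz_charset_add(&specials, '"');
 *      sz_charset_add(&specials, '\\');
 *      sz_cptr_t stop = sz_find_charset(json, json_length, &specials);
 */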
  655. #pragma endregion
  656. #pragma region String Similarity Measures API
  657. /**
  658. * @brief Computes the Hamming distance between two strings - number of not matching characters.
  659. * Difference in length is is counted as a mismatch.
  660. *
  661. * @param a First string to compare.
  662. * @param a_length Number of bytes in the first string.
  663. * @param b Second string to compare.
  664. * @param b_length Number of bytes in the second string.
  665. *
  666. * @param bound Upper bound on the distance, that allows us to exit early.
  667. * If zero is passed, the maximum possible distance will be equal to the length of the longer input.
  668. * @return Unsigned integer for the distance, the `bound` if was exceeded.
  669. *
  670. * @see sz_hamming_distance_utf8
  671. * @see https://en.wikipedia.org/wiki/Hamming_distance
  672. */
  673. SZ_DYNAMIC sz_size_t sz_hamming_distance(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length,
  674. sz_size_t bound);
  675. /** @copydoc sz_hamming_distance */
  676. SZ_PUBLIC sz_size_t sz_hamming_distance_serial(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length,
  677. sz_size_t bound);
/**
 * @brief Computes the Hamming distance between two @b UTF8 strings - the number of mismatching
 * characters, counted in Unicode codepoints rather than bytes. A difference in length is counted as a mismatch.
 *
 * @param a First string to compare.
 * @param a_length Number of bytes in the first string.
 * @param b Second string to compare.
 * @param b_length Number of bytes in the second string.
 *
 * @param bound Upper bound on the distance that allows us to exit early.
 * If zero is passed, the maximum possible distance will be equal to the length of the longer input.
 * @return Unsigned integer for the distance, or the `bound` if it was exceeded.
 *
 * @see sz_hamming_distance
 * @see https://en.wikipedia.org/wiki/Hamming_distance
 */
  694. SZ_DYNAMIC sz_size_t sz_hamming_distance_utf8(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length,
  695. sz_size_t bound);
  696. /** @copydoc sz_hamming_distance_utf8 */
  697. SZ_PUBLIC sz_size_t sz_hamming_distance_utf8_serial(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length,
  698. sz_size_t bound);
  699. typedef sz_size_t (*sz_hamming_distance_t)(sz_cptr_t, sz_size_t, sz_cptr_t, sz_size_t, sz_size_t);
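/*
 * Usage sketch (illustrative, not part of the API): bounded Hamming distance for
 * near-duplicate filtering. A non-zero `bound` lets the function exit early:
 *
 *     sz_size_t distance = sz_hamming_distance("coffee", 6, "toffee", 6, 2);
 *     // distance == 1: the strings differ only in the first byte, below the bound of 2
 */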
/**
 * @brief Computes the Levenshtein edit-distance between two strings using the Wagner-Fischer algorithm.
 * Similar to the Needleman-Wunsch alignment algorithm. Often used in fuzzy string matching.
 *
 * @param a First string to compare.
 * @param a_length Number of bytes in the first string.
 * @param b Second string to compare.
 * @param b_length Number of bytes in the second string.
 *
 * @param alloc Temporary memory allocator. Only some of the rows of the matrix will be allocated,
 * so the memory usage is linear in relation to ::a_length and ::b_length.
 * If SZ_NULL is passed, will initialize to the system's default `malloc`.
 * @param bound Upper bound on the distance that allows us to exit early.
 * If zero is passed, the maximum possible distance will be equal to the length of the longer input.
 * @return Unsigned integer for the edit distance, the `bound` if it was exceeded, or `SZ_SIZE_MAX`
 * if the memory allocation failed.
 *
 * @see sz_memory_allocator_init_fixed, sz_memory_allocator_init_default
 * @see https://en.wikipedia.org/wiki/Levenshtein_distance
 */
  720. SZ_DYNAMIC sz_size_t sz_edit_distance(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length, //
  721. sz_size_t bound, sz_memory_allocator_t *alloc);
  722. /** @copydoc sz_edit_distance */
  723. SZ_PUBLIC sz_size_t sz_edit_distance_serial(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length, //
  724. sz_size_t bound, sz_memory_allocator_t *alloc);
  725. /**
  726. * @brief Computes the Levenshtein edit-distance between two @b UTF8 strings.
  727. * Unlike `sz_edit_distance`, reports the distance in Unicode codepoints, and not in bytes.
  728. *
  729. * @param a First string to compare.
  730. * @param a_length Number of bytes in the first string.
  731. * @param b Second string to compare.
  732. * @param b_length Number of bytes in the second string.
  733. *
* @param alloc Temporary memory allocator. Only some of the rows of the matrix will be allocated,
* so the memory usage is linear in relation to ::a_length and ::b_length.
* If SZ_NULL is passed, will initialize to the system's default `malloc`.
* @param bound Upper bound on the distance that allows us to exit early.
* If zero is passed, the maximum possible distance will be equal to the length of the longer input.
* @return Unsigned integer for the edit distance, the `bound` if it was exceeded, or `SZ_SIZE_MAX`
* if the memory allocation failed.
  741. *
  742. * @see sz_memory_allocator_init_fixed, sz_memory_allocator_init_default, sz_edit_distance
  743. * @see https://en.wikipedia.org/wiki/Levenshtein_distance
  744. */
  745. SZ_DYNAMIC sz_size_t sz_edit_distance_utf8(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length, //
  746. sz_size_t bound, sz_memory_allocator_t *alloc);
  747. typedef sz_size_t (*sz_edit_distance_t)(sz_cptr_t, sz_size_t, sz_cptr_t, sz_size_t, sz_size_t, sz_memory_allocator_t *);
  748. /** @copydoc sz_edit_distance_utf8 */
  749. SZ_PUBLIC sz_size_t sz_edit_distance_utf8_serial(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length, //
  750. sz_size_t bound, sz_memory_allocator_t *alloc);
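/*
 * Usage sketch (illustrative, not part of the API): computing the edit distance without
 * heap allocations, by backing the temporary matrix rows with an on-stack arena.
 * The 1024-byte capacity is an arbitrary choice for short inputs:
 *
 *     char arena[1024];
 *     sz_memory_allocator_t alloc;
 *     sz_memory_allocator_init_fixed(&alloc, &arena[0], sizeof(arena));
 *     sz_size_t distance = sz_edit_distance("kitten", 6, "sitting", 7, 0, &alloc);
 *     // distance == 3, or `SZ_SIZE_MAX` if the arena was too small
 */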
/**
 * @brief Computes the Needleman–Wunsch alignment score for two strings. Often used in bioinformatics and cheminformatics.
 * Similar to the Levenshtein edit-distance, but parameterized for gap and substitution penalties.
 *
 * Not commutative in the general case: the order of the strings matters, and `sz_alignment_score(a, b)` may
 * not be equal to `sz_alignment_score(b, a)`. Becomes @b commutative if the substitution costs are symmetric.
 * Equivalent to the negative Levenshtein distance, if: `gap == -1` and `subs[i][j] == (i == j ? 0 : -1)`.
 *
 * @param a First string to compare.
 * @param a_length Number of bytes in the first string.
 * @param b Second string to compare.
 * @param b_length Number of bytes in the second string.
 * @param subs Substitution costs matrix with 256 x 256 values for all pairs of characters.
 * @param gap Penalty cost for gaps - insertions and removals.
 *
 * @param alloc Temporary memory allocator. Only some of the rows of the matrix will be allocated,
 * so the memory usage is linear in relation to ::a_length and ::b_length.
 * If SZ_NULL is passed, will initialize to the system's default `malloc`.
 * @return Signed similarity score. Can be negative, depending on the substitution costs.
 * If the memory allocation fails, the function returns `SZ_SSIZE_MAX`.
 *
 * @see sz_memory_allocator_init_fixed, sz_memory_allocator_init_default
 * @see https://en.wikipedia.org/wiki/Needleman%E2%80%93Wunsch_algorithm
 */
  775. SZ_DYNAMIC sz_ssize_t sz_alignment_score(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length, //
  776. sz_error_cost_t const *subs, sz_error_cost_t gap, //
  777. sz_memory_allocator_t *alloc);
  778. /** @copydoc sz_alignment_score */
  779. SZ_PUBLIC sz_ssize_t sz_alignment_score_serial(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length, //
  780. sz_error_cost_t const *subs, sz_error_cost_t gap, //
  781. sz_memory_allocator_t *alloc);
  782. typedef sz_ssize_t (*sz_alignment_score_t)(sz_cptr_t, sz_size_t, sz_cptr_t, sz_size_t, sz_error_cost_t const *,
  783. sz_error_cost_t, sz_memory_allocator_t *);
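/*
 * Usage sketch (illustrative, not part of the API): reproducing the negative Levenshtein
 * distance with `sz_alignment_score`, using the equivalence stated above -
 * `gap == -1` and `subs[i][j] == (i == j ? 0 : -1)`:
 *
 *     static sz_error_cost_t subs[256][256];
 *     for (int i = 0; i != 256; ++i)
 *         for (int j = 0; j != 256; ++j) subs[i][j] = i == j ? 0 : -1;
 *     sz_memory_allocator_t alloc;
 *     sz_memory_allocator_init_default(&alloc);
 *     sz_ssize_t score = sz_alignment_score("kitten", 6, "sitting", 7, &subs[0][0], -1, &alloc);
 *     // score == -3, the negated edit distance
 */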
  784. typedef void (*sz_hash_callback_t)(sz_cptr_t, sz_size_t, sz_u64_t, void *user);
/**
 * @brief Computes the Karp-Rabin rolling hashes of a string, supplying them to the provided `callback`.
 * Can be used for similarity scores, search, ranking, etc.
 *
 * Rabin-Karp-like rolling hashes can have a very high level of collisions, depending
 * on the choice of bases and the prime number. That's why two hashes from the same
 * family, with different bases, are often used together.
 *
 * 1. Kernighan and Ritchie's function uses 31, a prime close to the size of the English alphabet.
 * 2. To be friendlier to byte-arrays and UTF8, we use 257 for the second function.
 *
 * Choosing the right ::window_length is task- and domain-dependent. For example, most English words are
 * between 3 and 7 characters long, so a window of 4 bytes would be a good choice. For DNA sequences,
 * the ::window_length might be a multiple of 3, as codons are 3 nucleotides (bytes) long.
 * With such a minimalistic alphabet of just four characters (AGCT), longer windows might be needed.
 * For protein sequences the alphabet is 20 characters long, so the window can be shorter than for DNA.
 *
 * @param text String to hash.
 * @param length Number of bytes in the string.
 * @param window_length Length of the rolling window in bytes.
 * @param window_step Step of reported hashes. @b Must be a power of two. Should be smaller than `window_length`.
 * @param callback Function receiving the start & length of a substring, the hash, and the `callback_handle`.
 * @param callback_handle Optional user-provided pointer to be passed to the `callback`.
 * @see sz_hashes_fingerprint, sz_hashes_intersection
 */
  810. SZ_DYNAMIC void sz_hashes(sz_cptr_t text, sz_size_t length, sz_size_t window_length, sz_size_t window_step, //
  811. sz_hash_callback_t callback, void *callback_handle);
  812. /** @copydoc sz_hashes */
  813. SZ_PUBLIC void sz_hashes_serial(sz_cptr_t text, sz_size_t length, sz_size_t window_length, sz_size_t window_step, //
  814. sz_hash_callback_t callback, void *callback_handle);
  815. typedef void (*sz_hashes_t)(sz_cptr_t, sz_size_t, sz_size_t, sz_size_t, sz_hash_callback_t, void *);
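/*
 * Usage sketch (illustrative, not part of the API): counting 7-byte rolling hashes with a
 * hypothetical `count_hashes` callback; a step of 1 (a power of two) reports every offset:
 *
 *     static void count_hashes(sz_cptr_t start, sz_size_t length, sz_u64_t hash, void *user) {
 *         (void)start, (void)length, (void)hash;
 *         ++*(sz_size_t *)user;
 *     }
 *     ...
 *     sz_size_t counter = 0;
 *     sz_hashes(text, text_length, 7, 1, &count_hashes, &counter);
 */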
  816. /**
  817. * @brief Computes the Karp-Rabin rolling hashes of a string outputting a binary fingerprint.
  818. * Such fingerprints can be compared with Hamming or Jaccard (Tanimoto) distance for similarity.
  819. *
  820. * The algorithm doesn't clear the fingerprint buffer on start, so it can be invoked multiple times
  821. * to produce a fingerprint of a longer string, by passing the previous fingerprint as the ::fingerprint.
  822. * It can also be reused to produce multi-resolution fingerprints by changing the ::window_length
  823. * and calling the same function multiple times for the same input ::text.
  824. *
  825. * Processes large strings in parts to maximize the cache utilization, using a small on-stack buffer,
  826. * avoiding cache-coherency penalties of remote on-heap buffers.
  827. *
* @param text String to hash.
* @param length Number of bytes in the string.
* @param window_length Length of the rolling window in bytes.
* @param fingerprint Output fingerprint buffer.
* @param fingerprint_bytes Number of bytes in the fingerprint buffer.
  833. * @see sz_hashes, sz_hashes_intersection
  834. */
  835. SZ_PUBLIC void sz_hashes_fingerprint(sz_cptr_t text, sz_size_t length, sz_size_t window_length, //
  836. sz_ptr_t fingerprint, sz_size_t fingerprint_bytes);
  837. typedef void (*sz_hashes_fingerprint_t)(sz_cptr_t, sz_size_t, sz_size_t, sz_ptr_t, sz_size_t);
  838. /**
  839. * @brief Given a hash-fingerprint of a textual document, computes the number of intersecting hashes
  840. * of the incoming document. Can be used for document scoring and search.
  841. *
  842. * Processes large strings in parts to maximize the cache utilization, using a small on-stack buffer,
  843. * avoiding cache-coherency penalties of remote on-heap buffers.
  844. *
* @param text Input document.
* @param length Number of bytes in the input document.
* @param window_length Length of the rolling window in bytes.
* @param fingerprint Reference document's fingerprint.
* @param fingerprint_bytes Number of bytes in the reference document's fingerprint.
  850. * @see sz_hashes, sz_hashes_fingerprint
  851. */
  852. SZ_PUBLIC sz_size_t sz_hashes_intersection(sz_cptr_t text, sz_size_t length, sz_size_t window_length, //
  853. sz_cptr_t fingerprint, sz_size_t fingerprint_bytes);
  854. typedef sz_size_t (*sz_hashes_intersection_t)(sz_cptr_t, sz_size_t, sz_size_t, sz_cptr_t, sz_size_t);
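/*
 * Usage sketch (illustrative, not part of the API): comparing two hypothetical documents
 * through a binary fingerprint. The buffer length is a power of two, enabling the cheaper
 * masking-based bit placement, and is cleared manually, as the function only accumulates:
 *
 *     char fingerprint[4096] = {0};
 *     sz_hashes_fingerprint(doc_a, doc_a_length, 7, &fingerprint[0], sizeof(fingerprint));
 *     sz_size_t matches = sz_hashes_intersection(doc_b, doc_b_length, 7, &fingerprint[0], sizeof(fingerprint));
 */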
  855. #pragma endregion
  856. #pragma region Convenience API
/**
 * @brief Finds the first character in the haystack that is present in the needle.
 * Convenience function, reused across different language bindings.
 * @see sz_find_charset
 */
SZ_DYNAMIC sz_cptr_t sz_find_char_from(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length);
/**
 * @brief Finds the first character in the haystack that is @b not present in the needle.
 * Convenience function, reused across different language bindings.
 * @see sz_find_charset
 */
SZ_DYNAMIC sz_cptr_t sz_find_char_not_from(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length);
/**
 * @brief Finds the last character in the haystack that is present in the needle.
 * Convenience function, reused across different language bindings.
 * @see sz_rfind_charset
 */
SZ_DYNAMIC sz_cptr_t sz_rfind_char_from(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length);
/**
 * @brief Finds the last character in the haystack that is @b not present in the needle.
 * Convenience function, reused across different language bindings.
 * @see sz_rfind_charset
 */
SZ_DYNAMIC sz_cptr_t sz_rfind_char_not_from(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length);
  881. #pragma endregion
  882. #pragma region String Sequences API
  883. struct sz_sequence_t;
  884. typedef sz_cptr_t (*sz_sequence_member_start_t)(struct sz_sequence_t const *, sz_size_t);
  885. typedef sz_size_t (*sz_sequence_member_length_t)(struct sz_sequence_t const *, sz_size_t);
  886. typedef sz_bool_t (*sz_sequence_predicate_t)(struct sz_sequence_t const *, sz_size_t);
  887. typedef sz_bool_t (*sz_sequence_comparator_t)(struct sz_sequence_t const *, sz_size_t, sz_size_t);
  888. typedef sz_bool_t (*sz_string_is_less_t)(sz_cptr_t, sz_size_t, sz_cptr_t, sz_size_t);
  889. typedef struct sz_sequence_t {
  890. sz_sorted_idx_t *order;
  891. sz_size_t count;
  892. sz_sequence_member_start_t get_start;
  893. sz_sequence_member_length_t get_length;
  894. void const *handle;
  895. } sz_sequence_t;
/**
 * @brief Initializes the sequence structure from a tape layout, used by Apache Arrow.
 * Expects ::offsets to contain `count + 1` entries, the last pointing at the end
 * of the last string, indicating the total length of the tape.
 */
SZ_PUBLIC void sz_sequence_from_u32tape(sz_cptr_t *start, sz_u32_t const *offsets, sz_size_t count,
sz_sequence_t *sequence);
/**
 * @brief Initializes the sequence structure from a tape layout, used by Apache Arrow.
 * Expects ::offsets to contain `count + 1` entries, the last pointing at the end
 * of the last string, indicating the total length of the tape.
 */
SZ_PUBLIC void sz_sequence_from_u64tape(sz_cptr_t *start, sz_u64_t const *offsets, sz_size_t count,
sz_sequence_t *sequence);
  910. /**
  911. * @brief Similar to `std::partition`, given a predicate splits the sequence into two parts.
  912. * The algorithm is unstable, meaning that elements may change relative order, as long
  913. * as they are in the right partition. This is the simpler algorithm for partitioning.
  914. */
  915. SZ_PUBLIC sz_size_t sz_partition(sz_sequence_t *sequence, sz_sequence_predicate_t predicate);
/**
 * @brief Analog of `std::inplace_merge`: merges two consecutive sorted chunks of the same continuous `sequence`.
 *
 * @param partition The number of elements in the first sub-sequence in `sequence`.
 * @param less Comparison function, to determine the lexicographic ordering.
 */
SZ_PUBLIC void sz_merge(sz_sequence_t *sequence, sz_size_t partition, sz_sequence_comparator_t less);
/**
 * @brief Sorting algorithm, combining Radix Sort on the first 32 bits of every word
 * with a follow-up conventional sorting procedure on the equally-prefixed parts.
 */
SZ_PUBLIC void sz_sort(sz_sequence_t *sequence);
/**
 * @brief Partial sorting algorithm, combining Radix Sort on the first 32 bits of every word
 * with a follow-up conventional sorting procedure on the equally-prefixed parts.
 */
SZ_PUBLIC void sz_sort_partial(sz_sequence_t *sequence, sz_size_t n);
  933. /**
  934. * @brief Intro-Sort algorithm that supports custom comparators.
  935. */
  936. SZ_PUBLIC void sz_sort_intro(sz_sequence_t *sequence, sz_sequence_comparator_t less);
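/*
 * Usage sketch (illustrative, not part of the API): sorting an array of NULL-terminated
 * C-strings with `sz_sort`. The adapter callbacks are hypothetical names, `strlen` comes
 * from LibC, and `order` is assumed to be pre-filled with the identity permutation:
 *
 *     static sz_cptr_t get_start(struct sz_sequence_t const *s, sz_size_t i) {
 *         return ((char const *const *)s->handle)[i];
 *     }
 *     static sz_size_t get_length(struct sz_sequence_t const *s, sz_size_t i) {
 *         return strlen(((char const *const *)s->handle)[i]);
 *     }
 *     ...
 *     char const *strings[] = {"banana", "apple", "cherry"};
 *     sz_sorted_idx_t order[3] = {0, 1, 2};
 *     sz_sequence_t sequence = {&order[0], 3, &get_start, &get_length, (void const *)&strings[0]};
 *     sz_sort(&sequence);
 *     // `order` now reads {1, 0, 2}: apple, banana, cherry
 */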
  937. #pragma endregion
/*
 * Hardware feature detection.
 * All of these macros can be overridden by the user before including this header.
 */
  942. #ifndef SZ_USE_X86_AVX512
  943. #ifdef __AVX512BW__
  944. #define SZ_USE_X86_AVX512 1
  945. #else
  946. #define SZ_USE_X86_AVX512 0
  947. #endif
  948. #endif
  949. #ifndef SZ_USE_X86_AVX2
  950. #ifdef __AVX2__
  951. #define SZ_USE_X86_AVX2 1
  952. #else
  953. #define SZ_USE_X86_AVX2 0
  954. #endif
  955. #endif
  956. #ifndef SZ_USE_ARM_NEON
  957. #ifdef __ARM_NEON
  958. #define SZ_USE_ARM_NEON 1
  959. #else
  960. #define SZ_USE_ARM_NEON 0
  961. #endif
  962. #endif
  963. #ifndef SZ_USE_ARM_SVE
  964. #ifdef __ARM_FEATURE_SVE
  965. #define SZ_USE_ARM_SVE 1
  966. #else
  967. #define SZ_USE_ARM_SVE 0
  968. #endif
  969. #endif
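/*
 * For example, to test the serial fallbacks on SIMD-capable hardware, one can force the
 * macros off before including this header, or pass `-DSZ_USE_X86_AVX2=0`-style compiler flags:
 *
 *     #define SZ_USE_X86_AVX512 0
 *     #define SZ_USE_X86_AVX2 0
 *     #include "stringzilla.h"
 */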
  970. /*
  971. * Include hardware-specific headers.
  972. */
  973. #if SZ_USE_X86_AVX512 || SZ_USE_X86_AVX2
  974. #include <immintrin.h>
  975. #endif // SZ_USE_X86...
  976. #if SZ_USE_ARM_NEON
  977. #include <arm_acle.h>
  978. #include <arm_neon.h>
  979. #endif // SZ_USE_ARM_NEON
  980. #if SZ_USE_ARM_SVE
  981. #include <arm_sve.h>
  982. #endif // SZ_USE_ARM_SVE
  983. #pragma region Hardware-Specific API
  984. #if SZ_USE_X86_AVX512
  985. /** @copydoc sz_equal_serial */
  986. SZ_PUBLIC sz_bool_t sz_equal_avx512(sz_cptr_t a, sz_cptr_t b, sz_size_t length);
  987. /** @copydoc sz_order_serial */
  988. SZ_PUBLIC sz_ordering_t sz_order_avx512(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length);
  989. /** @copydoc sz_copy_serial */
  990. SZ_PUBLIC void sz_copy_avx512(sz_ptr_t target, sz_cptr_t source, sz_size_t length);
  991. /** @copydoc sz_move_serial */
  992. SZ_PUBLIC void sz_move_avx512(sz_ptr_t target, sz_cptr_t source, sz_size_t length);
  993. /** @copydoc sz_fill_serial */
  994. SZ_PUBLIC void sz_fill_avx512(sz_ptr_t target, sz_size_t length, sz_u8_t value);
  995. /** @copydoc sz_find_byte */
  996. SZ_PUBLIC sz_cptr_t sz_find_byte_avx512(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle);
  997. /** @copydoc sz_rfind_byte */
  998. SZ_PUBLIC sz_cptr_t sz_rfind_byte_avx512(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle);
  999. /** @copydoc sz_find */
  1000. SZ_PUBLIC sz_cptr_t sz_find_avx512(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle, sz_size_t n_length);
  1001. /** @copydoc sz_rfind */
  1002. SZ_PUBLIC sz_cptr_t sz_rfind_avx512(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle, sz_size_t n_length);
  1003. /** @copydoc sz_find_charset */
  1004. SZ_PUBLIC sz_cptr_t sz_find_charset_avx512(sz_cptr_t text, sz_size_t length, sz_charset_t const *set);
  1005. /** @copydoc sz_rfind_charset */
  1006. SZ_PUBLIC sz_cptr_t sz_rfind_charset_avx512(sz_cptr_t text, sz_size_t length, sz_charset_t const *set);
  1007. /** @copydoc sz_edit_distance */
  1008. SZ_PUBLIC sz_size_t sz_edit_distance_avx512(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length, //
  1009. sz_size_t bound, sz_memory_allocator_t *alloc);
  1010. /** @copydoc sz_alignment_score */
  1011. SZ_PUBLIC sz_ssize_t sz_alignment_score_avx512(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length, //
  1012. sz_error_cost_t const *subs, sz_error_cost_t gap, //
  1013. sz_memory_allocator_t *alloc);
  1014. /** @copydoc sz_hashes */
  1015. SZ_PUBLIC void sz_hashes_avx512(sz_cptr_t text, sz_size_t length, sz_size_t window_length, sz_size_t step, //
  1016. sz_hash_callback_t callback, void *callback_handle);
  1017. #endif
  1018. #if SZ_USE_X86_AVX2
  1019. /** @copydoc sz_equal */
  1020. SZ_PUBLIC void sz_copy_avx2(sz_ptr_t target, sz_cptr_t source, sz_size_t length);
  1021. /** @copydoc sz_move */
  1022. SZ_PUBLIC void sz_move_avx2(sz_ptr_t target, sz_cptr_t source, sz_size_t length);
  1023. /** @copydoc sz_fill */
  1024. SZ_PUBLIC void sz_fill_avx2(sz_ptr_t target, sz_size_t length, sz_u8_t value);
  1025. /** @copydoc sz_find_byte */
  1026. SZ_PUBLIC sz_cptr_t sz_find_byte_avx2(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle);
  1027. /** @copydoc sz_rfind_byte */
  1028. SZ_PUBLIC sz_cptr_t sz_rfind_byte_avx2(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle);
  1029. /** @copydoc sz_find */
  1030. SZ_PUBLIC sz_cptr_t sz_find_avx2(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle, sz_size_t n_length);
  1031. /** @copydoc sz_rfind */
  1032. SZ_PUBLIC sz_cptr_t sz_rfind_avx2(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle, sz_size_t n_length);
  1033. /** @copydoc sz_hashes */
  1034. SZ_PUBLIC void sz_hashes_avx2(sz_cptr_t text, sz_size_t length, sz_size_t window_length, sz_size_t step, //
  1035. sz_hash_callback_t callback, void *callback_handle);
  1036. #endif
  1037. #if SZ_USE_ARM_NEON
  1038. /** @copydoc sz_equal */
  1039. SZ_PUBLIC sz_bool_t sz_equal_neon(sz_cptr_t a, sz_cptr_t b, sz_size_t length);
  1040. /** @copydoc sz_find_byte */
  1041. SZ_PUBLIC sz_cptr_t sz_find_byte_neon(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle);
  1042. /** @copydoc sz_rfind_byte */
  1043. SZ_PUBLIC sz_cptr_t sz_rfind_byte_neon(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle);
  1044. /** @copydoc sz_find */
  1045. SZ_PUBLIC sz_cptr_t sz_find_neon(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle, sz_size_t n_length);
  1046. /** @copydoc sz_rfind */
  1047. SZ_PUBLIC sz_cptr_t sz_rfind_neon(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle, sz_size_t n_length);
  1048. /** @copydoc sz_find_charset */
  1049. SZ_PUBLIC sz_cptr_t sz_find_charset_neon(sz_cptr_t text, sz_size_t length, sz_charset_t const *set);
  1050. /** @copydoc sz_rfind_charset */
  1051. SZ_PUBLIC sz_cptr_t sz_rfind_charset_neon(sz_cptr_t text, sz_size_t length, sz_charset_t const *set);
  1052. #endif
  1053. #pragma endregion
  1054. #pragma GCC diagnostic push
  1055. #pragma GCC diagnostic ignored "-Wconversion"
  1056. /*
  1057. **********************************************************************************************************************
  1058. **********************************************************************************************************************
  1059. **********************************************************************************************************************
  1060. *
* This is where the actual implementation begins.
  1062. * The rest of the file is hidden from the public API.
  1063. *
  1064. **********************************************************************************************************************
  1065. **********************************************************************************************************************
  1066. **********************************************************************************************************************
  1067. */
  1068. #pragma region Compiler Extensions and Helper Functions
  1069. #pragma GCC visibility push(hidden)
  1070. /**
  1071. * @brief Helper-macro to mark potentially unused variables.
  1072. */
  1073. #define sz_unused(x) ((void)(x))
  1074. /**
  1075. * @brief Helper-macro casting a variable to another type of the same size.
  1076. */
  1077. #define sz_bitcast(type, value) (*((type *)&(value)))
  1078. /**
  1079. * @brief Defines `SZ_NULL`, analogous to `NULL`.
  1080. * The default often comes from locale.h, stddef.h,
  1081. * stdio.h, stdlib.h, string.h, time.h, or wchar.h.
  1082. */
  1083. #ifdef __GNUG__
  1084. #define SZ_NULL __null
  1085. #define SZ_NULL_CHAR __null
  1086. #else
  1087. #define SZ_NULL ((void *)0)
  1088. #define SZ_NULL_CHAR ((char *)0)
  1089. #endif
  1090. /**
  1091. * @brief Cache-line width, that will affect the execution of some algorithms,
  1092. * like equality checks and relative order computing.
  1093. */
  1094. #define SZ_CACHE_LINE_WIDTH (64) // bytes
  1095. /**
  1096. * @brief Similar to `assert`, the `sz_assert` is used in the SZ_DEBUG mode
  1097. * to check the invariants of the library. It's a no-op in the SZ_RELEASE mode.
  1098. * @note If you want to catch it, put a breakpoint at @b `__GI_exit`
  1099. */
  1100. #if SZ_DEBUG
  1101. #include <stdio.h> // `fprintf`
  1102. #include <stdlib.h> // `EXIT_FAILURE`
  1103. #define sz_assert(condition) \
  1104. do { \
  1105. if (!(condition)) { \
  1106. fprintf(stderr, "Assertion failed: %s, in file %s, line %d\n", #condition, __FILE__, __LINE__); \
  1107. exit(EXIT_FAILURE); \
  1108. } \
  1109. } while (0)
  1110. #else
  1111. #define sz_assert(condition) ((void)0)
  1112. #endif
  1113. /* Intrinsics aliases for MSVC, GCC, Clang, and Clang-Cl.
  1114. * The following section of compiler intrinsics comes in 2 flavors.
  1115. */
#if defined(_MSC_VER) && !defined(__clang__) // MSVC proper, excluding Clang-CL
  1117. #include <intrin.h>
  1118. // Sadly, when building Win32 images, we can't use the `_tzcnt_u64`, `_lzcnt_u64`,
  1119. // `_BitScanForward64`, or `_BitScanReverse64` intrinsics. For now it's a simple `for`-loop.
  1120. // In the future we can switch to a more efficient De Bruijn's algorithm.
  1121. // https://www.chessprogramming.org/BitScan
  1122. // https://www.chessprogramming.org/De_Bruijn_Sequence
  1123. // https://gist.github.com/resilar/e722d4600dbec9752771ab4c9d47044f
  1124. //
  1125. // Use the serial version on 32-bit x86 and on Arm.
  1126. #if (defined(_WIN32) && !defined(_WIN64)) || defined(_M_ARM) || defined(_M_ARM64)
  1127. SZ_INTERNAL int sz_u64_ctz(sz_u64_t x) {
  1128. sz_assert(x != 0);
  1129. int n = 0;
  1130. while ((x & 1) == 0) { n++, x >>= 1; }
  1131. return n;
  1132. }
  1133. SZ_INTERNAL int sz_u64_clz(sz_u64_t x) {
  1134. sz_assert(x != 0);
  1135. int n = 0;
  1136. while ((x & 0x8000000000000000ULL) == 0) { n++, x <<= 1; }
  1137. return n;
  1138. }
  1139. SZ_INTERNAL int sz_u64_popcount(sz_u64_t x) {
  1140. x = x - ((x >> 1) & 0x5555555555555555);
  1141. x = (x & 0x3333333333333333) + ((x >> 2) & 0x3333333333333333);
  1142. return (((x + (x >> 4)) & 0x0F0F0F0F0F0F0F0F) * 0x0101010101010101) >> 56;
  1143. }
  1144. SZ_INTERNAL int sz_u32_popcount(sz_u32_t x) {
  1145. x = x - ((x >> 1) & 0x55555555);
  1146. x = (x & 0x33333333) + ((x >> 2) & 0x33333333);
  1147. return (((x + (x >> 4)) & 0x0F0F0F0F) * 0x01010101) >> 24;
  1148. }
  1149. #else
  1150. SZ_INTERNAL int sz_u64_ctz(sz_u64_t x) { return _tzcnt_u64(x); }
  1151. SZ_INTERNAL int sz_u64_clz(sz_u64_t x) { return _lzcnt_u64(x); }
  1152. SZ_INTERNAL int sz_u64_popcount(sz_u64_t x) { return __popcnt64(x); }
  1153. SZ_INTERNAL int sz_u32_popcount(sz_u32_t x) { return __popcnt(x); }
  1154. #endif
  1155. SZ_INTERNAL int sz_u32_ctz(sz_u32_t x) { return _tzcnt_u32(x); }
  1156. SZ_INTERNAL int sz_u32_clz(sz_u32_t x) { return _lzcnt_u32(x); }
  1157. SZ_INTERNAL sz_u64_t sz_u64_bytes_reverse(sz_u64_t val) { return _byteswap_uint64(val); }
  1158. SZ_INTERNAL sz_u32_t sz_u32_bytes_reverse(sz_u32_t val) { return _byteswap_ulong(val); }
  1159. #else
  1160. SZ_INTERNAL int sz_u64_popcount(sz_u64_t x) { return __builtin_popcountll(x); }
  1161. SZ_INTERNAL int sz_u32_popcount(sz_u32_t x) { return __builtin_popcount(x); }
  1162. SZ_INTERNAL int sz_u64_ctz(sz_u64_t x) { return __builtin_ctzll(x); }
  1163. SZ_INTERNAL int sz_u64_clz(sz_u64_t x) { return __builtin_clzll(x); }
  1164. SZ_INTERNAL int sz_u32_ctz(sz_u32_t x) { return __builtin_ctz(x); } // ! Undefined if `x == 0`
  1165. SZ_INTERNAL int sz_u32_clz(sz_u32_t x) { return __builtin_clz(x); } // ! Undefined if `x == 0`
  1166. SZ_INTERNAL sz_u64_t sz_u64_bytes_reverse(sz_u64_t val) { return __builtin_bswap64(val); }
  1167. SZ_INTERNAL sz_u32_t sz_u32_bytes_reverse(sz_u32_t val) { return __builtin_bswap32(val); }
  1168. #endif
SZ_INTERNAL sz_u64_t sz_u64_rotl(sz_u64_t x, sz_u64_t r) { return (x << r) | (x >> (64 - r)); } // ! Undefined if `r == 0`
  1170. /**
  1171. * @brief Select bits from either ::a or ::b depending on the value of ::mask bits.
  1172. *
  1173. * Similar to `_mm_blend_epi16` intrinsic on x86.
  1174. * Described in the "Bit Twiddling Hacks" by Sean Eron Anderson.
  1175. * https://graphics.stanford.edu/~seander/bithacks.html#ConditionalSetOrClearBitsWithoutBranching
  1176. */
  1177. SZ_INTERNAL sz_u64_t sz_u64_blend(sz_u64_t a, sz_u64_t b, sz_u64_t mask) { return a ^ ((a ^ b) & mask); }
/*
 * Efficiently computing the minimum and maximum of two or three values can be tricky.
 * The simple branching baseline would be:
 *
 * x < y ? x : y // can be replaced with 1 conditional move
 *
 * The branchless approach is well known for signed integers, but it doesn't apply to unsigned ones.
 * https://stackoverflow.com/questions/514435/templatized-branchless-int-max-min-function
 * https://graphics.stanford.edu/~seander/bithacks.html#IntegerMinOrMax
 * Using only bit-shifts for signed integers it would be:
 *
 * y + ((x - y) & (x - y) >> 31) // 4 unique operations
 *
 * Alternatively, for any integers using multiplication:
 *
 * (x > y) * y + (x <= y) * x // 5 operations
 *
 * Alternatively, to avoid multiplication:
 *
 * (x & ~((x < y) - 1)) + (y & ((x < y) - 1)) // 6 unique operations
 */
#define sz_min_of_two(x, y) ((x) < (y) ? (x) : (y))
#define sz_max_of_two(x, y) ((x) < (y) ? (y) : (x))
  1201. #define sz_min_of_three(x, y, z) sz_min_of_two(x, sz_min_of_two(y, z))
  1202. #define sz_max_of_three(x, y, z) sz_max_of_two(x, sz_max_of_two(y, z))
  1203. /** @brief Branchless minimum function for two signed 32-bit integers. */
  1204. SZ_INTERNAL sz_i32_t sz_i32_min_of_two(sz_i32_t x, sz_i32_t y) { return y + ((x - y) & (x - y) >> 31); }
/** @brief Branchless maximum function for two signed 32-bit integers. */
  1206. SZ_INTERNAL sz_i32_t sz_i32_max_of_two(sz_i32_t x, sz_i32_t y) { return x - ((x - y) & (x - y) >> 31); }
  1207. /**
  1208. * @brief Clamps signed offsets in a string to a valid range. Used for Pythonic-style slicing.
  1209. */
  1210. SZ_INTERNAL void sz_ssize_clamp_interval(sz_size_t length, sz_ssize_t start, sz_ssize_t end,
  1211. sz_size_t *normalized_offset, sz_size_t *normalized_length) {
  1212. // TODO: Remove branches.
  1213. // Normalize negative indices
  1214. if (start < 0) start += length;
  1215. if (end < 0) end += length;
  1216. // Clamp indices to a valid range
  1217. if (start < 0) start = 0;
  1218. if (end < 0) end = 0;
  1219. if (start > (sz_ssize_t)length) start = length;
  1220. if (end > (sz_ssize_t)length) end = length;
  1221. // Ensure start <= end
  1222. if (start > end) start = end;
  1223. *normalized_offset = start;
  1224. *normalized_length = end - start;
  1225. }
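/*
 * A minimal worked example, mirroring Python's `"hello"[-3:]`:
 *
 *     sz_size_t offset, remaining;
 *     sz_ssize_clamp_interval(5, -3, 5, &offset, &remaining);
 *     // offset == 2, remaining == 3, i.e. the "llo" suffix
 */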
  1226. /**
  1227. * @brief Compute the logarithm base 2 of a positive integer, rounding down.
  1228. */
  1229. SZ_INTERNAL sz_size_t sz_size_log2i_nonzero(sz_size_t x) {
  1230. sz_assert(x > 0 && "Non-positive numbers have no defined logarithm");
  1231. sz_size_t leading_zeros = sz_u64_clz(x);
  1232. return 63 - leading_zeros;
  1233. }
  1234. /**
  1235. * @brief Compute the smallest power of two greater than or equal to ::x.
  1236. */
  1237. SZ_INTERNAL sz_size_t sz_size_bit_ceil(sz_size_t x) {
// Unlike the commonly used trick with `clz` intrinsics, this is valid across the whole range of `x`.
  1239. // https://stackoverflow.com/a/10143264
  1240. x--;
  1241. x |= x >> 1;
  1242. x |= x >> 2;
  1243. x |= x >> 4;
  1244. x |= x >> 8;
  1245. x |= x >> 16;
  1246. #if SZ_DETECT_64_BIT
  1247. x |= x >> 32;
  1248. #endif
  1249. x++;
  1250. return x;
  1251. }
  1252. /**
  1253. * @brief Transposes an 8x8 bit matrix packed in a `sz_u64_t`.
  1254. *
* There is a well-known SWAR sequence for that, familiar to chess programmers
* who want to flip a bit-matrix of pieces along the main A1-H8 diagonal.
  1257. * https://www.chessprogramming.org/Flipping_Mirroring_and_Rotating
  1258. * https://lukas-prokop.at/articles/2021-07-23-transpose
  1259. */
  1260. SZ_INTERNAL sz_u64_t sz_u64_transpose(sz_u64_t x) {
  1261. sz_u64_t t;
  1262. t = x ^ (x << 36);
  1263. x ^= 0xf0f0f0f00f0f0f0full & (t ^ (x >> 36));
  1264. t = 0xcccc0000cccc0000ull & (x ^ (x << 18));
  1265. x ^= t ^ (t >> 18);
  1266. t = 0xaa00aa00aa00aa00ull & (x ^ (x << 9));
  1267. x ^= t ^ (t >> 9);
  1268. return x;
  1269. }
  1270. /**
  1271. * @brief Helper, that swaps two 64-bit integers representing the order of elements in the sequence.
  1272. */
  1273. SZ_INTERNAL void sz_u64_swap(sz_u64_t *a, sz_u64_t *b) {
  1274. sz_u64_t t = *a;
  1275. *a = *b;
  1276. *b = t;
  1277. }
  1278. /**
* @brief Helper, that swaps two pointers.
  1280. */
  1281. SZ_INTERNAL void sz_pointer_swap(void **a, void **b) {
  1282. void *t = *a;
  1283. *a = *b;
  1284. *b = t;
  1285. }
  1286. /**
  1287. * @brief Helper structure to simplify work with 16-bit words.
  1288. * @see sz_u16_load
  1289. */
  1290. typedef union sz_u16_vec_t {
  1291. sz_u16_t u16;
  1292. sz_u8_t u8s[2];
  1293. } sz_u16_vec_t;
  1294. /**
* @brief Load a 16-bit unsigned integer from a potentially unaligned pointer; can be expensive on some platforms.
  1296. */
  1297. SZ_INTERNAL sz_u16_vec_t sz_u16_load(sz_cptr_t ptr) {
  1298. #if !SZ_USE_MISALIGNED_LOADS
  1299. sz_u16_vec_t result;
  1300. result.u8s[0] = ptr[0];
  1301. result.u8s[1] = ptr[1];
  1302. return result;
  1303. #elif defined(_MSC_VER) && !defined(__clang__)
  1304. #if defined(_M_IX86) //< The __unaligned modifier isn't valid for the x86 platform.
  1305. return *((sz_u16_vec_t *)ptr);
  1306. #else
  1307. return *((__unaligned sz_u16_vec_t *)ptr);
  1308. #endif
  1309. #else
  1310. __attribute__((aligned(1))) sz_u16_vec_t const *result = (sz_u16_vec_t const *)ptr;
  1311. return *result;
  1312. #endif
  1313. }
  1314. /**
  1315. * @brief Helper structure to simplify work with 32-bit words.
  1316. * @see sz_u32_load
  1317. */
  1318. typedef union sz_u32_vec_t {
  1319. sz_u32_t u32;
  1320. sz_u16_t u16s[2];
  1321. sz_u8_t u8s[4];
  1322. } sz_u32_vec_t;
  1323. /**
* @brief Load a 32-bit unsigned integer from a potentially unaligned pointer; can be expensive on some platforms.
  1325. */
  1326. SZ_INTERNAL sz_u32_vec_t sz_u32_load(sz_cptr_t ptr) {
  1327. #if !SZ_USE_MISALIGNED_LOADS
  1328. sz_u32_vec_t result;
  1329. result.u8s[0] = ptr[0];
  1330. result.u8s[1] = ptr[1];
  1331. result.u8s[2] = ptr[2];
  1332. result.u8s[3] = ptr[3];
  1333. return result;
  1334. #elif defined(_MSC_VER) && !defined(__clang__)
  1335. #if defined(_M_IX86) //< The __unaligned modifier isn't valid for the x86 platform.
  1336. return *((sz_u32_vec_t *)ptr);
  1337. #else
  1338. return *((__unaligned sz_u32_vec_t *)ptr);
  1339. #endif
  1340. #else
  1341. __attribute__((aligned(1))) sz_u32_vec_t const *result = (sz_u32_vec_t const *)ptr;
  1342. return *result;
  1343. #endif
  1344. }
  1345. /**
  1346. * @brief Helper structure to simplify work with 64-bit words.
  1347. * @see sz_u64_load
  1348. */
  1349. typedef union sz_u64_vec_t {
  1350. sz_u64_t u64;
  1351. sz_u32_t u32s[2];
  1352. sz_u16_t u16s[4];
  1353. sz_u8_t u8s[8];
  1354. } sz_u64_vec_t;
  1355. /**
* @brief Load a 64-bit unsigned integer from a potentially unaligned pointer; can be expensive on some platforms.
  1357. */
  1358. SZ_INTERNAL sz_u64_vec_t sz_u64_load(sz_cptr_t ptr) {
  1359. #if !SZ_USE_MISALIGNED_LOADS
  1360. sz_u64_vec_t result;
  1361. result.u8s[0] = ptr[0];
  1362. result.u8s[1] = ptr[1];
  1363. result.u8s[2] = ptr[2];
  1364. result.u8s[3] = ptr[3];
  1365. result.u8s[4] = ptr[4];
  1366. result.u8s[5] = ptr[5];
  1367. result.u8s[6] = ptr[6];
  1368. result.u8s[7] = ptr[7];
  1369. return result;
  1370. #elif defined(_MSC_VER) && !defined(__clang__)
  1371. #if defined(_M_IX86) //< The __unaligned modifier isn't valid for the x86 platform.
  1372. return *((sz_u64_vec_t *)ptr);
  1373. #else
  1374. return *((__unaligned sz_u64_vec_t *)ptr);
  1375. #endif
  1376. #else
  1377. __attribute__((aligned(1))) sz_u64_vec_t const *result = (sz_u64_vec_t const *)ptr;
  1378. return *result;
  1379. #endif
  1380. }
  1381. /** @brief Helper function, using the supplied fixed-capacity buffer to allocate memory. */
  1382. SZ_INTERNAL sz_ptr_t _sz_memory_allocate_fixed(sz_size_t length, void *handle) {
  1383. sz_size_t capacity;
  1384. sz_copy((sz_ptr_t)&capacity, (sz_cptr_t)handle, sizeof(sz_size_t));
  1385. sz_size_t consumed_capacity = sizeof(sz_size_t);
  1386. if (consumed_capacity + length > capacity) return SZ_NULL_CHAR;
// No allocation offset is tracked, so every call returns the same region right after the
// stored capacity - the buffer is meant for a single live allocation at a time.
return (sz_ptr_t)handle + consumed_capacity;
  1388. }
  1389. /** @brief Helper "no-op" function, simulating memory deallocation when we use a "static" memory buffer. */
  1390. SZ_INTERNAL void _sz_memory_free_fixed(sz_ptr_t start, sz_size_t length, void *handle) {
  1391. sz_unused(start && length && handle);
  1392. }
  1393. /** @brief An internal callback used to set a bit in a power-of-two length binary fingerprint of a string. */
  1394. SZ_INTERNAL void _sz_hashes_fingerprint_pow2_callback(sz_cptr_t start, sz_size_t length, sz_u64_t hash, void *handle) {
  1395. sz_string_view_t *fingerprint_buffer = (sz_string_view_t *)handle;
  1396. sz_u8_t *fingerprint_u8s = (sz_u8_t *)fingerprint_buffer->start;
  1397. sz_size_t fingerprint_bytes = fingerprint_buffer->length;
  1398. fingerprint_u8s[(hash / 8) & (fingerprint_bytes - 1)] |= (1 << (hash & 7));
  1399. sz_unused(start && length);
  1400. }
  1401. /** @brief An internal callback used to set a bit in a @b non power-of-two length binary fingerprint of a string. */
  1402. SZ_INTERNAL void _sz_hashes_fingerprint_non_pow2_callback(sz_cptr_t start, sz_size_t length, sz_u64_t hash,
  1403. void *handle) {
  1404. sz_string_view_t *fingerprint_buffer = (sz_string_view_t *)handle;
  1405. sz_u8_t *fingerprint_u8s = (sz_u8_t *)fingerprint_buffer->start;
  1406. sz_size_t fingerprint_bytes = fingerprint_buffer->length;
  1407. fingerprint_u8s[(hash / 8) % fingerprint_bytes] |= (1 << (hash & 7));
  1408. sz_unused(start && length);
  1409. }
  1410. /** @brief An internal callback, used to mix all the running hashes into one pointer-size value. */
  1411. SZ_INTERNAL void _sz_hashes_fingerprint_scalar_callback(sz_cptr_t start, sz_size_t length, sz_u64_t hash,
  1412. void *scalar_handle) {
sz_unused(start && length);
  1414. sz_size_t *scalar_ptr = (sz_size_t *)scalar_handle;
  1415. *scalar_ptr ^= hash;
  1416. }
  1417. /**
  1418. * @brief Chooses the offsets of the most interesting characters in a search needle.
  1419. *
  1420. * Search throughput can significantly deteriorate if we are matching the wrong characters.
  1421. * Say the needle is "aXaYa", and we are comparing the first, second, and last character.
  1422. * If we use SIMD and compare many offsets at a time, comparing against "a" in every register is a waste.
  1423. *
  1424. * Similarly, dealing with UTF8 inputs, we know that the lower bits of each character code carry more information.
* The Cyrillic alphabet, for example, falls into the [0x0410, 0x042F] codepoint range for uppercase [А, Я], and
* into [0x0430, 0x044F] for lowercase [а, я]. Scanning through a Russian text encoded in UTF8, roughly half of the
* bytes will be nearly-constant lead bytes and will carry almost no distinguishing value.
  1428. */
  1429. SZ_INTERNAL void _sz_locate_needle_anomalies(sz_cptr_t start, sz_size_t length, //
  1430. sz_size_t *first, sz_size_t *second, sz_size_t *third) {
  1431. *first = 0;
  1432. *second = length / 2;
  1433. *third = length - 1;
  1434. //
  1435. int has_duplicates = //
  1436. start[*first] == start[*second] || //
  1437. start[*first] == start[*third] || //
  1438. start[*second] == start[*third];
  1439. // Loop through letters to find non-colliding variants.
  1440. if (length > 3 && has_duplicates) {
  1441. // Pivot the middle point right, until we find a character different from the first one.
  1442. for (; start[*second] == start[*first] && *second + 1 < *third; ++(*second)) {}
  1443. // Pivot the third (last) point left, until we find a different character.
  1444. for (; (start[*third] == start[*second] || start[*third] == start[*first]) && *third > (*second + 1);
  1445. --(*third)) {}
  1446. }
  1447. // TODO: Investigate alternative strategies for long needles.
  1448. // On very long needles we have the luxury to choose!
// Often dealing with UTF8, we will likely benefit from shifting the first and second characters
// further to the right, to achieve not only uniqueness within the needle, but also to avoid common
// rune prefixes of 2-, 3-, and 4-byte codes.
  1452. if (length > 8) {
  1453. // Pivot the first and second points right, until we find a character, that:
  1454. // > is different from others.
  1455. // > doesn't start with 0b'110x'xxxx - only 5 bits of relevant info.
  1456. // > doesn't start with 0b'1110'xxxx - only 4 bits of relevant info.
  1457. // > doesn't start with 0b'1111'0xxx - only 3 bits of relevant info.
  1458. //
  1459. // So we are practically searching for byte values that start with 0b0xxx'xxxx or 0b'10xx'xxxx.
  1460. // Meaning they fall in the range [0, 127] and [128, 191], in other words any unsigned int up to 191.
  1461. sz_u8_t const *start_u8 = (sz_u8_t const *)start;
  1462. sz_size_t vibrant_first = *first, vibrant_second = *second, vibrant_third = *third;
// Let's begin with the second character, as the termination criteria there are more obvious,
// and we may end up with more variants to check for the first candidate.
  1465. for (; (start_u8[vibrant_second] > 191 || start_u8[vibrant_second] == start_u8[vibrant_third]) &&
  1466. (vibrant_second + 1 < vibrant_third);
  1467. ++vibrant_second) {}
  1468. // Now check if we've indeed found a good candidate or should revert the `vibrant_second` to `second`.
  1469. if (start_u8[vibrant_second] < 191) { *second = vibrant_second; }
  1470. else { vibrant_second = *second; }
  1471. // Now check the first character.
  1472. for (; (start_u8[vibrant_first] > 191 || start_u8[vibrant_first] == start_u8[vibrant_second] ||
  1473. start_u8[vibrant_first] == start_u8[vibrant_third]) &&
  1474. (vibrant_first + 1 < vibrant_second);
  1475. ++vibrant_first) {}
  1476. // Now check if we've indeed found a good candidate or should revert the `vibrant_first` to `first`.
  1477. // We don't need to shift the third one when dealing with texts as the last byte of the text is
  1478. // also the last byte of a rune and contains the most information.
  1479. if (start_u8[vibrant_first] < 191) { *first = vibrant_first; }
  1480. }
  1481. }
  1482. #pragma GCC visibility pop
  1483. #pragma endregion
  1484. #pragma region Serial Implementation
  1485. #if !SZ_AVOID_LIBC
  1486. #include <stdio.h> // `fprintf`
  1487. #include <stdlib.h> // `malloc`, `EXIT_FAILURE`
  1488. #endif
  1489. SZ_PUBLIC void sz_memory_allocator_init_default(sz_memory_allocator_t *alloc) {
  1490. #if !SZ_AVOID_LIBC
  1491. alloc->allocate = (sz_memory_allocate_t)malloc;
  1492. alloc->free = (sz_memory_free_t)free;
  1493. #else
  1494. alloc->allocate = (sz_memory_allocate_t)SZ_NULL;
  1495. alloc->free = (sz_memory_free_t)SZ_NULL;
  1496. #endif
  1497. alloc->handle = SZ_NULL;
  1498. }
SZ_PUBLIC void sz_memory_allocator_init_fixed(sz_memory_allocator_t *alloc, void *buffer, sz_size_t length) {
// The logic here is simple - put the buffer length in the first slots of the buffer.
// Later use it for bounds checking.
alloc->allocate = (sz_memory_allocate_t)_sz_memory_allocate_fixed;
alloc->free = (sz_memory_free_t)_sz_memory_free_fixed;
alloc->handle = buffer; // ! not `&buffer` - the address of the parameter would dangle once this call returns
sz_copy((sz_ptr_t)buffer, (sz_cptr_t)&length, sizeof(sz_size_t));
}
/**
 * @brief Byte-level equality comparison between two strings.
 * If unaligned loads are allowed, processes eight bytes at a time using SWAR.
 */
  1511. SZ_PUBLIC sz_bool_t sz_equal_serial(sz_cptr_t a, sz_cptr_t b, sz_size_t length) {
  1512. sz_cptr_t const a_end = a + length;
  1513. #if SZ_USE_MISALIGNED_LOADS
  1514. if (length >= SZ_SWAR_THRESHOLD) {
  1515. sz_u64_vec_t a_vec, b_vec;
  1516. for (; a + 8 <= a_end; a += 8, b += 8) {
  1517. a_vec = sz_u64_load(a);
  1518. b_vec = sz_u64_load(b);
  1519. if (a_vec.u64 != b_vec.u64) return sz_false_k;
  1520. }
  1521. }
  1522. #endif
  1523. while (a != a_end && *a == *b) a++, b++;
  1524. return (sz_bool_t)(a_end == a);
  1525. }
  1526. SZ_PUBLIC sz_cptr_t sz_find_charset_serial(sz_cptr_t text, sz_size_t length, sz_charset_t const *set) {
  1527. for (sz_cptr_t const end = text + length; text != end; ++text)
  1528. if (sz_charset_contains(set, *text)) return text;
  1529. return SZ_NULL_CHAR;
  1530. }
  1531. SZ_PUBLIC sz_cptr_t sz_rfind_charset_serial(sz_cptr_t text, sz_size_t length, sz_charset_t const *set) {
  1532. #pragma GCC diagnostic push
  1533. #pragma GCC diagnostic ignored "-Warray-bounds"
  1534. sz_cptr_t const end = text;
  1535. for (text += length; text != end;)
  1536. if (sz_charset_contains(set, *(text -= 1))) return text;
  1537. return SZ_NULL_CHAR;
  1538. #pragma GCC diagnostic pop
  1539. }
  1540. /**
  1541. * One option to avoid branching is to use conditional moves and lookup the comparison result in a table:
  1542. * sz_ordering_t ordering_lookup[2] = {sz_greater_k, sz_less_k};
  1543. * for (; a != min_end; ++a, ++b)
  1544. * if (*a != *b) return ordering_lookup[*a < *b];
  1545. * That, however, introduces a data-dependency.
  1546. * A cleaner option is to perform two comparisons and a subtraction.
  1547. * One instruction more, but no data-dependency.
  1548. */
#define _sz_order_scalars(a, b) ((sz_ordering_t)(((a) > (b)) - ((a) < (b))))
  1550. SZ_PUBLIC sz_ordering_t sz_order_serial(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length) {
  1551. sz_bool_t a_shorter = (sz_bool_t)(a_length < b_length);
  1552. sz_size_t min_length = a_shorter ? a_length : b_length;
  1553. sz_cptr_t min_end = a + min_length;
  1554. #if SZ_USE_MISALIGNED_LOADS && !SZ_DETECT_BIG_ENDIAN
  1555. for (sz_u64_vec_t a_vec, b_vec; a + 8 <= min_end; a += 8, b += 8) {
  1556. a_vec = sz_u64_load(a);
  1557. b_vec = sz_u64_load(b);
  1558. if (a_vec.u64 != b_vec.u64)
  1559. return _sz_order_scalars(sz_u64_bytes_reverse(a_vec.u64), sz_u64_bytes_reverse(b_vec.u64));
  1560. }
  1561. #endif
  1562. for (; a != min_end; ++a, ++b)
  1563. if (*a != *b) return _sz_order_scalars(*a, *b);
  1564. // If the strings are equal up to `min_end`, then the shorter string is smaller
  1565. return _sz_order_scalars(a_length, b_length);
  1566. }
  1567. /**
  1568. * @brief Byte-level equality comparison between two 64-bit integers.
  1569. * @return 64-bit integer, where every top bit in each byte signifies a match.
  1570. */
  1571. SZ_INTERNAL sz_u64_vec_t _sz_u64_each_byte_equal(sz_u64_vec_t a, sz_u64_vec_t b) {
  1572. sz_u64_vec_t vec;
  1573. vec.u64 = ~(a.u64 ^ b.u64);
  1574. // The match is valid, if every bit within each byte is set.
  1575. // For that take the bottom 7 bits of each byte, add one to them,
  1576. // and if this sets the top bit to one, then all the 7 bits are ones as well.
  1577. vec.u64 = ((vec.u64 & 0x7F7F7F7F7F7F7F7Full) + 0x0101010101010101ull) & ((vec.u64 & 0x8080808080808080ull));
  1578. return vec;
  1579. }
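/*
 * A worked example for a single byte lane: if the bytes agree, `~(a ^ b)` is 0xFF, its bottom
 * seven bits are 0x7F, and adding 0x01 carries into the top bit: (0x7F + 0x01) & 0x80 == 0x80.
 * Any mismatching bit either breaks that carry chain or clears the masked top bit, zeroing the lane.
 */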
  1580. /**
  1581. * @brief Find the first occurrence of a @b single-character needle in an arbitrary length haystack.
  1582. * This implementation uses hardware-agnostic SWAR technique, to process 8 characters at a time.
  1583. * Identical to `memchr(haystack, needle[0], haystack_length)`.
  1584. */
  1585. SZ_PUBLIC sz_cptr_t sz_find_byte_serial(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n) {
  1586. if (!h_length) return SZ_NULL_CHAR;
  1587. sz_cptr_t const h_end = h + h_length;
#if !SZ_DETECT_BIG_ENDIAN // Use SWAR only on little-endian platforms for brevity.
#if !SZ_USE_MISALIGNED_LOADS // Process the misaligned head, to avoid UB on unaligned 64-bit loads.
  1590. for (; ((sz_size_t)h & 7ull) && h < h_end; ++h)
  1591. if (*h == *n) return h;
  1592. #endif
  1593. // Broadcast the n into every byte of a 64-bit integer to use SWAR
  1594. // techniques and process eight characters at a time.
  1595. sz_u64_vec_t h_vec, n_vec, match_vec;
  1596. match_vec.u64 = 0;
  1597. n_vec.u64 = (sz_u64_t)n[0] * 0x0101010101010101ull;
  1598. for (; h + 8 <= h_end; h += 8) {
  1599. h_vec.u64 = *(sz_u64_t const *)h;
  1600. match_vec = _sz_u64_each_byte_equal(h_vec, n_vec);
  1601. if (match_vec.u64) return h + sz_u64_ctz(match_vec.u64) / 8;
  1602. }
  1603. #endif
  1604. // Handle the misaligned tail.
  1605. for (; h < h_end; ++h)
  1606. if (*h == *n) return h;
  1607. return SZ_NULL_CHAR;
  1608. }
  1609. /**
  1610. * @brief Find the last occurrence of a @b single-character needle in an arbitrary length haystack.
  1611. * This implementation uses hardware-agnostic SWAR technique, to process 8 characters at a time.
  1612. * Identical to `memrchr(haystack, needle[0], haystack_length)`.
  1613. */
SZ_PUBLIC sz_cptr_t sz_rfind_byte_serial(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n) {
  1615. if (!h_length) return SZ_NULL_CHAR;
  1616. sz_cptr_t const h_start = h;
  1617. // Reposition the `h` pointer to the end, as we will be walking backwards.
  1618. h = h + h_length - 1;
#if !SZ_DETECT_BIG_ENDIAN // Use SWAR only on little-endian platforms for brevity.
#if !SZ_USE_MISALIGNED_LOADS // Process the misaligned head, to avoid UB on unaligned 64-bit loads.
  1621. for (; ((sz_size_t)(h + 1) & 7ull) && h >= h_start; --h)
  1622. if (*h == *n) return h;
  1623. #endif
  1624. // Broadcast the n into every byte of a 64-bit integer to use SWAR
  1625. // techniques and process eight characters at a time.
  1626. sz_u64_vec_t h_vec, n_vec, match_vec;
  1627. n_vec.u64 = (sz_u64_t)n[0] * 0x0101010101010101ull;
  1628. for (; h >= h_start + 7; h -= 8) {
  1629. h_vec.u64 = *(sz_u64_t const *)(h - 7);
  1630. match_vec = _sz_u64_each_byte_equal(h_vec, n_vec);
  1631. if (match_vec.u64) return h - sz_u64_clz(match_vec.u64) / 8;
  1632. }
  1633. #endif
  1634. for (; h >= h_start; --h)
  1635. if (*h == *n) return h;
  1636. return SZ_NULL_CHAR;
  1637. }
  1638. /**
* @brief 2-byte-level equality comparison between two 64-bit integers.
* @return 64-bit integer, where the top bit of each 2-byte pair signifies a match.
  1641. */
  1642. SZ_INTERNAL sz_u64_vec_t _sz_u64_each_2byte_equal(sz_u64_vec_t a, sz_u64_vec_t b) {
  1643. sz_u64_vec_t vec;
  1644. vec.u64 = ~(a.u64 ^ b.u64);
  1645. // The match is valid, if every bit within each 2byte is set.
  1646. // For that take the bottom 15 bits of each 2byte, add one to them,
  1647. // and if this sets the top bit to one, then all the 15 bits are ones as well.
  1648. vec.u64 = ((vec.u64 & 0x7FFF7FFF7FFF7FFFull) + 0x0001000100010001ull) & ((vec.u64 & 0x8000800080008000ull));
  1649. return vec;
  1650. }
  1651. /**
  1652. * @brief Find the first occurrence of a @b two-character needle in an arbitrary length haystack.
  1653. * This implementation uses hardware-agnostic SWAR technique, to process 8 possible offsets at a time.
  1654. */
  1655. SZ_INTERNAL sz_cptr_t _sz_find_2byte_serial(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n) {
  1656. // This is an internal method, and the haystack is guaranteed to be at least 2 bytes long.
  1657. sz_assert(h_length >= 2 && "The haystack is too short.");
  1658. sz_cptr_t const h_end = h + h_length;
  1659. #if !SZ_USE_MISALIGNED_LOADS
// Process the misaligned head, to avoid UB on unaligned 64-bit loads.
  1661. for (; ((sz_size_t)h & 7ull) && h + 2 <= h_end; ++h)
  1662. if ((h[0] == n[0]) + (h[1] == n[1]) == 2) return h;
  1663. #endif
  1664. sz_u64_vec_t h_even_vec, h_odd_vec, n_vec, matches_even_vec, matches_odd_vec;
  1665. n_vec.u64 = 0;
  1666. n_vec.u8s[0] = n[0], n_vec.u8s[1] = n[1];
  1667. n_vec.u64 *= 0x0001000100010001ull; // broadcast
  1668. // This code simulates hyper-scalar execution, analyzing 8 offsets at a time.
  1669. for (; h + 9 <= h_end; h += 8) {
  1670. h_even_vec.u64 = *(sz_u64_t *)h;
  1671. h_odd_vec.u64 = (h_even_vec.u64 >> 8) | ((sz_u64_t)h[8] << 56);
  1672. matches_even_vec = _sz_u64_each_2byte_equal(h_even_vec, n_vec);
  1673. matches_odd_vec = _sz_u64_each_2byte_equal(h_odd_vec, n_vec);
  1674. matches_even_vec.u64 >>= 8;
  1675. if (matches_even_vec.u64 + matches_odd_vec.u64) {
  1676. sz_u64_t match_indicators = matches_even_vec.u64 | matches_odd_vec.u64;
  1677. return h + sz_u64_ctz(match_indicators) / 8;
  1678. }
  1679. }
  1680. for (; h + 2 <= h_end; ++h)
  1681. if ((h[0] == n[0]) + (h[1] == n[1]) == 2) return h;
  1682. return SZ_NULL_CHAR;
  1683. }
  1684. /**
* @brief 4-byte-level equality comparison between two 64-bit integers.
* @return 64-bit integer, where the top bit of each 4-byte group signifies a match.
  1687. */
  1688. SZ_INTERNAL sz_u64_vec_t _sz_u64_each_4byte_equal(sz_u64_vec_t a, sz_u64_vec_t b) {
  1689. sz_u64_vec_t vec;
  1690. vec.u64 = ~(a.u64 ^ b.u64);
  1691. // The match is valid, if every bit within each 4byte is set.
  1692. // For that take the bottom 31 bits of each 4byte, add one to them,
  1693. // and if this sets the top bit to one, then all the 31 bits are ones as well.
  1694. vec.u64 = ((vec.u64 & 0x7FFFFFFF7FFFFFFFull) + 0x0000000100000001ull) & ((vec.u64 & 0x8000000080000000ull));
  1695. return vec;
  1696. }
  1697. /**
  1698. * @brief Find the first occurrence of a @b four-character needle in an arbitrary length haystack.
  1699. * This implementation uses hardware-agnostic SWAR technique, to process 8 possible offsets at a time.
  1700. */
  1701. SZ_INTERNAL sz_cptr_t _sz_find_4byte_serial(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n) {
  1702. // This is an internal method, and the haystack is guaranteed to be at least 4 bytes long.
  1703. sz_assert(h_length >= 4 && "The haystack is too short.");
  1704. sz_cptr_t const h_end = h + h_length;
  1705. #if !SZ_USE_MISALIGNED_LOADS
// Process the misaligned head, to avoid UB on unaligned 64-bit loads.
  1707. for (; ((sz_size_t)h & 7ull) && h + 4 <= h_end; ++h)
  1708. if ((h[0] == n[0]) + (h[1] == n[1]) + (h[2] == n[2]) + (h[3] == n[3]) == 4) return h;
  1709. #endif
  1710. sz_u64_vec_t h0_vec, h1_vec, h2_vec, h3_vec, n_vec, matches0_vec, matches1_vec, matches2_vec, matches3_vec;
  1711. n_vec.u64 = 0;
  1712. n_vec.u8s[0] = n[0], n_vec.u8s[1] = n[1], n_vec.u8s[2] = n[2], n_vec.u8s[3] = n[3];
  1713. n_vec.u64 *= 0x0000000100000001ull; // broadcast
  1714. // This code simulates hyper-scalar execution, analyzing 8 offsets at a time using four 64-bit words.
  1715. // We load the subsequent four-byte word as well, taking its first bytes. Think of it as a glorified prefetch :)
  1716. sz_u64_t h_page_current, h_page_next;
  1717. for (; h + sizeof(sz_u64_t) + sizeof(sz_u32_t) <= h_end; h += sizeof(sz_u64_t)) {
  1718. h_page_current = *(sz_u64_t *)h;
  1719. h_page_next = *(sz_u32_t *)(h + 8);
  1720. h0_vec.u64 = (h_page_current);
  1721. h1_vec.u64 = (h_page_current >> 8) | (h_page_next << 56);
  1722. h2_vec.u64 = (h_page_current >> 16) | (h_page_next << 48);
  1723. h3_vec.u64 = (h_page_current >> 24) | (h_page_next << 40);
  1724. matches0_vec = _sz_u64_each_4byte_equal(h0_vec, n_vec);
  1725. matches1_vec = _sz_u64_each_4byte_equal(h1_vec, n_vec);
  1726. matches2_vec = _sz_u64_each_4byte_equal(h2_vec, n_vec);
  1727. matches3_vec = _sz_u64_each_4byte_equal(h3_vec, n_vec);
  1728. if (matches0_vec.u64 | matches1_vec.u64 | matches2_vec.u64 | matches3_vec.u64) {
  1729. matches0_vec.u64 >>= 24;
  1730. matches1_vec.u64 >>= 16;
  1731. matches2_vec.u64 >>= 8;
  1732. sz_u64_t match_indicators = matches0_vec.u64 | matches1_vec.u64 | matches2_vec.u64 | matches3_vec.u64;
  1733. return h + sz_u64_ctz(match_indicators) / 8;
  1734. }
  1735. }
  1736. for (; h + 4 <= h_end; ++h)
  1737. if ((h[0] == n[0]) + (h[1] == n[1]) + (h[2] == n[2]) + (h[3] == n[3]) == 4) return h;
  1738. return SZ_NULL_CHAR;
  1739. }
  1740. /**
* @brief 3-byte-level equality comparison between two 64-bit integers.
* @return 64-bit integer, where the top bit of each 3-byte group signifies a match.
  1743. */
  1744. SZ_INTERNAL sz_u64_vec_t _sz_u64_each_3byte_equal(sz_u64_vec_t a, sz_u64_vec_t b) {
  1745. sz_u64_vec_t vec;
  1746. vec.u64 = ~(a.u64 ^ b.u64);
// The match is valid, if every bit within each 3-byte group is set.
// For that take the bottom 23 bits of each group, add one to them,
// and if this sets the top bit to one, then all the 23 bits are ones as well.
  1750. vec.u64 = ((vec.u64 & 0xFFFF7FFFFF7FFFFFull) + 0x0000000001000001ull) & ((vec.u64 & 0x0000800000800000ull));
  1751. return vec;
  1752. }

/**
 *  @brief  Find the first occurrence of a @b three-character needle in an arbitrary length haystack.
 *          This implementation uses hardware-agnostic SWAR technique, to process 8 possible offsets at a time.
 */
SZ_INTERNAL sz_cptr_t _sz_find_3byte_serial(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n) {

    // This is an internal method, and the haystack is guaranteed to be at least 3 bytes long.
    sz_assert(h_length >= 3 && "The haystack is too short.");
    sz_cptr_t const h_end = h + h_length;

#if !SZ_USE_MISALIGNED_LOADS
    // Process the misaligned head, to avoid UB on unaligned 64-bit loads.
    for (; ((sz_size_t)h & 7ull) && h + 3 <= h_end; ++h)
        if ((h[0] == n[0]) + (h[1] == n[1]) + (h[2] == n[2]) == 3) return h;
#endif

    // We fetch 10 bytes per iteration: an eight-byte word and the first two bytes of the next one.
    sz_u64_vec_t h0_vec, h1_vec, h2_vec, h3_vec, h4_vec;
    sz_u64_vec_t matches0_vec, matches1_vec, matches2_vec, matches3_vec, matches4_vec;
    sz_u64_vec_t n_vec;
    n_vec.u64 = 0;
    // Only the first three bytes of the needle are meaningful - reading `n[3]` would be out of bounds.
    n_vec.u8s[0] = n[0], n_vec.u8s[1] = n[1], n_vec.u8s[2] = n[2];
    n_vec.u64 *= 0x0000000001000001ull; // broadcast

    // This code simulates hyper-scalar execution, analyzing 8 offsets at a time using five 64-bit words.
    // We load the subsequent two-byte word as well.
    sz_u64_t h_page_current, h_page_next;
    for (; h + sizeof(sz_u64_t) + sizeof(sz_u16_t) <= h_end; h += sizeof(sz_u64_t)) {
        h_page_current = *(sz_u64_t *)h;
        h_page_next = *(sz_u16_t *)(h + 8);
        h0_vec.u64 = (h_page_current);
        h1_vec.u64 = (h_page_current >> 8) | (h_page_next << 56);
        h2_vec.u64 = (h_page_current >> 16) | (h_page_next << 48);
        h3_vec.u64 = (h_page_current >> 24) | (h_page_next << 40);
        h4_vec.u64 = (h_page_current >> 32) | (h_page_next << 32);
        matches0_vec = _sz_u64_each_3byte_equal(h0_vec, n_vec);
        matches1_vec = _sz_u64_each_3byte_equal(h1_vec, n_vec);
        matches2_vec = _sz_u64_each_3byte_equal(h2_vec, n_vec);
        matches3_vec = _sz_u64_each_3byte_equal(h3_vec, n_vec);
        matches4_vec = _sz_u64_each_3byte_equal(h4_vec, n_vec);

        if (matches0_vec.u64 | matches1_vec.u64 | matches2_vec.u64 | matches3_vec.u64 | matches4_vec.u64) {
            matches0_vec.u64 >>= 16;
            matches1_vec.u64 >>= 8;
            matches3_vec.u64 <<= 8;
            matches4_vec.u64 <<= 16;
            sz_u64_t match_indicators =
                matches0_vec.u64 | matches1_vec.u64 | matches2_vec.u64 | matches3_vec.u64 | matches4_vec.u64;
            return h + sz_u64_ctz(match_indicators) / 8;
        }
    }

    for (; h + 3 <= h_end; ++h)
        if ((h[0] == n[0]) + (h[1] == n[1]) + (h[2] == n[2]) == 3) return h;
    return SZ_NULL_CHAR;
}

/**
 *  @brief  Boyer-Moore-Horspool algorithm for exact matching of patterns up to @b 256-bytes long.
 *          Uses the Raita heuristic to match the first two, the last, and the middle character of the pattern.
 */
SZ_INTERNAL sz_cptr_t _sz_find_horspool_upto_256bytes_serial(sz_cptr_t h_chars, sz_size_t h_length, //
                                                             sz_cptr_t n_chars, sz_size_t n_length) {
    sz_assert(n_length <= 256 && "The pattern is too long.");
    // Several popular string matching algorithms are using a bad-character shift table.
    // Boyer Moore: https://www-igm.univ-mlv.fr/~lecroq/string/node14.html
    // Quick Search: https://www-igm.univ-mlv.fr/~lecroq/string/node19.html
    // Smith: https://www-igm.univ-mlv.fr/~lecroq/string/node21.html
    union {
        sz_u8_t jumps[256];
        sz_u64_vec_t vecs[64];
    } bad_shift_table;

    // Let's initialize the table using SWAR to the total length of the string.
    sz_u8_t const *h = (sz_u8_t const *)h_chars;
    sz_u8_t const *n = (sz_u8_t const *)n_chars;
    {
        sz_u64_vec_t n_length_vec;
        n_length_vec.u64 = n_length;
        n_length_vec.u64 *= 0x0101010101010101ull; // broadcast
        for (sz_size_t i = 0; i != 64; ++i) bad_shift_table.vecs[i].u64 = n_length_vec.u64;
        for (sz_size_t i = 0; i + 1 < n_length; ++i) bad_shift_table.jumps[n[i]] = (sz_u8_t)(n_length - i - 1);
    }

    // Another common heuristic is to match a few characters from different parts of a string.
    // Raita suggests to use the first two, the last, and the middle character of the pattern.
    sz_u32_vec_t h_vec, n_vec;

    // Pick the parts of the needle that are worth comparing.
    sz_size_t offset_first, offset_mid, offset_last;
    _sz_locate_needle_anomalies(n_chars, n_length, &offset_first, &offset_mid, &offset_last);

    // Broadcast those characters into an unsigned integer.
    n_vec.u8s[0] = n[offset_first];
    n_vec.u8s[1] = n[offset_first + 1];
    n_vec.u8s[2] = n[offset_mid];
    n_vec.u8s[3] = n[offset_last];

    // Scan through the whole haystack, skipping the last `n_length - 1` bytes.
    for (sz_size_t i = 0; i <= h_length - n_length;) {
        h_vec.u8s[0] = h[i + offset_first];
        h_vec.u8s[1] = h[i + offset_first + 1];
        h_vec.u8s[2] = h[i + offset_mid];
        h_vec.u8s[3] = h[i + offset_last];
        if (h_vec.u32 == n_vec.u32 && sz_equal((sz_cptr_t)h + i, n_chars, n_length)) return (sz_cptr_t)h + i;
        i += bad_shift_table.jumps[h[i + n_length - 1]];
    }
    return SZ_NULL_CHAR;
}
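
/*
 *  To see how the bad-character table drives the scan (an editorial example): for the needle
 *  "word" every byte's jump defaults to 4, while 'w' -> 3, 'o' -> 2, and 'r' -> 1; the last
 *  byte 'd' keeps the default. So when the window's trailing character doesn't occur in the
 *  needle at all, we leap the full 4 bytes - that's what makes Horspool sub-linear on average.
 */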

/**
 *  @brief  Boyer-Moore-Horspool algorithm for @b reverse-order exact matching of patterns up to @b 256-bytes long.
 *          Uses the Raita heuristic to match the first two, the last, and the middle character of the pattern.
 */
SZ_INTERNAL sz_cptr_t _sz_rfind_horspool_upto_256bytes_serial(sz_cptr_t h_chars, sz_size_t h_length, //
                                                              sz_cptr_t n_chars, sz_size_t n_length) {
    sz_assert(n_length <= 256 && "The pattern is too long.");
    union {
        sz_u8_t jumps[256];
        sz_u64_vec_t vecs[64];
    } bad_shift_table;

    // Let's initialize the table using SWAR to the total length of the string.
    sz_u8_t const *h = (sz_u8_t const *)h_chars;
    sz_u8_t const *n = (sz_u8_t const *)n_chars;
    {
        sz_u64_vec_t n_length_vec;
        n_length_vec.u64 = n_length;
        n_length_vec.u64 *= 0x0101010101010101ull; // broadcast
        for (sz_size_t i = 0; i != 64; ++i) bad_shift_table.vecs[i].u64 = n_length_vec.u64;
        for (sz_size_t i = 0; i + 1 < n_length; ++i)
            bad_shift_table.jumps[n[n_length - i - 1]] = (sz_u8_t)(n_length - i - 1);
    }

    // Another common heuristic is to match a few characters from different parts of a string.
    // Raita suggests to use the first two, the last, and the middle character of the pattern.
    sz_u32_vec_t h_vec, n_vec;

    // Pick the parts of the needle that are worth comparing.
    sz_size_t offset_first, offset_mid, offset_last;
    _sz_locate_needle_anomalies(n_chars, n_length, &offset_first, &offset_mid, &offset_last);

    // Broadcast those characters into an unsigned integer.
    n_vec.u8s[0] = n[offset_first];
    n_vec.u8s[1] = n[offset_first + 1];
    n_vec.u8s[2] = n[offset_mid];
    n_vec.u8s[3] = n[offset_last];

    // Scan through the whole haystack, skipping the first `n_length - 1` bytes.
    for (sz_size_t j = 0; j <= h_length - n_length;) {
        sz_size_t i = h_length - n_length - j;
        h_vec.u8s[0] = h[i + offset_first];
        h_vec.u8s[1] = h[i + offset_first + 1];
        h_vec.u8s[2] = h[i + offset_mid];
        h_vec.u8s[3] = h[i + offset_last];
        if (h_vec.u32 == n_vec.u32 && sz_equal((sz_cptr_t)h + i, n_chars, n_length)) return (sz_cptr_t)h + i;
        j += bad_shift_table.jumps[h[i]];
    }
    return SZ_NULL_CHAR;
}

/**
 *  @brief  Exact substring search helper function that finds the first occurrence of a prefix of the needle
 *          using a given search function, and then verifies the remaining part of the needle.
 */
SZ_INTERNAL sz_cptr_t _sz_find_with_prefix(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length,
                                           sz_find_t find_prefix, sz_size_t prefix_length) {

    sz_size_t suffix_length = n_length - prefix_length;
    while (1) {
        sz_cptr_t found = find_prefix(h, h_length, n, prefix_length);
        if (!found) return SZ_NULL_CHAR;

        // Verify the remaining part of the needle - the whole needle must fit into what's left of the haystack.
        sz_size_t remaining = h_length - (found - h);
        if (remaining < n_length) return SZ_NULL_CHAR;
        if (sz_equal(found + prefix_length, n + prefix_length, suffix_length)) return found;

        // Adjust the position.
        h = found + 1;
        h_length = remaining - 1;
    }

    // Unreachable, but helps silence compiler warnings:
    return SZ_NULL_CHAR;
}
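
/*
 *  For example (editorial sketch), a 6-byte needle "needle" can be located by finding the
 *  4-byte prefix "need" with `_sz_find_4byte_serial` and then verifying the "le" suffix with
 *  `sz_equal`, retrying one byte further on a partial match - exactly how the
 *  `_sz_find_over_4bytes_serial` wrapper below composes those pieces.
 */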

/**
 *  @brief  Exact reverse-order substring search helper function that finds the last occurrence of a suffix of the
 *          needle using a given search function, and then verifies the remaining part of the needle.
 */
SZ_INTERNAL sz_cptr_t _sz_rfind_with_suffix(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length,
                                            sz_find_t find_suffix, sz_size_t suffix_length) {

    sz_size_t prefix_length = n_length - suffix_length;
    while (1) {
        sz_cptr_t found = find_suffix(h, h_length, n + prefix_length, suffix_length);
        if (!found) return SZ_NULL_CHAR;

        // Verify the remaining part of the needle
        sz_size_t remaining = found - h;
        if (remaining < prefix_length) return SZ_NULL_CHAR;
        if (sz_equal(found - prefix_length, n, prefix_length)) return found - prefix_length;

        // Adjust the position, keeping all but the last byte of the just-found suffix occurrence
        // searchable, so earlier overlapping occurrences of the suffix are not skipped.
        h_length = remaining + suffix_length - 1;
    }

    // Unreachable, but helps silence compiler warnings:
    return SZ_NULL_CHAR;
}

SZ_INTERNAL sz_cptr_t _sz_find_over_4bytes_serial(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) {
    return _sz_find_with_prefix(h, h_length, n, n_length, (sz_find_t)_sz_find_4byte_serial, 4);
}

SZ_INTERNAL sz_cptr_t _sz_find_horspool_over_256bytes_serial(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n,
                                                             sz_size_t n_length) {
    return _sz_find_with_prefix(h, h_length, n, n_length, _sz_find_horspool_upto_256bytes_serial, 256);
}

SZ_INTERNAL sz_cptr_t _sz_rfind_horspool_over_256bytes_serial(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n,
                                                              sz_size_t n_length) {
    return _sz_rfind_with_suffix(h, h_length, n, n_length, _sz_rfind_horspool_upto_256bytes_serial, 256);
}

SZ_PUBLIC sz_cptr_t sz_find_serial(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) {

    // This almost never fires, but it's better to be safe than sorry.
    if (h_length < n_length || !n_length) return SZ_NULL_CHAR;

#if SZ_DETECT_BIG_ENDIAN
    sz_find_t backends[] = {
        (sz_find_t)sz_find_byte_serial,
        (sz_find_t)_sz_find_horspool_upto_256bytes_serial,
        (sz_find_t)_sz_find_horspool_over_256bytes_serial,
    };

    return backends[(n_length > 1) + (n_length > 256)](h, h_length, n, n_length);
#else
    sz_find_t backends[] = {
        // For very short strings brute-force SWAR makes sense.
        (sz_find_t)sz_find_byte_serial,
        (sz_find_t)_sz_find_2byte_serial,
        (sz_find_t)_sz_find_3byte_serial,
        (sz_find_t)_sz_find_4byte_serial,
        // To avoid constructing the skip-table, let's use the prefixed approach.
        (sz_find_t)_sz_find_over_4bytes_serial,
        // For longer needles - use skip tables.
        (sz_find_t)_sz_find_horspool_upto_256bytes_serial,
        (sz_find_t)_sz_find_horspool_over_256bytes_serial,
    };

    return backends[
        // For very short strings brute-force SWAR makes sense.
        (n_length > 1) + (n_length > 2) + (n_length > 3) +
        // To avoid constructing the skip-table, let's use the prefixed approach.
        (n_length > 4) +
        // For longer needles - use skip tables.
        (n_length > 8) + (n_length > 256)](h, h_length, n, n_length);
#endif
}
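
/*
 *  The branchless dispatch above maps needle lengths to backends as follows (editorial
 *  summary): 1 -> byte search, 2/3/4 -> the dedicated SWAR kernels, 5..8 -> 4-byte prefix
 *  search plus verification, 9..256 -> Horspool with a skip-table, and 257+ -> Horspool over
 *  a 256-byte prefix plus verification. For `n_length == 6` the index evaluates to
 *  1 + 1 + 1 + 1 + 0 + 0 == 4, selecting `_sz_find_over_4bytes_serial`.
 */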

SZ_PUBLIC sz_cptr_t sz_rfind_serial(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) {

    // This almost never fires, but it's better to be safe than sorry.
    if (h_length < n_length || !n_length) return SZ_NULL_CHAR;

    sz_find_t backends[] = {
        // For very short strings brute-force SWAR makes sense.
        (sz_find_t)sz_rfind_byte_serial,
        // TODO: implement reverse-order SWAR for 2/3/4 byte variants.
        // TODO: (sz_find_t)_sz_rfind_2byte_serial,
        // TODO: (sz_find_t)_sz_rfind_3byte_serial,
        // TODO: (sz_find_t)_sz_rfind_4byte_serial,
        // To avoid constructing the skip-table, let's use the prefixed approach.
        // (sz_find_t)_sz_rfind_over_4bytes_serial,
        // For longer needles - use skip tables.
        (sz_find_t)_sz_rfind_horspool_upto_256bytes_serial,
        (sz_find_t)_sz_rfind_horspool_over_256bytes_serial,
    };

    return backends[
        // For very short strings brute-force SWAR makes sense.
        0 +
        // To avoid constructing the skip-table, let's use the prefixed approach.
        (n_length > 1) +
        // For longer needles - use skip tables.
        (n_length > 256)](h, h_length, n, n_length);
}

SZ_INTERNAL sz_size_t _sz_edit_distance_skewed_diagonals_serial( //
    sz_cptr_t shorter, sz_size_t shorter_length,                 //
    sz_cptr_t longer, sz_size_t longer_length,                   //
    sz_size_t bound, sz_memory_allocator_t *alloc) {

    // Simplify usage in higher-level libraries, where wrapping custom allocators may be troublesome.
    sz_memory_allocator_t global_alloc;
    if (!alloc) {
        sz_memory_allocator_init_default(&global_alloc);
        alloc = &global_alloc;
    }

    // TODO: Generalize to remove the following asserts!
    sz_assert(!bound && "For bounded search the method should only evaluate one band of the matrix.");
    sz_assert(shorter_length == longer_length && "The method hasn't been generalized to different length inputs yet.");
    sz_unused(longer_length && bound);

    // We are going to store 3 diagonals of the matrix.
    // The length of the longest (main) diagonal would be `n = (shorter_length + 1)`.
    sz_size_t n = shorter_length + 1;
    sz_size_t buffer_length = sizeof(sz_size_t) * n * 3;
    sz_size_t *distances = (sz_size_t *)alloc->allocate(buffer_length, alloc->handle);
    if (!distances) return SZ_SIZE_MAX;

    sz_size_t *previous_distances = distances;
    sz_size_t *current_distances = previous_distances + n;
    sz_size_t *next_distances = previous_distances + n * 2;

    // Initialize the first two diagonals:
    previous_distances[0] = 0;
    current_distances[0] = current_distances[1] = 1;

    // Progress through the upper triangle of the Levenshtein matrix.
    sz_size_t next_skew_diagonal_index = 2;
    for (; next_skew_diagonal_index != n; ++next_skew_diagonal_index) {
        sz_size_t const next_skew_diagonal_length = next_skew_diagonal_index + 1;
        for (sz_size_t i = 0; i + 2 < next_skew_diagonal_length; ++i) {
            sz_size_t cost_of_substitution = shorter[next_skew_diagonal_index - i - 2] != longer[i];
            sz_size_t cost_if_substitution = previous_distances[i] + cost_of_substitution;
            sz_size_t cost_if_deletion_or_insertion = sz_min_of_two(current_distances[i], current_distances[i + 1]) + 1;
            next_distances[i + 1] = sz_min_of_two(cost_if_deletion_or_insertion, cost_if_substitution);
        }
        // Don't forget to populate the first row and the first column of the Levenshtein matrix.
        next_distances[0] = next_distances[next_skew_diagonal_length - 1] = next_skew_diagonal_index;
        // Perform a circular rotation of those buffers, to reuse the memory.
        sz_size_t *temporary = previous_distances;
        previous_distances = current_distances;
        current_distances = next_distances;
        next_distances = temporary;
    }

    // By now we've scanned through the upper triangle of the matrix, where each subsequent iteration results in a
    // larger diagonal. From now onwards, we will be shrinking. Instead of appending a value equal to the skewed
    // diagonal index on either side, we will be cropping those values out.
    sz_size_t total_diagonals = n + n - 1;
    for (; next_skew_diagonal_index != total_diagonals; ++next_skew_diagonal_index) {
        sz_size_t const next_skew_diagonal_length = total_diagonals - next_skew_diagonal_index;
        for (sz_size_t i = 0; i != next_skew_diagonal_length; ++i) {
            sz_size_t cost_of_substitution =
                shorter[shorter_length - 1 - i] != longer[next_skew_diagonal_index - n + i];
            sz_size_t cost_if_substitution = previous_distances[i] + cost_of_substitution;
            sz_size_t cost_if_deletion_or_insertion = sz_min_of_two(current_distances[i], current_distances[i + 1]) + 1;
            next_distances[i] = sz_min_of_two(cost_if_deletion_or_insertion, cost_if_substitution);
        }
        // Perform a circular rotation of those buffers, to reuse the memory, this time, with a shift,
        // dropping the first element in the current array.
        sz_size_t *temporary = previous_distances;
        previous_distances = current_distances + 1;
        current_distances = next_distances;
        next_distances = temporary;
    }

    // Cache the scalar before the `free` call.
    sz_size_t result = current_distances[0];
    alloc->free(distances, buffer_length, alloc->handle);
    return result;
}
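
/*
 *  To make the skewed-diagonal traversal concrete (editorial sketch): for two 4-byte strings
 *  the 5 x 5 Levenshtein matrix is walked along its 9 anti-diagonals. Diagonal 0 holds just
 *  D[0][0], diagonal 1 holds D[1][0] and D[0][1], and so on; every new diagonal depends only
 *  on the previous two, hence the three rotating buffers of `n` cells each. The answer is the
 *  single cell D[4][4] of the last diagonal.
 */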

/**
 *  @brief  Describes the length of a UTF8 character / codepoint / rune in bytes.
 */
typedef enum {
    sz_utf8_invalid_k = 0,     //!< Invalid UTF8 character.
    sz_utf8_rune_1byte_k = 1,  //!< 1-byte UTF8 character.
    sz_utf8_rune_2bytes_k = 2, //!< 2-byte UTF8 character.
    sz_utf8_rune_3bytes_k = 3, //!< 3-byte UTF8 character.
    sz_utf8_rune_4bytes_k = 4, //!< 4-byte UTF8 character.
} sz_rune_length_t;

typedef sz_u32_t sz_rune_t;

/**
 *  @brief  Extracts just one UTF8 codepoint from a UTF8 string into a 32-bit unsigned integer.
 */
SZ_INTERNAL void _sz_extract_utf8_rune(sz_cptr_t utf8, sz_rune_t *code, sz_rune_length_t *code_length) {
    sz_u8_t const *current = (sz_u8_t const *)utf8;
    sz_u8_t leading_byte = *current++;
    sz_rune_t ch;
    sz_rune_length_t ch_length;

    // TODO: This can be made entirely branchless using 32-bit SWAR.
    if (leading_byte < 0x80) {
        // Single-byte rune (0xxxxxxx)
        ch = leading_byte;
        ch_length = sz_utf8_rune_1byte_k;
    }
    else if ((leading_byte & 0xE0) == 0xC0) {
        // Two-byte rune (110xxxxx 10xxxxxx)
        ch = (leading_byte & 0x1F) << 6;
        ch |= (*current++ & 0x3F);
        ch_length = sz_utf8_rune_2bytes_k;
    }
    else if ((leading_byte & 0xF0) == 0xE0) {
        // Three-byte rune (1110xxxx 10xxxxxx 10xxxxxx)
        ch = (leading_byte & 0x0F) << 12;
        ch |= (*current++ & 0x3F) << 6;
        ch |= (*current++ & 0x3F);
        ch_length = sz_utf8_rune_3bytes_k;
    }
    else if ((leading_byte & 0xF8) == 0xF0) {
        // Four-byte rune (11110xxx 10xxxxxx 10xxxxxx 10xxxxxx)
        ch = (leading_byte & 0x07) << 18;
        ch |= (*current++ & 0x3F) << 12;
        ch |= (*current++ & 0x3F) << 6;
        ch |= (*current++ & 0x3F);
        ch_length = sz_utf8_rune_4bytes_k;
    }
    else {
        // Invalid UTF8 rune.
        ch = 0;
        ch_length = sz_utf8_invalid_k;
    }
    *code = ch;
    *code_length = ch_length;
}
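
/*
 *  For instance (editorial example), decoding the Euro sign, bytes 0xE2 0x82 0xAC: the leading
 *  byte matches the 1110xxxx pattern, contributing its low 4 bits, and each continuation byte
 *  contributes 6 more: (0x2 << 12) | (0x02 << 6) | 0x2C == 0x20AC, reported with the
 *  `sz_utf8_rune_3bytes_k` length.
 */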

/**
 *  @brief  Exports a UTF8 string into a UTF32 buffer.
 *          ! The result is undefined if the UTF8 string is corrupted.
 *  @return The length in the number of codepoints.
 */
SZ_INTERNAL sz_size_t _sz_export_utf8_to_utf32(sz_cptr_t utf8, sz_size_t utf8_length, sz_rune_t *utf32) {
    sz_cptr_t const end = utf8 + utf8_length;
    sz_size_t count = 0;
    sz_rune_length_t rune_length;
    for (; utf8 != end; utf8 += rune_length, utf32++, count++) _sz_extract_utf8_rune(utf8, utf32, &rune_length);
    return count;
}

/**
 *  @brief  Compute the Levenshtein distance between two strings using the Wagner-Fisher algorithm.
 *          Stores only 2 rows of the Levenshtein matrix, but uses 64-bit integers for the distance values,
 *          and upcasts UTF8 variable-length codepoints to 64-bit integers for faster addressing.
 *
 *  ! In the worst case, for two strings of length 100 that contain just one multi-byte codepoint, this results in extra:
 *      + 2 rows * 100 slots * 8 bytes/slot = 1600 bytes of memory for the two rows of the Levenshtein matrix.
 *      + 100 codepoints * 2 strings * 4 bytes/codepoint = 800 bytes of memory for the decoded UTF32 buffer.
 *      = 2400 bytes of memory, or @b 12x memory amplification!
 */
SZ_INTERNAL sz_size_t _sz_edit_distance_wagner_fisher_serial( //
    sz_cptr_t longer, sz_size_t longer_length,                //
    sz_cptr_t shorter, sz_size_t shorter_length,              //
    sz_size_t bound, sz_bool_t can_be_unicode, sz_memory_allocator_t *alloc) {

    // Simplify usage in higher-level libraries, where wrapping custom allocators may be troublesome.
    sz_memory_allocator_t global_alloc;
    if (!alloc) {
        sz_memory_allocator_init_default(&global_alloc);
        alloc = &global_alloc;
    }

    // A good idea may be to dispatch different kernels for different string lengths,
    // like using `uint8_t` counters for strings under 255 characters long.
    // Good in theory, this results in frequent upcasts and downcasts in serial code.
    // On strings over 20 bytes, using `uint8_t` over `uint64_t` on a 64-bit x86 CPU doubles the execution time.
    // So one must be very cautious with such optimizations.
    typedef sz_size_t _distance_t;

    // Compute the number of columns in our Levenshtein matrix.
    sz_size_t const n = shorter_length + 1;

    // If a buffering memory-allocator is provided, this operation is practically free,
    // and cheaper than allocating even 512 bytes (for small distance matrices) on the stack.
    sz_size_t buffer_length = sizeof(_distance_t) * (n * 2);

    // If the strings contain Unicode characters, let's estimate the max character width,
    // and use it to allocate a larger buffer to decode UTF8.
    if ((can_be_unicode == sz_true_k) &&
        (sz_isascii(longer, longer_length) == sz_false_k || sz_isascii(shorter, shorter_length) == sz_false_k)) {
        buffer_length += (shorter_length + longer_length) * sizeof(sz_rune_t);
    }
    else { can_be_unicode = sz_false_k; }

    // If the allocation fails, return the maximum distance.
    sz_ptr_t const buffer = (sz_ptr_t)alloc->allocate(buffer_length, alloc->handle);
    if (!buffer) return SZ_SIZE_MAX;

    // Let's export the UTF8 sequences into the newly allocated buffer at the end.
    if (can_be_unicode == sz_true_k) {
        sz_rune_t *const longer_utf32 = (sz_rune_t *)(buffer + sizeof(_distance_t) * (n * 2));
        sz_rune_t *const shorter_utf32 = longer_utf32 + longer_length;
        longer_length = _sz_export_utf8_to_utf32(longer, longer_length, longer_utf32);
        shorter_length = _sz_export_utf8_to_utf32(shorter, shorter_length, shorter_utf32);
        longer = (sz_cptr_t)longer_utf32;
        shorter = (sz_cptr_t)shorter_utf32;
    }

    // Let's parameterize the core logic for different character types and distance types.
#define _wagner_fisher_unbounded(_distance_t, _char_t)                                                                 \
    /* Now let's cast our pointers once, to avoid repeating the casts in subsequent sections. */                       \
    _char_t const *const longer_chars = (_char_t const *)longer;                                                       \
    _char_t const *const shorter_chars = (_char_t const *)shorter;                                                     \
    _distance_t *previous_distances = (_distance_t *)buffer;                                                           \
    _distance_t *current_distances = previous_distances + n;                                                           \
    /* Initialize the first row of the Levenshtein matrix with `iota`-style arithmetic progression. */                 \
    for (_distance_t idx_shorter = 0; idx_shorter != n; ++idx_shorter) previous_distances[idx_shorter] = idx_shorter;  \
    /* The main loop of the algorithm with quadratic complexity. */                                                    \
    for (_distance_t idx_longer = 0; idx_longer != longer_length; ++idx_longer) {                                      \
        _char_t const longer_char = longer_chars[idx_longer];                                                          \
        /* Using pure pointer arithmetic is faster than iterating with an index. */                                    \
        _char_t const *shorter_ptr = shorter_chars;                                                                    \
        _distance_t const *previous_ptr = previous_distances;                                                          \
        _distance_t *current_ptr = current_distances;                                                                  \
        _distance_t *const current_end = current_ptr + shorter_length;                                                 \
        current_ptr[0] = idx_longer + 1;                                                                               \
        for (; current_ptr != current_end; ++previous_ptr, ++current_ptr, ++shorter_ptr) {                             \
            _distance_t cost_substitution = previous_ptr[0] + (_distance_t)(longer_char != shorter_ptr[0]);            \
            /* We can avoid `+1` for costs here, shifting it to post-minimum computation, */                           \
            /* saving one increment operation. */                                                                      \
            _distance_t cost_deletion = previous_ptr[1];                                                               \
            _distance_t cost_insertion = current_ptr[0];                                                               \
            /* ? It might be a good idea to enforce branchless execution here. */                                      \
            /* ? The caveat being that the benchmarks on longer sequences backfire and more research is needed. */     \
            current_ptr[1] = sz_min_of_two(cost_substitution, sz_min_of_two(cost_deletion, cost_insertion) + 1);       \
        }                                                                                                              \
        /* Swap the `previous_distances` and `current_distances` pointers. */                                          \
        _distance_t *temporary = previous_distances;                                                                   \
        previous_distances = current_distances;                                                                        \
        current_distances = temporary;                                                                                 \
    }                                                                                                                  \
    /* Cache the scalar before the `free` call. */                                                                     \
    sz_size_t result = previous_distances[shorter_length];                                                             \
    alloc->free(buffer, buffer_length, alloc->handle);                                                                 \
    return result;

    // Let's define a separate variant for bounded distance computation.
    // Practically the same as unbounded, but also collecting the running minimum within each row for early exit.
#define _wagner_fisher_bounded(_distance_t, _char_t)                                                                   \
    _char_t const *const longer_chars = (_char_t const *)longer;                                                       \
    _char_t const *const shorter_chars = (_char_t const *)shorter;                                                     \
    _distance_t *previous_distances = (_distance_t *)buffer;                                                           \
    _distance_t *current_distances = previous_distances + n;                                                           \
    for (_distance_t idx_shorter = 0; idx_shorter != n; ++idx_shorter) previous_distances[idx_shorter] = idx_shorter;  \
    for (_distance_t idx_longer = 0; idx_longer != longer_length; ++idx_longer) {                                      \
        _char_t const longer_char = longer_chars[idx_longer];                                                          \
        _char_t const *shorter_ptr = shorter_chars;                                                                    \
        _distance_t const *previous_ptr = previous_distances;                                                          \
        _distance_t *current_ptr = current_distances;                                                                  \
        _distance_t *const current_end = current_ptr + shorter_length;                                                 \
        current_ptr[0] = idx_longer + 1;                                                                               \
        /* Track the running minimum of the entire row, including the first column, */                                 \
        /* so we can safely exit early once the whole row reaches the bound. */                                        \
        _distance_t min_distance = current_ptr[0];                                                                     \
        for (; current_ptr != current_end; ++previous_ptr, ++current_ptr, ++shorter_ptr) {                             \
            _distance_t cost_substitution = previous_ptr[0] + (_distance_t)(longer_char != shorter_ptr[0]);            \
            _distance_t cost_deletion = previous_ptr[1];                                                               \
            _distance_t cost_insertion = current_ptr[0];                                                               \
            current_ptr[1] = sz_min_of_two(cost_substitution, sz_min_of_two(cost_deletion, cost_insertion) + 1);       \
            /* Keep track of the minimum distance seen so far in this row. */                                          \
            min_distance = sz_min_of_two(current_ptr[1], min_distance);                                                \
        }                                                                                                              \
        /* If the minimum distance in this row already reached the bound, return early. */                             \
        if (min_distance >= bound) {                                                                                   \
            alloc->free(buffer, buffer_length, alloc->handle);                                                         \
            return bound;                                                                                              \
        }                                                                                                              \
        _distance_t *temporary = previous_distances;                                                                   \
        previous_distances = current_distances;                                                                        \
        current_distances = temporary;                                                                                 \
    }                                                                                                                  \
    sz_size_t result = previous_distances[shorter_length];                                                             \
    alloc->free(buffer, buffer_length, alloc->handle);                                                                 \
    return sz_min_of_two(result, bound);

    // Dispatch the actual computation.
    if (!bound) {
        if (can_be_unicode == sz_true_k) { _wagner_fisher_unbounded(sz_size_t, sz_rune_t); }
        else { _wagner_fisher_unbounded(sz_size_t, sz_u8_t); }
    }
    else {
        if (can_be_unicode == sz_true_k) { _wagner_fisher_bounded(sz_size_t, sz_rune_t); }
        else { _wagner_fisher_bounded(sz_size_t, sz_u8_t); }
    }
}

SZ_PUBLIC sz_size_t sz_edit_distance_serial(     //
    sz_cptr_t longer, sz_size_t longer_length,   //
    sz_cptr_t shorter, sz_size_t shorter_length, //
    sz_size_t bound, sz_memory_allocator_t *alloc) {

    // Let's make sure that we use the amount of memory proportional to the
    // length of the shorter string, not the longer one.
    if (shorter_length > longer_length) {
        sz_pointer_swap((void **)&longer_length, (void **)&shorter_length);
        sz_pointer_swap((void **)&longer, (void **)&shorter);
    }

    // Skip the matching prefixes and suffixes, they won't affect the distance.
    for (sz_cptr_t a_end = longer + longer_length, b_end = shorter + shorter_length;
         longer != a_end && shorter != b_end && *longer == *shorter;
         ++longer, ++shorter, --longer_length, --shorter_length)
        ;
    for (; longer_length && shorter_length && longer[longer_length - 1] == shorter[shorter_length - 1];
         --longer_length, --shorter_length)
        ;

    // Bounded computations may exit early.
    if (bound) {
        // If one of the strings is empty - the edit distance is equal to the length of the other one.
        if (longer_length == 0) return sz_min_of_two(shorter_length, bound);
        if (shorter_length == 0) return sz_min_of_two(longer_length, bound);
        // If the difference in length is beyond the `bound`, there is no need to check at all.
        if (longer_length - shorter_length > bound) return bound;
    }

    // After trimming, the shorter string may be empty - then the distance is the length of the longer remainder,
    // which is zero if the strings were equal.
    if (shorter_length == 0) return longer_length;
    if (shorter_length == longer_length && !bound)
        return _sz_edit_distance_skewed_diagonals_serial(longer, longer_length, shorter, shorter_length, bound, alloc);
    return _sz_edit_distance_wagner_fisher_serial(longer, longer_length, shorter, shorter_length, bound, sz_false_k,
                                                  alloc);
}
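
/*
 *  Usage example (editorial illustration): with the default allocator,
 *
 *      sz_size_t distance = sz_edit_distance_serial("kitten", 6, "sitting", 7, 0, SZ_NULL);
 *      // distance == 3: substitute 'k' -> 's', substitute 'e' -> 'i', insert the trailing 'g'.
 */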

SZ_PUBLIC sz_ssize_t sz_alignment_score_serial(       //
    sz_cptr_t longer, sz_size_t longer_length,        //
    sz_cptr_t shorter, sz_size_t shorter_length,      //
    sz_error_cost_t const *subs, sz_error_cost_t gap, //
    sz_memory_allocator_t *alloc) {

    // If one of the strings is empty - the score is the cost of gaps over the length of the other one.
    if (longer_length == 0) return (sz_ssize_t)shorter_length * gap;
    if (shorter_length == 0) return (sz_ssize_t)longer_length * gap;

    // Let's make sure that we use the amount of memory proportional to the
    // length of the shorter string, not the longer one.
    if (shorter_length > longer_length) {
        sz_pointer_swap((void **)&longer_length, (void **)&shorter_length);
        sz_pointer_swap((void **)&longer, (void **)&shorter);
    }

    // Simplify usage in higher-level libraries, where wrapping custom allocators may be troublesome.
    sz_memory_allocator_t global_alloc;
    if (!alloc) {
        sz_memory_allocator_init_default(&global_alloc);
        alloc = &global_alloc;
    }

    sz_size_t n = shorter_length + 1;
    sz_size_t buffer_length = sizeof(sz_ssize_t) * n * 2;
    sz_ssize_t *distances = (sz_ssize_t *)alloc->allocate(buffer_length, alloc->handle);
    sz_ssize_t *previous_distances = distances;
    sz_ssize_t *current_distances = previous_distances + n;

    for (sz_size_t idx_shorter = 0; idx_shorter != n; ++idx_shorter)
        previous_distances[idx_shorter] = (sz_ssize_t)idx_shorter * gap;

    sz_u8_t const *shorter_unsigned = (sz_u8_t const *)shorter;
    sz_u8_t const *longer_unsigned = (sz_u8_t const *)longer;
    for (sz_size_t idx_longer = 0; idx_longer != longer_length; ++idx_longer) {
        current_distances[0] = ((sz_ssize_t)idx_longer + 1) * gap;

        // Pick the substitution-cost row for the current character of the longer string.
        sz_error_cost_t const *a_subs = subs + longer_unsigned[idx_longer] * 256ul;
        for (sz_size_t idx_shorter = 0; idx_shorter != shorter_length; ++idx_shorter) {
            sz_ssize_t cost_deletion = previous_distances[idx_shorter + 1] + gap;
            sz_ssize_t cost_insertion = current_distances[idx_shorter] + gap;
            sz_ssize_t cost_substitution = previous_distances[idx_shorter] + a_subs[shorter_unsigned[idx_shorter]];
            current_distances[idx_shorter + 1] = sz_max_of_three(cost_deletion, cost_insertion, cost_substitution);
        }

        // Swap the `previous_distances` and `current_distances` pointers.
        sz_pointer_swap((void **)&previous_distances, (void **)&current_distances);
    }

    // Cache the scalar before the `free` call.
    sz_ssize_t result = previous_distances[shorter_length];
    alloc->free(distances, buffer_length, alloc->handle);
    return result;
}
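
/*
 *  Usage sketch (editorial note): `subs` is a flat 256 x 256 cost matrix, indexed as
 *  `subs[a * 256 + b]`. With zero on the diagonal, -1 for every mismatch, and `gap == -1`,
 *  the maximized score degenerates into the negated Levenshtein distance - e.g. -3 for
 *  "kitten" vs "sitting".
 */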

SZ_PUBLIC sz_size_t sz_hamming_distance_serial( //
    sz_cptr_t a, sz_size_t a_length,            //
    sz_cptr_t b, sz_size_t b_length,            //
    sz_size_t bound) {

    sz_size_t const min_length = sz_min_of_two(a_length, b_length);
    sz_size_t const max_length = sz_max_of_two(a_length, b_length);
    sz_cptr_t const a_end = a + min_length;
    bound = bound == 0 ? max_length : bound;

    // Walk through both strings using SWAR, counting the number of differing characters.
    sz_size_t distance = max_length - min_length;
#if SZ_USE_MISALIGNED_LOADS && !SZ_DETECT_BIG_ENDIAN
    if (min_length >= SZ_SWAR_THRESHOLD) {
        sz_u64_vec_t a_vec, b_vec, match_vec;
        for (; a + 8 <= a_end && distance < bound; a += 8, b += 8) {
            a_vec.u64 = sz_u64_load(a).u64;
            b_vec.u64 = sz_u64_load(b).u64;
            match_vec = _sz_u64_each_byte_equal(a_vec, b_vec);
            distance += sz_u64_popcount((~match_vec.u64) & 0x8080808080808080ull);
        }
    }
#endif

    for (; a != a_end && distance < bound; ++a, ++b) { distance += (*a != *b); }
    return sz_min_of_two(distance, bound);
}
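
/*
 *  For example (editorial note), "karolin" and "kathrin" differ in exactly three positions, so
 *  `sz_hamming_distance_serial("karolin", 7, "kathrin", 7, 0)` returns 3. Length mismatches
 *  are billed upfront: comparing "karol" (5 bytes) against "karolin" (7 bytes) starts the
 *  distance at 2 before any characters are compared.
 */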

SZ_PUBLIC sz_size_t sz_hamming_distance_utf8_serial( //
    sz_cptr_t a, sz_size_t a_length,                 //
    sz_cptr_t b, sz_size_t b_length,                 //
    sz_size_t bound) {

    sz_cptr_t const a_end = a + a_length;
    sz_cptr_t const b_end = b + b_length;
    sz_size_t distance = 0;

    sz_rune_t a_rune, b_rune;
    sz_rune_length_t a_rune_length, b_rune_length;

    if (bound) {
        for (; a < a_end && b < b_end && distance < bound; a += a_rune_length, b += b_rune_length) {
            _sz_extract_utf8_rune(a, &a_rune, &a_rune_length);
            _sz_extract_utf8_rune(b, &b_rune, &b_rune_length);
            distance += (a_rune != b_rune);
        }
        // If one string has more runes, we need to go through the tail.
        if (distance < bound) {
            for (; a < a_end && distance < bound; a += a_rune_length, ++distance)
                _sz_extract_utf8_rune(a, &a_rune, &a_rune_length);
            for (; b < b_end && distance < bound; b += b_rune_length, ++distance)
                _sz_extract_utf8_rune(b, &b_rune, &b_rune_length);
        }
    }
    else {
        for (; a < a_end && b < b_end; a += a_rune_length, b += b_rune_length) {
            _sz_extract_utf8_rune(a, &a_rune, &a_rune_length);
            _sz_extract_utf8_rune(b, &b_rune, &b_rune_length);
            distance += (a_rune != b_rune);
        }
        // If one string has more runes, we need to go through the tail.
        for (; a < a_end; a += a_rune_length, ++distance) _sz_extract_utf8_rune(a, &a_rune, &a_rune_length);
        for (; b < b_end; b += b_rune_length, ++distance) _sz_extract_utf8_rune(b, &b_rune, &b_rune_length);
    }
    return distance;
}

/**
 *  @brief  Largest prime number that fits into 31 bits.
 *  @see    https://mersenneforum.org/showthread.php?t=3471
 */
#define SZ_U32_MAX_PRIME (2147483647u)

/**
 *  @brief  Largest prime number that fits into 64 bits.
 *  @see    https://mersenneforum.org/showthread.php?t=3471
 *
 *  2^64 = 18,446,744,073,709,551,616
 *  this = 18,446,744,073,709,551,557
 *  diff = 59
 */
#define SZ_U64_MAX_PRIME (18446744073709551557ull)

/*
 *  One hardware-accelerated way of mixing hashes can be CRC, but it's only implemented for 32-bit values.
 *  Using a Boost-like mixer works very poorly in that case:
 *
 *      hash_first ^ (hash_second + 0x517cc1b727220a95 + (hash_first << 6) + (hash_first >> 2));
 *
 *  Let's stick to the Fibonacci hash trick using the golden ratio.
 *  https://probablydance.com/2018/06/16/fibonacci-hashing-the-optimization-that-the-world-forgot-or-a-better-alternative-to-integer-modulo/
 */
#define _sz_hash_mix(first, second) ((first * 11400714819323198485ull) ^ (second * 11400714819323198485ull))
#define _sz_shift_low(x) (x)
#define _sz_shift_high(x) ((x + 77ull) & 0xFFull)
#define _sz_prime_mod(x) (x % SZ_U64_MAX_PRIME)

SZ_PUBLIC sz_u64_t sz_hash_serial(sz_cptr_t start, sz_size_t length) {

    sz_u64_t hash_low = 0;
    sz_u64_t hash_high = 0;
    sz_u8_t const *text = (sz_u8_t const *)start;
    sz_u8_t const *text_end = text + length;

    switch (length) {
    case 0: return 0;

    // Texts of up to 7 bytes are definitely below the largest prime.
    case 1:
        hash_low = _sz_shift_low(text[0]);
        hash_high = _sz_shift_high(text[0]);
        break;
    case 2:
        hash_low = _sz_shift_low(text[0]) * 31ull + _sz_shift_low(text[1]);
        hash_high = _sz_shift_high(text[0]) * 257ull + _sz_shift_high(text[1]);
        break;
    case 3:
        hash_low = _sz_shift_low(text[0]) * 31ull * 31ull + //
                   _sz_shift_low(text[1]) * 31ull +         //
                   _sz_shift_low(text[2]);
        hash_high = _sz_shift_high(text[0]) * 257ull * 257ull + //
                    _sz_shift_high(text[1]) * 257ull +          //
                    _sz_shift_high(text[2]);
        break;
    case 4:
        hash_low = _sz_shift_low(text[0]) * 31ull * 31ull * 31ull + //
                   _sz_shift_low(text[1]) * 31ull * 31ull +         //
                   _sz_shift_low(text[2]) * 31ull +                 //
                   _sz_shift_low(text[3]);
        hash_high = _sz_shift_high(text[0]) * 257ull * 257ull * 257ull + //
                    _sz_shift_high(text[1]) * 257ull * 257ull +          //
                    _sz_shift_high(text[2]) * 257ull +                   //
                    _sz_shift_high(text[3]);
        break;
    case 5:
        hash_low = _sz_shift_low(text[0]) * 31ull * 31ull * 31ull * 31ull + //
                   _sz_shift_low(text[1]) * 31ull * 31ull * 31ull +         //
                   _sz_shift_low(text[2]) * 31ull * 31ull +                 //
                   _sz_shift_low(text[3]) * 31ull +                         //
                   _sz_shift_low(text[4]);
        hash_high = _sz_shift_high(text[0]) * 257ull * 257ull * 257ull * 257ull + //
                    _sz_shift_high(text[1]) * 257ull * 257ull * 257ull +          //
                    _sz_shift_high(text[2]) * 257ull * 257ull +                   //
                    _sz_shift_high(text[3]) * 257ull +                            //
                    _sz_shift_high(text[4]);
        break;
    case 6:
        hash_low = _sz_shift_low(text[0]) * 31ull * 31ull * 31ull * 31ull * 31ull + //
                   _sz_shift_low(text[1]) * 31ull * 31ull * 31ull * 31ull +         //
                   _sz_shift_low(text[2]) * 31ull * 31ull * 31ull +                 //
                   _sz_shift_low(text[3]) * 31ull * 31ull +                         //
                   _sz_shift_low(text[4]) * 31ull +                                 //
                   _sz_shift_low(text[5]);
        hash_high = _sz_shift_high(text[0]) * 257ull * 257ull * 257ull * 257ull * 257ull + //
                    _sz_shift_high(text[1]) * 257ull * 257ull * 257ull * 257ull +          //
                    _sz_shift_high(text[2]) * 257ull * 257ull * 257ull +                   //
                    _sz_shift_high(text[3]) * 257ull * 257ull +                            //
                    _sz_shift_high(text[4]) * 257ull +                                     //
                    _sz_shift_high(text[5]);
        break;
    case 7:
        hash_low = _sz_shift_low(text[0]) * 31ull * 31ull * 31ull * 31ull * 31ull * 31ull + //
                   _sz_shift_low(text[1]) * 31ull * 31ull * 31ull * 31ull * 31ull +         //
                   _sz_shift_low(text[2]) * 31ull * 31ull * 31ull * 31ull +                 //
                   _sz_shift_low(text[3]) * 31ull * 31ull * 31ull +                         //
                   _sz_shift_low(text[4]) * 31ull * 31ull +                                 //
                   _sz_shift_low(text[5]) * 31ull +                                         //
                   _sz_shift_low(text[6]);
        hash_high = _sz_shift_high(text[0]) * 257ull * 257ull * 257ull * 257ull * 257ull * 257ull + //
                    _sz_shift_high(text[1]) * 257ull * 257ull * 257ull * 257ull * 257ull +          //
                    _sz_shift_high(text[2]) * 257ull * 257ull * 257ull * 257ull +                   //
                    _sz_shift_high(text[3]) * 257ull * 257ull * 257ull +                            //
                    _sz_shift_high(text[4]) * 257ull * 257ull +                                     //
                    _sz_shift_high(text[5]) * 257ull +                                              //
                    _sz_shift_high(text[6]);
        break;
    default:
        // Unroll the first seven cycles:
        hash_low = hash_low * 31ull + _sz_shift_low(text[0]);
        hash_high = hash_high * 257ull + _sz_shift_high(text[0]);
        hash_low = hash_low * 31ull + _sz_shift_low(text[1]);
        hash_high = hash_high * 257ull + _sz_shift_high(text[1]);
        hash_low = hash_low * 31ull + _sz_shift_low(text[2]);
        hash_high = hash_high * 257ull + _sz_shift_high(text[2]);
        hash_low = hash_low * 31ull + _sz_shift_low(text[3]);
        hash_high = hash_high * 257ull + _sz_shift_high(text[3]);
        hash_low = hash_low * 31ull + _sz_shift_low(text[4]);
        hash_high = hash_high * 257ull + _sz_shift_high(text[4]);
        hash_low = hash_low * 31ull + _sz_shift_low(text[5]);
        hash_high = hash_high * 257ull + _sz_shift_high(text[5]);
        hash_low = hash_low * 31ull + _sz_shift_low(text[6]);
        hash_high = hash_high * 257ull + _sz_shift_high(text[6]);
        text += 7;

        // Iterate through the rest with the modulus:
        for (; text != text_end; ++text) {
            hash_low = hash_low * 31ull + _sz_shift_low(text[0]);
            hash_high = hash_high * 257ull + _sz_shift_high(text[0]);
            // Wrap the hashes around:
            hash_low = _sz_prime_mod(hash_low);
            hash_high = _sz_prime_mod(hash_high);
        }
        break;
    }

    return _sz_hash_mix(hash_low, hash_high);
}
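
/*
 *  A quick arithmetic check of the short-string path (editorial example): for the 2-byte
 *  string "hi", `hash_low == 'h' * 31 + 'i' == 104 * 31 + 105 == 3329`, while
 *  `hash_high == ((104 + 77) & 0xFF) * 257 + ((105 + 77) & 0xFF) == 181 * 257 + 182 == 46699`;
 *  the two halves are then combined with the Fibonacci multiplier in `_sz_hash_mix`.
 */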

SZ_PUBLIC void sz_hashes_serial(sz_cptr_t start, sz_size_t length, sz_size_t window_length, sz_size_t step, //
                                sz_hash_callback_t callback, void *callback_handle) {

    if (length < window_length || !window_length) return;
    sz_u8_t const *text = (sz_u8_t const *)start;
    sz_u8_t const *text_end = text + length;

    // Prepare the `prime ^ window_length` values, that we are going to use for modulo arithmetic.
    sz_u64_t prime_power_low = 1, prime_power_high = 1;
    for (sz_size_t i = 0; i + 1 < window_length; ++i)
        prime_power_low = (prime_power_low * 31ull) % SZ_U64_MAX_PRIME,
        prime_power_high = (prime_power_high * 257ull) % SZ_U64_MAX_PRIME;

    // Compute the initial hash value for the first window.
    sz_u64_t hash_low = 0, hash_high = 0, hash_mix;
    for (sz_u8_t const *first_end = text + window_length; text < first_end; ++text)
        hash_low = (hash_low * 31ull + _sz_shift_low(*text)) % SZ_U64_MAX_PRIME,
        hash_high = (hash_high * 257ull + _sz_shift_high(*text)) % SZ_U64_MAX_PRIME;

    // Mix the low and high halves and export the first fingerprint slice.
    hash_mix = _sz_hash_mix(hash_low, hash_high);
    callback((sz_cptr_t)text, window_length, hash_mix, callback_handle);

    // Compute the hash value for every window, exporting into the fingerprint,
    // using the expensive modulo operation. Note that `step` participates as a
    // bit-mask below, so in most cases it's expected to be a power of two.
    sz_size_t cycles = 1;
    sz_size_t const step_mask = step - 1;
    for (; text < text_end; ++text, ++cycles) {
        // Discard one character:
        hash_low -= _sz_shift_low(*(text - window_length)) * prime_power_low;
        hash_high -= _sz_shift_high(*(text - window_length)) * prime_power_high;
        // And add a new one:
        hash_low = 31ull * hash_low + _sz_shift_low(*text);
        hash_high = 257ull * hash_high + _sz_shift_high(*text);
        // Wrap the hashes around:
        hash_low = _sz_prime_mod(hash_low);
        hash_high = _sz_prime_mod(hash_high);
        // Mix only if we've skipped enough hashes.
        if ((cycles & step_mask) == 0) {
            hash_mix = _sz_hash_mix(hash_low, hash_high);
            callback((sz_cptr_t)text, window_length, hash_mix, callback_handle);
        }
    }
}
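
/*
 *  Usage sketch (editorial illustration; `accumulate_hash` is a hypothetical callback matching
 *  the `sz_hash_callback_t` signature used above):
 *
 *      static void accumulate_hash(sz_cptr_t start, sz_size_t length, sz_u64_t hash, void *handle) {
 *          sz_u64_t *sum = (sz_u64_t *)handle;
 *          (void)start, (void)length;
 *          *sum ^= hash; // fold every exported window hash into one fingerprint word
 *      }
 *      ...
 *      sz_u64_t fingerprint = 0;
 *      sz_hashes_serial(text, text_length, 7, 4, &accumulate_hash, &fingerprint);
 */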

#undef _sz_shift_low
#undef _sz_shift_high
#undef _sz_hash_mix
#undef _sz_prime_mod

/**
 *  @brief  Uses a small lookup-table to convert an uppercase character to lowercase.
 */
SZ_INTERNAL sz_u8_t sz_u8_tolower(sz_u8_t c) {
    static sz_u8_t const lowered[256] = {
        0,   1,   2,   3,   4,   5,   6,   7,   8,   9,   10,  11,  12,  13,  14,  15,  //
        16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,  27,  28,  29,  30,  31,  //
        32,  33,  34,  35,  36,  37,  38,  39,  40,  41,  42,  43,  44,  45,  46,  47,  //
        48,  49,  50,  51,  52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  //
        64,  97,  98,  99,  100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, //
        112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 91,  92,  93,  94,  95,  //
        96,  97,  98,  99,  100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, //
        112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, //
        128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, //
        144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, //
        160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, //
        176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, //
        224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, //
        240, 241, 242, 243, 244, 245, 246, 215, 248, 249, 250, 251, 252, 253, 254, 223, //
        224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, //
        240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, //
    };
    return lowered[c];
}

/**
 *  @brief  Uses a small lookup-table to convert a lowercase character to uppercase.
 */
SZ_INTERNAL sz_u8_t sz_u8_toupper(sz_u8_t c) {
    static sz_u8_t const upped[256] = {
        0,   1,   2,   3,   4,   5,   6,   7,   8,   9,   10,  11,  12,  13,  14,  15,  //
        16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,  27,  28,  29,  30,  31,  //
        32,  33,  34,  35,  36,  37,  38,  39,  40,  41,  42,  43,  44,  45,  46,  47,  //
        48,  49,  50,  51,  52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  //
        64,  65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,  79,  //
        80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,  92,  93,  94,  95,  //
        96,  65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,  79,  //
        80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  123, 124, 125, 126, 127, //
        128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, //
        144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, //
        160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, //
        176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, //
        192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, //
        208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, //
        192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, //
        208, 209, 210, 211, 212, 213, 214, 247, 216, 217, 218, 219, 220, 221, 222, 255, //
    };
    return upped[c];
}

/**
 *  @brief  Uses two small lookup tables (768 bytes total) to accelerate division by a small
 *          unsigned integer. Performs two lookups, one multiplication, two shifts, and two accumulations.
 *
 *  @param  divisor Integral value @b larger than one.
 *  @param  number  Integral value to divide.
 */
SZ_INTERNAL sz_u8_t sz_u8_divide(sz_u8_t number, sz_u8_t divisor) {
    sz_assert(divisor > 1);
    static sz_u16_t const multipliers[256] = {
        0,     0,     0,     21846, 0,     39322, 21846, 9363,  0,     50973, 39322, 29790, 21846, 15124, 9363,  4370,
        0,     57826, 50973, 44841, 39322, 34329, 29790, 25645, 21846, 18351, 15124, 12137, 9363,  6780,  4370,  2115,
        0,     61565, 57826, 54302, 50973, 47824, 44841, 42011, 39322, 36765, 34329, 32006, 29790, 27671, 25645, 23705,
        21846, 20063, 18351, 16706, 15124, 13602, 12137, 10725, 9363,  8049,  6780,  5554,  4370,  3224,  2115,  1041,
        0,     63520, 61565, 59668, 57826, 56039, 54302, 52614, 50973, 49377, 47824, 46313, 44841, 43407, 42011, 40649,
        39322, 38028, 36765, 35532, 34329, 33154, 32006, 30885, 29790, 28719, 27671, 26647, 25645, 24665, 23705, 22766,
        21846, 20945, 20063, 19198, 18351, 17520, 16706, 15907, 15124, 14356, 13602, 12863, 12137, 11424, 10725, 10038,
        9363,  8700,  8049,  7409,  6780,  6162,  5554,  4957,  4370,  3792,  3224,  2665,  2115,  1573,  1041,  517,
        0,     64520, 63520, 62535, 61565, 60609, 59668, 58740, 57826, 56926, 56039, 55164, 54302, 53452, 52614, 51788,
        50973, 50169, 49377, 48595, 47824, 47063, 46313, 45572, 44841, 44120, 43407, 42705, 42011, 41326, 40649, 39982,
        39322, 38671, 38028, 37392, 36765, 36145, 35532, 34927, 34329, 33738, 33154, 32577, 32006, 31443, 30885, 30334,
        29790, 29251, 28719, 28192, 27671, 27156, 26647, 26143, 25645, 25152, 24665, 24182, 23705, 23233, 22766, 22303,
        21846, 21393, 20945, 20502, 20063, 19628, 19198, 18772, 18351, 17933, 17520, 17111, 16706, 16305, 15907, 15514,
        15124, 14738, 14356, 13977, 13602, 13231, 12863, 12498, 12137, 11779, 11424, 11073, 10725, 10380, 10038, 9699,
        9363,  9030,  8700,  8373,  8049,  7727,  7409,  7093,  6780,  6470,  6162,  5857,  5554,  5254,  4957,  4662,
        4370,  4080,  3792,  3507,  3224,  2943,  2665,  2388,  2115,  1843,  1573,  1306,  1041,  778,   517,   258,
    };
    // This table can be avoided using a single addition and counting trailing zeros.
    static sz_u8_t const shifts[256] = {
        0, 0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, //
        4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, //
        5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, //
        6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, //
        6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, //
        7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, //
        7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, //
        7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, //
    };
    sz_u32_t multiplier = multipliers[divisor];
    sz_u8_t shift = shifts[divisor];

    sz_u16_t q = (sz_u16_t)((multiplier * number) >> 16);
    sz_u16_t t = ((number - q) >> 1) + q;
    return (sz_u8_t)(t >> shift);
}
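
/*
 *  A worked example of the trick above (editorial): dividing 200 by 3 uses
 *  `multipliers[3] == 21846` and `shifts[3] == 1`:
 *      q = (21846 * 200) >> 16 == 66
 *      t = ((200 - 66) >> 1) + 66 == 133
 *      t >> 1 == 66, matching 200 / 3.
 */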

SZ_PUBLIC void sz_tolower_serial(sz_cptr_t text, sz_size_t length, sz_ptr_t result) {
    sz_u8_t *unsigned_result = (sz_u8_t *)result;
    sz_u8_t const *unsigned_text = (sz_u8_t const *)text;
    sz_u8_t const *end = unsigned_text + length;
    for (; unsigned_text != end; ++unsigned_text, ++unsigned_result) *unsigned_result = sz_u8_tolower(*unsigned_text);
}

SZ_PUBLIC void sz_toupper_serial(sz_cptr_t text, sz_size_t length, sz_ptr_t result) {
    sz_u8_t *unsigned_result = (sz_u8_t *)result;
    sz_u8_t const *unsigned_text = (sz_u8_t const *)text;
    sz_u8_t const *end = unsigned_text + length;
    for (; unsigned_text != end; ++unsigned_text, ++unsigned_result) *unsigned_result = sz_u8_toupper(*unsigned_text);
}

SZ_PUBLIC void sz_toascii_serial(sz_cptr_t text, sz_size_t length, sz_ptr_t result) {
    sz_u8_t *unsigned_result = (sz_u8_t *)result;
    sz_u8_t const *unsigned_text = (sz_u8_t const *)text;
    sz_u8_t const *end = unsigned_text + length;
    for (; unsigned_text != end; ++unsigned_text, ++unsigned_result) *unsigned_result = *unsigned_text & 0x7F;
}

/**
 *  @brief  Check if there is a byte in this buffer that exceeds 127 and can't be an ASCII character.
 *          This implementation uses hardware-agnostic SWAR technique, to process 8 characters at a time.
 */
SZ_PUBLIC sz_bool_t sz_isascii_serial(sz_cptr_t text, sz_size_t length) {

    if (!length) return sz_true_k;
    sz_u8_t const *h = (sz_u8_t const *)text;
    sz_u8_t const *const h_end = h + length;

#if !SZ_USE_MISALIGNED_LOADS
    // Process the misaligned head, to avoid UB on unaligned 64-bit loads.
    for (; ((sz_size_t)h & 7ull) && h < h_end; ++h)
        if (*h & 0x80ull) return sz_false_k;
#endif

    // Validate eight bytes at once using SWAR.
    sz_u64_vec_t text_vec;
    for (; h + 8 <= h_end; h += 8) {
        text_vec.u64 = *(sz_u64_t const *)h;
        if (text_vec.u64 & 0x8080808080808080ull) return sz_false_k;
    }

    // Handle the misaligned tail.
    for (; h < h_end; ++h)
        if (*h & 0x80ull) return sz_false_k;
    return sz_true_k;
}

SZ_PUBLIC void sz_generate_serial(sz_cptr_t alphabet, sz_size_t alphabet_size, sz_ptr_t result, sz_size_t result_length,
                                  sz_random_generator_t generator, void *generator_user_data) {

    sz_assert(alphabet_size > 0 && alphabet_size <= 256 && "Inadequate alphabet size");

    if (alphabet_size == 1) sz_fill(result, result_length, *alphabet);
    else {
        sz_assert(generator && "Expects a valid random generator");
        sz_u8_t divisor = (sz_u8_t)alphabet_size;
        for (sz_cptr_t end = result + result_length; result != end; ++result) {
            sz_u8_t random = generator(generator_user_data) & 0xFF;
            sz_u8_t quotient = sz_u8_divide(random, divisor);
            *result = alphabet[random - quotient * divisor];
        }
    }
}
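
/*
 *  Usage sketch (editorial illustration; `xorshift64` is a stand-in for any user-supplied PRNG
 *  matching `sz_random_generator_t`):
 *
 *      static sz_u64_t xorshift64(void *state) {
 *          sz_u64_t *x = (sz_u64_t *)state;
 *          *x ^= *x << 13, *x ^= *x >> 7, *x ^= *x << 17;
 *          return *x;
 *      }
 *      ...
 *      char buffer[16];
 *      sz_u64_t seed = 42;
 *      sz_generate_serial("ACGT", 4, buffer, sizeof(buffer), &xorshift64, &seed);
 */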

#pragma endregion

/*
 *  Serial implementation of string class operations.
 */
#pragma region Serial Implementation for the String Class

SZ_PUBLIC sz_bool_t sz_string_is_on_stack(sz_string_t const *string) {
    // It doesn't matter if it's on the stack or on the heap, the pointer location is the same.
    return (sz_bool_t)((sz_cptr_t)string->internal.start == (sz_cptr_t)&string->internal.chars[0]);
}

SZ_PUBLIC void sz_string_range(sz_string_t const *string, sz_ptr_t *start, sz_size_t *length) {
    sz_size_t is_small = (sz_cptr_t)string->internal.start == (sz_cptr_t)&string->internal.chars[0];
    sz_size_t is_big_mask = is_small - 1ull;
    *start = string->external.start; // It doesn't matter if it's on the stack or on the heap, the pointer location is the same.
    // If the string is small, use a branch-less approach to mask-out the top 7 bytes of the length.
    *length = string->external.length & (0x00000000000000FFull | is_big_mask);
}
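
/*
 *  To unpack the branch-less masking above (editorial note): for a small, on-stack string
 *  `is_small == 1`, so `is_big_mask == 0` and the length is truncated to its low byte; for a
 *  heap-allocated string `is_small == 0`, `is_big_mask` wraps around to all ones, and the full
 *  64-bit `external.length` passes through untouched.
 */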
  2750. SZ_PUBLIC void sz_string_unpack(sz_string_t const *string, sz_ptr_t *start, sz_size_t *length, sz_size_t *space,
  2751. sz_bool_t *is_external) {
  2752. sz_size_t is_small = (sz_cptr_t)string->internal.start == (sz_cptr_t)&string->internal.chars[0];
  2753. sz_size_t is_big_mask = is_small - 1ull;
  2754. *start = string->external.start; // It doesn't matter if it's on stack or heap, the pointer location is the same.
2755. // If the string is small, use a branch-less approach to mask out the top 7 bytes of the length.
2756. *length = string->external.length & (0x00000000000000FFull | is_big_mask);
2757. // In case the string is big, the `is_small - 1ull` underflows to 0xFFFFFFFFFFFFFFFFull, selecting `external.space`.
  2758. *space = sz_u64_blend(SZ_STRING_INTERNAL_SPACE, string->external.space, is_big_mask);
  2759. *is_external = (sz_bool_t)!is_small;
  2760. }
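/*
 * A worked example of the branch-less unpacking above, for illustration:
 * for an on-stack string `is_small == 1`, so `is_big_mask == 0` and the length mask
 * collapses to 0xFF, keeping only the low length byte; for a heap-allocated string
 * `is_small == 0`, so `is_big_mask == 0xFFFFFFFFFFFFFFFF` and the full 64-bit
 * `external.length` and `external.space` pass through unchanged.
 */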
  2761. SZ_PUBLIC sz_bool_t sz_string_equal(sz_string_t const *a, sz_string_t const *b) {
2762. // Tempting to say that the external.length is bitwise the same even if it includes
2763. // some bytes of the on-stack payload, but we don't maintain that invariant at this writing.
2764. // An on-stack string includes noise bytes in the high-order bits of external.length, so do this
2765. // the hard/correct way.
  2766. #if SZ_USE_MISALIGNED_LOADS
  2767. // Dealing with StringZilla strings, we know that the `start` pointer always points
  2768. // to a word at least 8 bytes long. Therefore, we can compare the first 8 bytes at once.
  2769. #endif
  2770. // Alternatively, fall back to byte-by-byte comparison.
  2771. sz_ptr_t a_start, b_start;
  2772. sz_size_t a_length, b_length;
  2773. sz_string_range(a, &a_start, &a_length);
  2774. sz_string_range(b, &b_start, &b_length);
  2775. return (sz_bool_t)(a_length == b_length && sz_equal(a_start, b_start, b_length));
  2776. }
  2777. SZ_PUBLIC sz_ordering_t sz_string_order(sz_string_t const *a, sz_string_t const *b) {
  2778. #if SZ_USE_MISALIGNED_LOADS
  2779. // Dealing with StringZilla strings, we know that the `start` pointer always points
  2780. // to a word at least 8 bytes long. Therefore, we can compare the first 8 bytes at once.
  2781. #endif
  2782. // Alternatively, fall back to byte-by-byte comparison.
  2783. sz_ptr_t a_start, b_start;
  2784. sz_size_t a_length, b_length;
  2785. sz_string_range(a, &a_start, &a_length);
  2786. sz_string_range(b, &b_start, &b_length);
  2787. return sz_order(a_start, a_length, b_start, b_length);
  2788. }
  2789. SZ_PUBLIC void sz_string_init(sz_string_t *string) {
  2790. sz_assert(string && "String can't be SZ_NULL.");
2791. // Only 8 (pointer) + 1 (length) + 1 (first character) bytes need to be initialized.
  2792. string->internal.start = &string->internal.chars[0];
  2793. // But for safety let's initialize the entire structure to zeros.
  2794. // string->internal.chars[0] = 0;
  2795. // string->internal.length = 0;
  2796. string->words[1] = 0;
  2797. string->words[2] = 0;
  2798. string->words[3] = 0;
  2799. }
  2800. SZ_PUBLIC sz_ptr_t sz_string_init_length(sz_string_t *string, sz_size_t length, sz_memory_allocator_t *allocator) {
  2801. sz_size_t space_needed = length + 1; // space for trailing \0
  2802. sz_assert(string && allocator && "String and allocator can't be SZ_NULL.");
  2803. // Initialize the string to zeros for safety.
  2804. string->words[1] = 0;
  2805. string->words[2] = 0;
  2806. string->words[3] = 0;
  2807. // If we are lucky, no memory allocations will be needed.
  2808. if (space_needed <= SZ_STRING_INTERNAL_SPACE) {
  2809. string->internal.start = &string->internal.chars[0];
  2810. string->internal.length = (sz_u8_t)length;
  2811. }
  2812. else {
  2813. // If we are not lucky, we need to allocate memory.
  2814. string->external.start = (sz_ptr_t)allocator->allocate(space_needed, allocator->handle);
  2815. if (!string->external.start) return SZ_NULL_CHAR;
  2816. string->external.length = length;
  2817. string->external.space = space_needed;
  2818. }
  2819. sz_assert(&string->internal.start == &string->external.start && "Alignment confusion");
  2820. string->external.start[length] = 0;
  2821. return string->external.start;
  2822. }
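/*
 * A minimal lifecycle sketch, assuming the default allocator initialized with
 * `sz_memory_allocator_init_default`; illustration only.
 */
// sz_string_t str;
// sz_memory_allocator_t alloc;
// sz_memory_allocator_init_default(&alloc);
// sz_ptr_t contents = sz_string_init_length(&str, 5, &alloc); // 5 + 1 bytes fit on the stack
// if (contents) sz_copy(contents, "hello", 5);
// sz_string_free(&str, &alloc);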
  2823. SZ_PUBLIC sz_ptr_t sz_string_reserve(sz_string_t *string, sz_size_t new_capacity, sz_memory_allocator_t *allocator) {
  2824. sz_assert(string && "String can't be SZ_NULL.");
  2825. sz_size_t new_space = new_capacity + 1;
  2826. if (new_space <= SZ_STRING_INTERNAL_SPACE) return string->external.start;
  2827. sz_ptr_t string_start;
  2828. sz_size_t string_length;
  2829. sz_size_t string_space;
  2830. sz_bool_t string_is_external;
  2831. sz_string_unpack(string, &string_start, &string_length, &string_space, &string_is_external);
  2832. sz_assert(new_space > string_space && "New space must be larger than current.");
  2833. sz_ptr_t new_start = (sz_ptr_t)allocator->allocate(new_space, allocator->handle);
  2834. if (!new_start) return SZ_NULL_CHAR;
  2835. sz_copy(new_start, string_start, string_length);
  2836. string->external.start = new_start;
  2837. string->external.space = new_space;
  2838. string->external.padding = 0;
  2839. string->external.length = string_length;
  2840. // Deallocate the old string.
  2841. if (string_is_external) allocator->free(string_start, string_space, allocator->handle);
  2842. return string->external.start;
  2843. }
  2844. SZ_PUBLIC sz_ptr_t sz_string_expand(sz_string_t *string, sz_size_t offset, sz_size_t added_length,
  2845. sz_memory_allocator_t *allocator) {
  2846. sz_assert(string && allocator && "String and allocator can't be SZ_NULL.");
  2847. sz_ptr_t string_start;
  2848. sz_size_t string_length;
  2849. sz_size_t string_space;
  2850. sz_bool_t string_is_external;
  2851. sz_string_unpack(string, &string_start, &string_length, &string_space, &string_is_external);
2852. // The user intended to extend the string, so clamp the `offset` to the current length.
  2853. offset = sz_min_of_two(offset, string_length);
  2854. // If we are lucky, no memory allocations will be needed.
  2855. if (offset + string_length + added_length < string_space) {
  2856. sz_move(string_start + offset + added_length, string_start + offset, string_length - offset);
  2857. string_start[string_length + added_length] = 0;
  2858. // Even if the string is on the stack, the `+=` won't affect the tail of the string.
  2859. string->external.length += added_length;
  2860. }
  2861. // If we are not lucky, we need to allocate more memory.
  2862. else {
  2863. sz_size_t next_planned_size = sz_max_of_two(SZ_CACHE_LINE_WIDTH, string_space * 2ull);
  2864. sz_size_t min_needed_space = sz_size_bit_ceil(offset + string_length + added_length + 1);
  2865. sz_size_t new_space = sz_max_of_two(min_needed_space, next_planned_size);
  2866. string_start = sz_string_reserve(string, new_space - 1, allocator);
  2867. if (!string_start) return SZ_NULL_CHAR;
  2868. // Copy into the new buffer.
  2869. sz_move(string_start + offset + added_length, string_start + offset, string_length - offset);
  2870. string_start[string_length + added_length] = 0;
  2871. string->external.length = string_length + added_length;
  2872. }
  2873. return string_start;
  2874. }
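/*
 * An illustrative append sketch, continuing the example above: passing an `offset`
 * past the end makes `sz_string_expand` grow the tail, and the caller then fills
 * the newly reserved bytes.
 */
// sz_ptr_t grown = sz_string_expand(&str, SZ_SIZE_MAX, 7, &alloc); // reserve 7 more bytes at the end
// if (grown) sz_copy(grown + 5, ", world", 7);                     // "hello" -> "hello, world"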
  2875. SZ_PUBLIC sz_size_t sz_string_erase(sz_string_t *string, sz_size_t offset, sz_size_t length) {
  2876. sz_assert(string && "String can't be SZ_NULL.");
  2877. sz_ptr_t string_start;
  2878. sz_size_t string_length;
  2879. sz_size_t string_space;
  2880. sz_bool_t string_is_external;
  2881. sz_string_unpack(string, &string_start, &string_length, &string_space, &string_is_external);
  2882. // Normalize the offset, it can't be larger than the length.
  2883. offset = sz_min_of_two(offset, string_length);
2884. // We shouldn't clamp by checking `offset + length >= string_length`, as that sum can overflow
2885. // when receiving `length == SZ_SIZE_MAX`. After the following expression the `length` will contain
2886. // exactly the delta between the original and final length of this `string`.
  2887. length = sz_min_of_two(length, string_length - offset);
2888. // There are 2 common cases that wouldn't even require a `memmove`:
2889. // 1. Erasing the entire contents of the string.
2890. // In that case the `length` argument will be equal to or greater than the `length` member.
  2891. // 2. Removing the tail of the string with something like `string.pop_back()` in C++.
  2892. //
  2893. // In both of those, regardless of the location of the string - stack or heap,
  2894. // the erasing is as easy as setting the length to the offset.
  2895. // In every other case, we must `memmove` the tail of the string to the left.
  2896. if (offset + length < string_length)
  2897. sz_move(string_start + offset, string_start + offset + length, string_length - offset - length);
2898. // The `string->external.length = offset` assignment would discard the last characters
2899. // of the on-the-stack string, but an in-place subtraction works.
  2900. string->external.length -= length;
  2901. string_start[string_length - length] = 0;
  2902. return length;
  2903. }
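/*
 * An illustrative erase sketch, continuing the example above: removing two bytes
 * from the middle shifts the tail left and returns the number of erased bytes.
 */
// sz_size_t erased = sz_string_erase(&str, 5, 2); // "hello, world" -> "helloworld", erased == 2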
  2904. SZ_PUBLIC void sz_string_free(sz_string_t *string, sz_memory_allocator_t *allocator) {
  2905. if (!sz_string_is_on_stack(string))
  2906. allocator->free(string->external.start, string->external.space, allocator->handle);
  2907. sz_string_init(string);
  2908. }
  2909. SZ_PUBLIC void sz_fill_serial(sz_ptr_t target, sz_size_t length, sz_u8_t value) {
  2910. sz_ptr_t end = target + length;
  2911. // Dealing with short strings, a single sequential pass would be faster.
  2912. // If the size is larger than 2 words, then at least 1 of them will be aligned.
  2913. // But just one aligned word may not be worth SWAR.
  2914. if (length < SZ_SWAR_THRESHOLD)
  2915. while (target != end) *(target++) = value;
  2916. // In case of long strings, skip unaligned bytes, and then fill the rest in 64-bit chunks.
  2917. else {
  2918. sz_u64_t value64 = (sz_u64_t)(value) * 0x0101010101010101ull;
  2919. while ((sz_size_t)target & 7ull) *(target++) = value;
  2920. while (target + 8 <= end) *(sz_u64_t *)target = value64, target += 8;
  2921. while (target != end) *(target++) = value;
  2922. }
  2923. }
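/*
 * A worked example of the broadcast above, for illustration: multiplying a byte by
 * 0x0101010101010101 replicates it into all eight lanes, e.g. for `value == 0xAB`:
 * 0xAB * 0x0101010101010101 == 0xABABABABABABABAB,
 * so every aligned 8-byte store fills eight characters at once.
 */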
  2924. SZ_PUBLIC void sz_copy_serial(sz_ptr_t target, sz_cptr_t source, sz_size_t length) {
  2925. #if SZ_USE_MISALIGNED_LOADS
  2926. while (length >= 8) *(sz_u64_t *)target = *(sz_u64_t const *)source, target += 8, source += 8, length -= 8;
  2927. #endif
  2928. while (length--) *(target++) = *(source++);
  2929. }
  2930. SZ_PUBLIC void sz_move_serial(sz_ptr_t target, sz_cptr_t source, sz_size_t length) {
2931. // Implementing `memmove` is trickier than `memcpy`, as the ranges may overlap.
  2932. // Existing implementations often have two passes, in normal and reversed order,
  2933. // depending on the relation of `target` and `source` addresses.
  2934. // https://student.cs.uwaterloo.ca/~cs350/common/os161-src-html/doxygen/html/memmove_8c_source.html
  2935. // https://marmota.medium.com/c-language-making-memmove-def8792bb8d5
  2936. //
2937. // We can use the `memcpy`-like left-to-right pass if we know that the `target` is before the `source`.
2938. // Or if we know that they don't intersect! In that case the traversal order is irrelevant,
2939. // but older CPUs may predict and fetch forward passes better.
  2940. if (target < source || target >= source + length) {
  2941. #if SZ_USE_MISALIGNED_LOADS
  2942. while (length >= 8) *(sz_u64_t *)target = *(sz_u64_t const *)(source), target += 8, source += 8, length -= 8;
  2943. #endif
  2944. while (length--) *(target++) = *(source++);
  2945. }
  2946. else {
  2947. // Jump to the end and walk backwards.
  2948. target += length, source += length;
  2949. #if SZ_USE_MISALIGNED_LOADS
  2950. while (length >= 8) *(sz_u64_t *)(target -= 8) = *(sz_u64_t const *)(source -= 8), length -= 8;
  2951. #endif
  2952. while (length--) *(--target) = *(--source);
  2953. }
  2954. }
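/*
 * An illustrative overlap case: with `target == source + 2` the ranges intersect,
 * so only the backward pass is safe.
 */
// char buf[9] = "abcdef";
// sz_move_serial(buf + 2, buf, 6); // buf now holds "ababcdef"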
  2955. #pragma endregion
  2956. /*
2957. * @brief Serial implementation for string sequence processing.
  2958. */
  2959. #pragma region Serial Implementation for Sequences
  2960. SZ_PUBLIC sz_size_t sz_partition(sz_sequence_t *sequence, sz_sequence_predicate_t predicate) {
  2961. sz_size_t matches = 0;
  2962. while (matches != sequence->count && predicate(sequence, sequence->order[matches])) ++matches;
  2963. for (sz_size_t i = matches + 1; i < sequence->count; ++i)
  2964. if (predicate(sequence, sequence->order[i]))
  2965. sz_u64_swap(sequence->order + i, sequence->order + matches), ++matches;
  2966. return matches;
  2967. }
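/*
 * An illustrative predicate sketch, assuming the library's callback signature;
 * the `_starts_with_digit` helper below is hypothetical glue, not part of the library.
 */
// static sz_bool_t _starts_with_digit(sz_sequence_t *seq, sz_size_t idx) {
//     sz_cptr_t str = seq->get_start(seq, idx);
//     return (sz_bool_t)(seq->get_length(seq, idx) && str[0] >= '0' && str[0] <= '9');
// }
// ...
// sz_size_t digits = sz_partition(&sequence, _starts_with_digit); // digit-first strings come first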
  2968. SZ_PUBLIC void sz_merge(sz_sequence_t *sequence, sz_size_t partition, sz_sequence_comparator_t less) {
  2969. sz_size_t start_b = partition + 1;
  2970. // If the direct merge is already sorted
  2971. if (!less(sequence, sequence->order[start_b], sequence->order[partition])) return;
  2972. sz_size_t start_a = 0;
  2973. while (start_a <= partition && start_b <= sequence->count) {
2974. // If element 1 is in the right place
  2975. if (!less(sequence, sequence->order[start_b], sequence->order[start_a])) { start_a++; }
  2976. else {
  2977. sz_size_t value = sequence->order[start_b];
  2978. sz_size_t index = start_b;
2979. // Shift all the elements between element 1
2980. // and element 2 right by 1.
  2981. while (index != start_a) { sequence->order[index] = sequence->order[index - 1], index--; }
  2982. sequence->order[start_a] = value;
  2983. // Update all the pointers
  2984. start_a++;
  2985. partition++;
  2986. start_b++;
  2987. }
  2988. }
  2989. }
  2990. SZ_PUBLIC void sz_sort_insertion(sz_sequence_t *sequence, sz_sequence_comparator_t less) {
  2991. sz_u64_t *keys = sequence->order;
  2992. sz_size_t keys_count = sequence->count;
  2993. for (sz_size_t i = 1; i < keys_count; i++) {
  2994. sz_u64_t i_key = keys[i];
  2995. sz_size_t j = i;
  2996. for (; j > 0 && less(sequence, i_key, keys[j - 1]); --j) keys[j] = keys[j - 1];
  2997. keys[j] = i_key;
  2998. }
  2999. }
  3000. SZ_INTERNAL void _sz_sift_down(sz_sequence_t *sequence, sz_sequence_comparator_t less, sz_u64_t *order, sz_size_t start,
  3001. sz_size_t end) {
  3002. sz_size_t root = start;
  3003. while (2 * root + 1 <= end) {
  3004. sz_size_t child = 2 * root + 1;
  3005. if (child + 1 <= end && less(sequence, order[child], order[child + 1])) { child++; }
  3006. if (!less(sequence, order[root], order[child])) { return; }
  3007. sz_u64_swap(order + root, order + child);
  3008. root = child;
  3009. }
  3010. }
  3011. SZ_INTERNAL void _sz_heapify(sz_sequence_t *sequence, sz_sequence_comparator_t less, sz_u64_t *order, sz_size_t count) {
  3012. sz_size_t start = (count - 2) / 2;
  3013. while (1) {
  3014. _sz_sift_down(sequence, less, order, start, count - 1);
  3015. if (start == 0) return;
  3016. start--;
  3017. }
  3018. }
  3019. SZ_INTERNAL void _sz_heapsort(sz_sequence_t *sequence, sz_sequence_comparator_t less, sz_size_t first, sz_size_t last) {
  3020. sz_u64_t *order = sequence->order;
  3021. sz_size_t count = last - first;
  3022. _sz_heapify(sequence, less, order + first, count);
  3023. sz_size_t end = count - 1;
  3024. while (end > 0) {
  3025. sz_u64_swap(order + first, order + first + end);
  3026. end--;
  3027. _sz_sift_down(sequence, less, order + first, 0, end);
  3028. }
  3029. }
  3030. SZ_PUBLIC void sz_sort_introsort_recursion(sz_sequence_t *sequence, sz_sequence_comparator_t less, sz_size_t first,
  3031. sz_size_t last, sz_size_t depth) {
  3032. sz_size_t length = last - first;
  3033. switch (length) {
  3034. case 0:
  3035. case 1: return;
  3036. case 2:
  3037. if (less(sequence, sequence->order[first + 1], sequence->order[first]))
  3038. sz_u64_swap(&sequence->order[first], &sequence->order[first + 1]);
  3039. return;
  3040. case 3: {
  3041. sz_u64_t a = sequence->order[first];
  3042. sz_u64_t b = sequence->order[first + 1];
  3043. sz_u64_t c = sequence->order[first + 2];
  3044. if (less(sequence, b, a)) sz_u64_swap(&a, &b);
  3045. if (less(sequence, c, b)) sz_u64_swap(&c, &b);
  3046. if (less(sequence, b, a)) sz_u64_swap(&a, &b);
  3047. sequence->order[first] = a;
  3048. sequence->order[first + 1] = b;
  3049. sequence->order[first + 2] = c;
  3050. return;
  3051. }
  3052. }
  3053. // Until a certain length, the quadratic-complexity insertion-sort is fine
  3054. if (length <= 16) {
  3055. sz_sequence_t sub_seq = *sequence;
  3056. sub_seq.order += first;
  3057. sub_seq.count = length;
  3058. sz_sort_insertion(&sub_seq, less);
  3059. return;
  3060. }
3061. // Fall back to the N-logN-complexity heap-sort
  3062. if (depth == 0) {
  3063. _sz_heapsort(sequence, less, first, last);
  3064. return;
  3065. }
  3066. --depth;
  3067. // Median-of-three logic to choose pivot
  3068. sz_size_t median = first + length / 2;
  3069. if (less(sequence, sequence->order[median], sequence->order[first]))
  3070. sz_u64_swap(&sequence->order[first], &sequence->order[median]);
  3071. if (less(sequence, sequence->order[last - 1], sequence->order[first]))
  3072. sz_u64_swap(&sequence->order[first], &sequence->order[last - 1]);
  3073. if (less(sequence, sequence->order[median], sequence->order[last - 1]))
  3074. sz_u64_swap(&sequence->order[median], &sequence->order[last - 1]);
  3075. // Partition using the median-of-three as the pivot
  3076. sz_u64_t pivot = sequence->order[median];
  3077. sz_size_t left = first;
  3078. sz_size_t right = last - 1;
  3079. while (1) {
  3080. while (less(sequence, sequence->order[left], pivot)) left++;
  3081. while (less(sequence, pivot, sequence->order[right])) right--;
  3082. if (left >= right) break;
  3083. sz_u64_swap(&sequence->order[left], &sequence->order[right]);
  3084. left++;
  3085. right--;
  3086. }
  3087. // Recursively sort the partitions
  3088. sz_sort_introsort_recursion(sequence, less, first, left, depth);
  3089. sz_sort_introsort_recursion(sequence, less, right + 1, last, depth);
  3090. }
  3091. SZ_PUBLIC void sz_sort_introsort(sz_sequence_t *sequence, sz_sequence_comparator_t less) {
  3092. if (sequence->count == 0) return;
  3093. sz_size_t size_is_not_power_of_two = (sequence->count & (sequence->count - 1)) != 0;
  3094. sz_size_t depth_limit = sz_size_log2i_nonzero(sequence->count) + size_is_not_power_of_two;
  3095. sz_sort_introsort_recursion(sequence, less, 0, sequence->count, depth_limit);
  3096. }
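/*
 * A worked example of the depth limit above, for illustration: for `count == 100`,
 * `sz_size_log2i_nonzero(100) == 6` and 100 is not a power of two, so the recursion
 * may go ceil(log2(100)) == 7 levels deep before degrading to heap-sort.
 */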
  3097. SZ_PUBLIC void sz_sort_recursion( //
  3098. sz_sequence_t *sequence, sz_size_t bit_idx, sz_size_t bit_max, sz_sequence_comparator_t comparator,
  3099. sz_size_t partial_order_length) {
  3100. if (!sequence->count) return;
3101. // An array of size one doesn't need sorting - it only needs the prefix to be discarded.
  3102. if (sequence->count == 1) {
  3103. sz_u32_t *order_half_words = (sz_u32_t *)sequence->order;
  3104. order_half_words[1] = 0;
  3105. return;
  3106. }
  3107. // Partition a range of integers according to a specific bit value
  3108. sz_size_t split = 0;
  3109. sz_u64_t mask = (1ull << 63) >> bit_idx;
  3110. // The clean approach would be to perform a single pass over the sequence.
  3111. //
  3112. // while (split != sequence->count && !(sequence->order[split] & mask)) ++split;
  3113. // for (sz_size_t i = split + 1; i < sequence->count; ++i)
  3114. // if (!(sequence->order[i] & mask)) sz_u64_swap(sequence->order + i, sequence->order + split), ++split;
  3115. //
3116. // This, however, doesn't take into account the high relative cost of writes and swaps.
3117. // To circumvent that, we can first count the total number of entries to be mapped into either part,
3118. // and then walk through both parts, swapping the entries that are in the wrong part.
3119. // This often leads to a ~15% performance gain.
  3120. sz_size_t count_with_bit_set = 0;
  3121. for (sz_size_t i = 0; i != sequence->count; ++i) count_with_bit_set += (sequence->order[i] & mask) != 0;
  3122. split = sequence->count - count_with_bit_set;
  3123. // It's possible that the sequence is already partitioned.
  3124. if (split != 0 && split != sequence->count) {
  3125. // Use two pointers to efficiently reposition elements.
3126. // One pointer walks left-to-right from the start, and the other walks right-to-left from the end.
  3127. sz_size_t left = 0;
  3128. sz_size_t right = sequence->count - 1;
  3129. while (1) {
  3130. // Find the next element with the bit set on the left side.
  3131. while (left < split && !(sequence->order[left] & mask)) ++left;
  3132. // Find the next element without the bit set on the right side.
  3133. while (right >= split && (sequence->order[right] & mask)) --right;
  3134. // Swap the mispositioned elements.
  3135. if (left < split && right >= split) {
  3136. sz_u64_swap(sequence->order + left, sequence->order + right);
  3137. ++left;
  3138. --right;
  3139. }
  3140. else { break; }
  3141. }
  3142. }
  3143. // Go down recursively.
  3144. if (bit_idx < bit_max) {
  3145. sz_sequence_t a = *sequence;
  3146. a.count = split;
  3147. sz_sort_recursion(&a, bit_idx + 1, bit_max, comparator, partial_order_length);
  3148. sz_sequence_t b = *sequence;
  3149. b.order += split;
  3150. b.count -= split;
  3151. sz_sort_recursion(&b, bit_idx + 1, bit_max, comparator, partial_order_length);
  3152. }
  3153. // Reached the end of recursion.
  3154. else {
  3155. // Discard the prefixes.
  3156. sz_u32_t *order_half_words = (sz_u32_t *)sequence->order;
  3157. for (sz_size_t i = 0; i != sequence->count; ++i) { order_half_words[i * 2 + 1] = 0; }
  3158. sz_sequence_t a = *sequence;
  3159. a.count = split;
  3160. sz_sort_introsort(&a, comparator);
  3161. sz_sequence_t b = *sequence;
  3162. b.order += split;
  3163. b.count -= split;
  3164. sz_sort_introsort(&b, comparator);
  3165. }
  3166. }
  3167. SZ_INTERNAL sz_bool_t _sz_sort_is_less(sz_sequence_t *sequence, sz_size_t i_key, sz_size_t j_key) {
  3168. sz_cptr_t i_str = sequence->get_start(sequence, i_key);
  3169. sz_cptr_t j_str = sequence->get_start(sequence, j_key);
  3170. sz_size_t i_len = sequence->get_length(sequence, i_key);
  3171. sz_size_t j_len = sequence->get_length(sequence, j_key);
  3172. return (sz_bool_t)(sz_order_serial(i_str, i_len, j_str, j_len) == sz_less_k);
  3173. }
  3174. SZ_PUBLIC void sz_sort_partial(sz_sequence_t *sequence, sz_size_t partial_order_length) {
  3175. #if SZ_DETECT_BIG_ENDIAN
  3176. // TODO: Implement partial sort for big-endian systems. For now this sorts the whole thing.
  3177. sz_unused(partial_order_length);
  3178. sz_sort_introsort(sequence, (sz_sequence_comparator_t)_sz_sort_is_less);
  3179. #else
3180. // Export up to 4 leading bytes of each string into the `order` entries themselves (see the worked example after this function)
  3181. for (sz_size_t i = 0; i != sequence->count; ++i) {
  3182. sz_cptr_t begin = sequence->get_start(sequence, sequence->order[i]);
  3183. sz_size_t length = sequence->get_length(sequence, sequence->order[i]);
  3184. length = length > 4u ? 4u : length;
  3185. sz_ptr_t prefix = (sz_ptr_t)&sequence->order[i];
  3186. for (sz_size_t j = 0; j != length; ++j) prefix[7 - j] = begin[j];
  3187. }
  3188. // Perform optionally-parallel radix sort on them
  3189. sz_sort_recursion(sequence, 0, 32, (sz_sequence_comparator_t)_sz_sort_is_less, partial_order_length);
  3190. #endif
  3191. }
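/*
 * A worked example of the 4-byte prefix export above, for illustration: on a
 * little-endian machine the leading bytes of "appl..." land in the top half of
 * the 64-bit entry, so `order[i] == (0x6170706Cull << 32) | original_index`,
 * and comparing the raw integers compares the prefixes lexicographically first.
 */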
  3192. SZ_PUBLIC void sz_sort(sz_sequence_t *sequence) {
  3193. #if SZ_DETECT_BIG_ENDIAN
  3194. sz_sort_introsort(sequence, (sz_sequence_comparator_t)_sz_sort_is_less);
  3195. #else
  3196. sz_sort_partial(sequence, sequence->count);
  3197. #endif
  3198. }
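/*
 * A minimal sorting sketch over an array of C-strings; the `_get_start` and
 * `_get_length` adapters below are hypothetical glue, and the exact
 * `sz_sequence_t` field names are assumed.
 */
// static sz_cptr_t _get_start(sz_sequence_t const *seq, sz_size_t idx) {
//     return ((sz_cptr_t const *)seq->handle)[idx];
// }
// static sz_size_t _get_length(sz_sequence_t const *seq, sz_size_t idx) {
//     sz_cptr_t str = ((sz_cptr_t const *)seq->handle)[idx];
//     sz_size_t len = 0;
//     while (str[len]) ++len;
//     return len;
// }
// ...
// sz_cptr_t strings[3] = {"banana", "apple", "cherry"};
// sz_u64_t order[3] = {0, 1, 2};
// sz_sequence_t sequence;
// sequence.order = order, sequence.count = 3, sequence.handle = strings;
// sequence.get_start = _get_start, sequence.get_length = _get_length;
// sz_sort(&sequence); // order becomes {1, 0, 2}: "apple", "banana", "cherry"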
  3199. #pragma endregion
  3200. /*
  3201. * @brief AVX2 implementation of the string search algorithms.
  3202. * Very minimalistic, but still faster than the serial implementation.
  3203. */
  3204. #pragma region AVX2 Implementation
  3205. #if SZ_USE_X86_AVX2
  3206. #pragma GCC push_options
  3207. #pragma GCC target("avx2")
  3208. #pragma clang attribute push(__attribute__((target("avx2"))), apply_to = function)
  3209. #include <immintrin.h>
  3210. /**
  3211. * @brief Helper structure to simplify work with 256-bit registers.
  3212. */
  3213. typedef union sz_u256_vec_t {
  3214. __m256i ymm;
  3215. __m128i xmms[2];
  3216. sz_u64_t u64s[4];
  3217. sz_u32_t u32s[8];
  3218. sz_u16_t u16s[16];
  3219. sz_u8_t u8s[32];
  3220. } sz_u256_vec_t;
  3221. SZ_PUBLIC void sz_fill_avx2(sz_ptr_t target, sz_size_t length, sz_u8_t value) {
  3222. for (; length >= 32; target += 32, length -= 32) _mm256_storeu_si256((__m256i *)target, _mm256_set1_epi8(value));
  3223. sz_fill_serial(target, length, value);
  3224. }
  3225. SZ_PUBLIC void sz_copy_avx2(sz_ptr_t target, sz_cptr_t source, sz_size_t length) {
  3226. for (; length >= 32; target += 32, source += 32, length -= 32)
  3227. _mm256_storeu_si256((__m256i *)target, _mm256_lddqu_si256((__m256i const *)source));
  3228. sz_copy_serial(target, source, length);
  3229. }
  3230. SZ_PUBLIC void sz_move_avx2(sz_ptr_t target, sz_cptr_t source, sz_size_t length) {
  3231. if (target < source || target >= source + length) {
  3232. for (; length >= 32; target += 32, source += 32, length -= 32)
  3233. _mm256_storeu_si256((__m256i *)target, _mm256_lddqu_si256((__m256i const *)source));
  3234. while (length--) *(target++) = *(source++);
  3235. }
  3236. else {
  3237. // Jump to the end and walk backwards.
  3238. for (target += length, source += length; length >= 32; length -= 32)
  3239. _mm256_storeu_si256((__m256i *)(target -= 32), _mm256_lddqu_si256((__m256i const *)(source -= 32)));
  3240. while (length--) *(--target) = *(--source);
  3241. }
  3242. }
  3243. SZ_PUBLIC sz_cptr_t sz_find_byte_avx2(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n) {
  3244. int mask;
  3245. sz_u256_vec_t h_vec, n_vec;
  3246. n_vec.ymm = _mm256_set1_epi8(n[0]);
  3247. while (h_length >= 32) {
  3248. h_vec.ymm = _mm256_lddqu_si256((__m256i const *)h);
  3249. mask = _mm256_movemask_epi8(_mm256_cmpeq_epi8(h_vec.ymm, n_vec.ymm));
  3250. if (mask) return h + sz_u32_ctz(mask);
  3251. h += 32, h_length -= 32;
  3252. }
  3253. return sz_find_byte_serial(h, h_length, n);
  3254. }
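/*
 * A worked example of the mask math above, for illustration: if the loaded block
 * held "hello world..." and `n[0] == 'o'`, the movemask would have bits 4 and 7 set,
 * and `sz_u32_ctz` would return 4 - the offset of the first match in the block.
 */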
  3255. SZ_PUBLIC sz_cptr_t sz_rfind_byte_avx2(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n) {
  3256. int mask;
  3257. sz_u256_vec_t h_vec, n_vec;
  3258. n_vec.ymm = _mm256_set1_epi8(n[0]);
  3259. while (h_length >= 32) {
  3260. h_vec.ymm = _mm256_lddqu_si256((__m256i const *)(h + h_length - 32));
  3261. mask = _mm256_movemask_epi8(_mm256_cmpeq_epi8(h_vec.ymm, n_vec.ymm));
  3262. if (mask) return h + h_length - 1 - sz_u32_clz(mask);
  3263. h_length -= 32;
  3264. }
  3265. return sz_rfind_byte_serial(h, h_length, n);
  3266. }
  3267. SZ_PUBLIC sz_cptr_t sz_find_avx2(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) {
  3268. // This almost never fires, but it's better to be safe than sorry.
  3269. if (h_length < n_length || !n_length) return SZ_NULL_CHAR;
  3270. if (n_length == 1) return sz_find_byte_avx2(h, h_length, n);
  3271. // Pick the parts of the needle that are worth comparing.
  3272. sz_size_t offset_first, offset_mid, offset_last;
  3273. _sz_locate_needle_anomalies(n, n_length, &offset_first, &offset_mid, &offset_last);
  3274. // Broadcast those characters into YMM registers.
  3275. int matches;
  3276. sz_u256_vec_t h_first_vec, h_mid_vec, h_last_vec, n_first_vec, n_mid_vec, n_last_vec;
  3277. n_first_vec.ymm = _mm256_set1_epi8(n[offset_first]);
  3278. n_mid_vec.ymm = _mm256_set1_epi8(n[offset_mid]);
  3279. n_last_vec.ymm = _mm256_set1_epi8(n[offset_last]);
  3280. // Scan through the string.
  3281. for (; h_length >= n_length + 32; h += 32, h_length -= 32) {
  3282. h_first_vec.ymm = _mm256_lddqu_si256((__m256i const *)(h + offset_first));
  3283. h_mid_vec.ymm = _mm256_lddqu_si256((__m256i const *)(h + offset_mid));
  3284. h_last_vec.ymm = _mm256_lddqu_si256((__m256i const *)(h + offset_last));
  3285. matches = _mm256_movemask_epi8(_mm256_cmpeq_epi8(h_first_vec.ymm, n_first_vec.ymm)) &
  3286. _mm256_movemask_epi8(_mm256_cmpeq_epi8(h_mid_vec.ymm, n_mid_vec.ymm)) &
  3287. _mm256_movemask_epi8(_mm256_cmpeq_epi8(h_last_vec.ymm, n_last_vec.ymm));
  3288. while (matches) {
  3289. int potential_offset = sz_u32_ctz(matches);
  3290. if (sz_equal(h + potential_offset, n, n_length)) return h + potential_offset;
  3291. matches &= matches - 1;
  3292. }
  3293. }
  3294. return sz_find_serial(h, h_length, n, n_length);
  3295. }
  3296. SZ_PUBLIC sz_cptr_t sz_rfind_avx2(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) {
  3297. // This almost never fires, but it's better to be safe than sorry.
  3298. if (h_length < n_length || !n_length) return SZ_NULL_CHAR;
  3299. if (n_length == 1) return sz_rfind_byte_avx2(h, h_length, n);
  3300. // Pick the parts of the needle that are worth comparing.
  3301. sz_size_t offset_first, offset_mid, offset_last;
  3302. _sz_locate_needle_anomalies(n, n_length, &offset_first, &offset_mid, &offset_last);
  3303. // Broadcast those characters into YMM registers.
  3304. int matches;
  3305. sz_u256_vec_t h_first_vec, h_mid_vec, h_last_vec, n_first_vec, n_mid_vec, n_last_vec;
  3306. n_first_vec.ymm = _mm256_set1_epi8(n[offset_first]);
  3307. n_mid_vec.ymm = _mm256_set1_epi8(n[offset_mid]);
  3308. n_last_vec.ymm = _mm256_set1_epi8(n[offset_last]);
  3309. // Scan through the string.
  3310. sz_cptr_t h_reversed;
  3311. for (; h_length >= n_length + 32; h_length -= 32) {
  3312. h_reversed = h + h_length - n_length - 32 + 1;
  3313. h_first_vec.ymm = _mm256_lddqu_si256((__m256i const *)(h_reversed + offset_first));
  3314. h_mid_vec.ymm = _mm256_lddqu_si256((__m256i const *)(h_reversed + offset_mid));
  3315. h_last_vec.ymm = _mm256_lddqu_si256((__m256i const *)(h_reversed + offset_last));
  3316. matches = _mm256_movemask_epi8(_mm256_cmpeq_epi8(h_first_vec.ymm, n_first_vec.ymm)) &
  3317. _mm256_movemask_epi8(_mm256_cmpeq_epi8(h_mid_vec.ymm, n_mid_vec.ymm)) &
  3318. _mm256_movemask_epi8(_mm256_cmpeq_epi8(h_last_vec.ymm, n_last_vec.ymm));
  3319. while (matches) {
  3320. int potential_offset = sz_u32_clz(matches);
  3321. if (sz_equal(h + h_length - n_length - potential_offset, n, n_length))
  3322. return h + h_length - n_length - potential_offset;
  3323. matches &= ~(1 << (31 - potential_offset));
  3324. }
  3325. }
  3326. return sz_rfind_serial(h, h_length, n, n_length);
  3327. }
  3328. /**
  3329. * @brief There is no AVX2 instruction for fast multiplication of 64-bit integers.
3330. * This implementation comes from Agner Fog's Vector Class Library.
  3331. */
  3332. SZ_INTERNAL __m256i _mm256_mul_epu64(__m256i a, __m256i b) {
  3333. __m256i bswap = _mm256_shuffle_epi32(b, 0xB1);
  3334. __m256i prodlh = _mm256_mullo_epi32(a, bswap);
  3335. __m256i zero = _mm256_setzero_si256();
  3336. __m256i prodlh2 = _mm256_hadd_epi32(prodlh, zero);
  3337. __m256i prodlh3 = _mm256_shuffle_epi32(prodlh2, 0x73);
  3338. __m256i prodll = _mm256_mul_epu32(a, b);
  3339. __m256i prod = _mm256_add_epi64(prodll, prodlh3);
  3340. return prod;
  3341. }
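/*
 * A scalar sketch of what the intrinsic sequence above computes per 64-bit lane,
 * for illustration only: the low 64 bits of the full product.
 */
// sz_u64_t lane_product(sz_u64_t a, sz_u64_t b) {
//     sz_u64_t low = (a & 0xFFFFFFFFull) * (b & 0xFFFFFFFFull); // `prodll`
//     sz_u64_t cross = (a >> 32) * (b & 0xFFFFFFFFull) +        // `prodlh`, folded by the `hadd`
//                      (a & 0xFFFFFFFFull) * (b >> 32);
//     return low + (cross << 32); // the high*high part overflows out of 64 bits
// }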
  3342. SZ_PUBLIC void sz_hashes_avx2(sz_cptr_t start, sz_size_t length, sz_size_t window_length, sz_size_t step, //
  3343. sz_hash_callback_t callback, void *callback_handle) {
  3344. if (length < window_length || !window_length) return;
  3345. if (length < 4 * window_length) {
  3346. sz_hashes_serial(start, length, window_length, step, callback, callback_handle);
  3347. return;
  3348. }
  3349. // Using AVX2, we can perform 4 long integer multiplications and additions within one register.
  3350. // So let's slice the entire string into 4 overlapping windows, to slide over them in parallel.
  3351. sz_size_t const max_hashes = length - window_length + 1;
  3352. sz_size_t const min_hashes_per_thread = max_hashes / 4; // At most one sequence can overlap between 2 threads.
  3353. sz_u8_t const *text_first = (sz_u8_t const *)start;
  3354. sz_u8_t const *text_second = text_first + min_hashes_per_thread;
  3355. sz_u8_t const *text_third = text_first + min_hashes_per_thread * 2;
  3356. sz_u8_t const *text_fourth = text_first + min_hashes_per_thread * 3;
  3357. sz_u8_t const *text_end = text_first + length;
3358. // Prepare the `prime ^ window_length` values that we are going to use for modulo arithmetic.
  3359. sz_u64_t prime_power_low = 1, prime_power_high = 1;
  3360. for (sz_size_t i = 0; i + 1 < window_length; ++i)
  3361. prime_power_low = (prime_power_low * 31ull) % SZ_U64_MAX_PRIME,
  3362. prime_power_high = (prime_power_high * 257ull) % SZ_U64_MAX_PRIME;
  3363. // Broadcast the constants into the registers.
  3364. sz_u256_vec_t prime_vec, golden_ratio_vec;
  3365. sz_u256_vec_t base_low_vec, base_high_vec, prime_power_low_vec, prime_power_high_vec, shift_high_vec;
  3366. base_low_vec.ymm = _mm256_set1_epi64x(31ull);
  3367. base_high_vec.ymm = _mm256_set1_epi64x(257ull);
  3368. shift_high_vec.ymm = _mm256_set1_epi64x(77ull);
  3369. prime_vec.ymm = _mm256_set1_epi64x(SZ_U64_MAX_PRIME);
  3370. golden_ratio_vec.ymm = _mm256_set1_epi64x(11400714819323198485ull);
  3371. prime_power_low_vec.ymm = _mm256_set1_epi64x(prime_power_low);
  3372. prime_power_high_vec.ymm = _mm256_set1_epi64x(prime_power_high);
  3373. // Compute the initial hash values for every one of the four windows.
  3374. sz_u256_vec_t hash_low_vec, hash_high_vec, hash_mix_vec, chars_low_vec, chars_high_vec;
  3375. hash_low_vec.ymm = _mm256_setzero_si256();
  3376. hash_high_vec.ymm = _mm256_setzero_si256();
  3377. for (sz_u8_t const *prefix_end = text_first + window_length; text_first < prefix_end;
  3378. ++text_first, ++text_second, ++text_third, ++text_fourth) {
  3379. // 1. Multiply the hashes by the base.
  3380. hash_low_vec.ymm = _mm256_mul_epu64(hash_low_vec.ymm, base_low_vec.ymm);
  3381. hash_high_vec.ymm = _mm256_mul_epu64(hash_high_vec.ymm, base_high_vec.ymm);
3382. // 2. Load the four characters from `text_first`, `text_first + min_hashes_per_thread`,
3383. // `text_first + min_hashes_per_thread * 2`, `text_first + min_hashes_per_thread * 3`.
  3384. chars_low_vec.ymm = _mm256_set_epi64x(text_fourth[0], text_third[0], text_second[0], text_first[0]);
  3385. chars_high_vec.ymm = _mm256_add_epi8(chars_low_vec.ymm, shift_high_vec.ymm);
  3386. // 3. Add the incoming characters.
  3387. hash_low_vec.ymm = _mm256_add_epi64(hash_low_vec.ymm, chars_low_vec.ymm);
  3388. hash_high_vec.ymm = _mm256_add_epi64(hash_high_vec.ymm, chars_high_vec.ymm);
  3389. // 4. Compute the modulo. Assuming there are only 59 values between our prime
  3390. // and the 2^64 value, we can simply compute the modulo by conditionally subtracting the prime.
  3391. hash_low_vec.ymm = _mm256_blendv_epi8(hash_low_vec.ymm, _mm256_sub_epi64(hash_low_vec.ymm, prime_vec.ymm),
  3392. _mm256_cmpgt_epi64(hash_low_vec.ymm, prime_vec.ymm));
  3393. hash_high_vec.ymm = _mm256_blendv_epi8(hash_high_vec.ymm, _mm256_sub_epi64(hash_high_vec.ymm, prime_vec.ymm),
  3394. _mm256_cmpgt_epi64(hash_high_vec.ymm, prime_vec.ymm));
  3395. }
3396. // 5. Compute the hash mix that will be used to index into the fingerprint.
  3397. // This includes a serial step at the end.
  3398. hash_low_vec.ymm = _mm256_mul_epu64(hash_low_vec.ymm, golden_ratio_vec.ymm);
  3399. hash_high_vec.ymm = _mm256_mul_epu64(hash_high_vec.ymm, golden_ratio_vec.ymm);
  3400. hash_mix_vec.ymm = _mm256_xor_si256(hash_low_vec.ymm, hash_high_vec.ymm);
  3401. callback((sz_cptr_t)text_first, window_length, hash_mix_vec.u64s[0], callback_handle);
  3402. callback((sz_cptr_t)text_second, window_length, hash_mix_vec.u64s[1], callback_handle);
  3403. callback((sz_cptr_t)text_third, window_length, hash_mix_vec.u64s[2], callback_handle);
  3404. callback((sz_cptr_t)text_fourth, window_length, hash_mix_vec.u64s[3], callback_handle);
  3405. // Now repeat that operation for the remaining characters, discarding older characters.
  3406. sz_size_t cycle = 1;
  3407. sz_size_t const step_mask = step - 1;
  3408. for (; text_fourth != text_end; ++text_first, ++text_second, ++text_third, ++text_fourth, ++cycle) {
  3409. // 0. Load again the four characters we are dropping, shift them, and subtract.
  3410. chars_low_vec.ymm = _mm256_set_epi64x(text_fourth[-window_length], text_third[-window_length],
  3411. text_second[-window_length], text_first[-window_length]);
  3412. chars_high_vec.ymm = _mm256_add_epi8(chars_low_vec.ymm, shift_high_vec.ymm);
  3413. hash_low_vec.ymm =
  3414. _mm256_sub_epi64(hash_low_vec.ymm, _mm256_mul_epu64(chars_low_vec.ymm, prime_power_low_vec.ymm));
  3415. hash_high_vec.ymm =
  3416. _mm256_sub_epi64(hash_high_vec.ymm, _mm256_mul_epu64(chars_high_vec.ymm, prime_power_high_vec.ymm));
  3417. // 1. Multiply the hashes by the base.
  3418. hash_low_vec.ymm = _mm256_mul_epu64(hash_low_vec.ymm, base_low_vec.ymm);
  3419. hash_high_vec.ymm = _mm256_mul_epu64(hash_high_vec.ymm, base_high_vec.ymm);
3420. // 2. Load the four characters from `text_first`, `text_first + min_hashes_per_thread`,
3421. // `text_first + min_hashes_per_thread * 2`, `text_first + min_hashes_per_thread * 3`.
  3422. chars_low_vec.ymm = _mm256_set_epi64x(text_fourth[0], text_third[0], text_second[0], text_first[0]);
  3423. chars_high_vec.ymm = _mm256_add_epi8(chars_low_vec.ymm, shift_high_vec.ymm);
  3424. // 3. Add the incoming characters.
  3425. hash_low_vec.ymm = _mm256_add_epi64(hash_low_vec.ymm, chars_low_vec.ymm);
  3426. hash_high_vec.ymm = _mm256_add_epi64(hash_high_vec.ymm, chars_high_vec.ymm);
  3427. // 4. Compute the modulo. Assuming there are only 59 values between our prime
  3428. // and the 2^64 value, we can simply compute the modulo by conditionally subtracting the prime.
  3429. hash_low_vec.ymm = _mm256_blendv_epi8(hash_low_vec.ymm, _mm256_sub_epi64(hash_low_vec.ymm, prime_vec.ymm),
  3430. _mm256_cmpgt_epi64(hash_low_vec.ymm, prime_vec.ymm));
  3431. hash_high_vec.ymm = _mm256_blendv_epi8(hash_high_vec.ymm, _mm256_sub_epi64(hash_high_vec.ymm, prime_vec.ymm),
  3432. _mm256_cmpgt_epi64(hash_high_vec.ymm, prime_vec.ymm));
3433. // 5. Compute the hash mix that will be used to index into the fingerprint.
  3434. // This includes a serial step at the end.
  3435. hash_low_vec.ymm = _mm256_mul_epu64(hash_low_vec.ymm, golden_ratio_vec.ymm);
  3436. hash_high_vec.ymm = _mm256_mul_epu64(hash_high_vec.ymm, golden_ratio_vec.ymm);
  3437. hash_mix_vec.ymm = _mm256_xor_si256(hash_low_vec.ymm, hash_high_vec.ymm);
  3438. if ((cycle & step_mask) == 0) {
  3439. callback((sz_cptr_t)text_first, window_length, hash_mix_vec.u64s[0], callback_handle);
  3440. callback((sz_cptr_t)text_second, window_length, hash_mix_vec.u64s[1], callback_handle);
  3441. callback((sz_cptr_t)text_third, window_length, hash_mix_vec.u64s[2], callback_handle);
  3442. callback((sz_cptr_t)text_fourth, window_length, hash_mix_vec.u64s[3], callback_handle);
  3443. }
  3444. }
  3445. }
  3446. #pragma clang attribute pop
  3447. #pragma GCC pop_options
  3448. #endif
  3449. #pragma endregion
  3450. /*
  3451. * @brief AVX-512 implementation of the string search algorithms.
  3452. *
  3453. * Different subsets of AVX-512 were introduced in different years:
  3454. * * 2017 SkyLake: F, CD, ER, PF, VL, DQ, BW
  3455. * * 2018 CannonLake: IFMA, VBMI
  3456. * * 2019 IceLake: VPOPCNTDQ, VNNI, VBMI2, BITALG, GFNI, VPCLMULQDQ, VAES
  3457. * * 2020 TigerLake: VP2INTERSECT
  3458. */
  3459. #pragma region AVX-512 Implementation
  3460. #if SZ_USE_X86_AVX512
  3461. #pragma GCC push_options
  3462. #pragma GCC target("avx", "avx512f", "avx512vl", "avx512bw", "bmi", "bmi2")
  3463. #pragma clang attribute push(__attribute__((target("avx,avx512f,avx512vl,avx512bw,bmi,bmi2"))), apply_to = function)
  3464. #include <immintrin.h>
  3465. /**
  3466. * @brief Helper structure to simplify work with 512-bit registers.
  3467. */
  3468. typedef union sz_u512_vec_t {
  3469. __m512i zmm;
  3470. __m256i ymms[2];
  3471. __m128i xmms[4];
  3472. sz_u64_t u64s[8];
  3473. sz_u32_t u32s[16];
  3474. sz_u16_t u16s[32];
  3475. sz_u8_t u8s[64];
  3476. sz_i64_t i64s[8];
  3477. sz_i32_t i32s[16];
  3478. } sz_u512_vec_t;
  3479. SZ_INTERNAL __mmask64 _sz_u64_clamp_mask_until(sz_size_t n) {
3480. // The simplest approach to compute this if we know that `n` is below or equal to 64:
  3481. // return (1ull << n) - 1;
  3482. // A slightly more complex approach, if we don't know that `n` is under 64:
  3483. return _bzhi_u64(0xFFFFFFFFFFFFFFFF, n < 64 ? n : 64);
  3484. }
  3485. SZ_INTERNAL __mmask32 _sz_u32_clamp_mask_until(sz_size_t n) {
3486. // The simplest approach to compute this if we know that `n` is below or equal to 32:
  3487. // return (1ull << n) - 1;
  3488. // A slightly more complex approach, if we don't know that `n` is under 32:
  3489. return _bzhi_u32(0xFFFFFFFF, n < 32 ? n : 32);
  3490. }
  3491. SZ_INTERNAL __mmask16 _sz_u16_clamp_mask_until(sz_size_t n) {
3492. // The simplest approach to compute this if we know that `n` is below or equal to 16:
  3493. // return (1ull << n) - 1;
  3494. // A slightly more complex approach, if we don't know that `n` is under 16:
  3495. return _bzhi_u32(0xFFFFFFFF, n < 16 ? n : 16);
  3496. }
  3497. SZ_INTERNAL __mmask64 _sz_u64_mask_until(sz_size_t n) {
3498. // The simplest approach to compute this if we know that `n` is below or equal to 64:
  3499. // return (1ull << n) - 1;
  3500. // A slightly more complex approach, if we don't know that `n` is under 64:
  3501. return _bzhi_u64(0xFFFFFFFFFFFFFFFF, n);
  3502. }
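/*
 * A worked example of the mask helpers above, for illustration:
 * `_sz_u64_mask_until(5)` yields 0x1F - the five lowest bits set - so a masked load
 * like `_mm512_maskz_loadu_epi8(mask, ptr)` touches only the first five bytes and
 * zero-fills the rest of the ZMM register.
 */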
  3503. SZ_PUBLIC sz_ordering_t sz_order_avx512(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length) {
  3504. sz_u512_vec_t a_vec, b_vec;
  3505. __mmask64 a_mask, b_mask, mask_not_equal;
3506. // The rare case when both strings are very long.
  3507. while ((a_length >= 64) & (b_length >= 64)) {
  3508. a_vec.zmm = _mm512_loadu_epi8(a);
  3509. b_vec.zmm = _mm512_loadu_epi8(b);
  3510. mask_not_equal = _mm512_cmpneq_epi8_mask(a_vec.zmm, b_vec.zmm);
  3511. if (mask_not_equal != 0) {
  3512. int first_diff = _tzcnt_u64(mask_not_equal);
  3513. char a_char = a[first_diff];
  3514. char b_char = b[first_diff];
  3515. return _sz_order_scalars(a_char, b_char);
  3516. }
  3517. a += 64, b += 64, a_length -= 64, b_length -= 64;
  3518. }
  3519. // In most common scenarios at least one of the strings is under 64 bytes.
  3520. if (a_length | b_length) {
  3521. a_mask = _sz_u64_clamp_mask_until(a_length);
  3522. b_mask = _sz_u64_clamp_mask_until(b_length);
  3523. a_vec.zmm = _mm512_maskz_loadu_epi8(a_mask, a);
  3524. b_vec.zmm = _mm512_maskz_loadu_epi8(b_mask, b);
3525. // The AVX-512 `_mm512_mask_cmpneq_epi8_mask` intrinsics are generally handy in such environments.
3526. // They, however, have a latency of 3 cycles on most modern CPUs. Using the AVX2 `_mm256_cmpeq_epi8` would have
3527. // been cheaper, if we didn't have to apply `_mm256_movemask_epi8` afterwards.
  3528. mask_not_equal = _mm512_cmpneq_epi8_mask(a_vec.zmm, b_vec.zmm);
  3529. if (mask_not_equal != 0) {
  3530. int first_diff = _tzcnt_u64(mask_not_equal);
  3531. char a_char = a[first_diff];
  3532. char b_char = b[first_diff];
  3533. return _sz_order_scalars(a_char, b_char);
  3534. }
  3535. else
3536. // From a logic perspective, the hardest cases are "abc\0" and "abc".
  3537. // The result must be `sz_greater_k`, as the latter is shorter.
  3538. return _sz_order_scalars(a_length, b_length);
  3539. }
  3540. else
  3541. return sz_equal_k;
  3542. }
  3543. SZ_PUBLIC sz_bool_t sz_equal_avx512(sz_cptr_t a, sz_cptr_t b, sz_size_t length) {
  3544. __mmask64 mask;
  3545. sz_u512_vec_t a_vec, b_vec;
  3546. while (length >= 64) {
  3547. a_vec.zmm = _mm512_loadu_epi8(a);
  3548. b_vec.zmm = _mm512_loadu_epi8(b);
  3549. mask = _mm512_cmpneq_epi8_mask(a_vec.zmm, b_vec.zmm);
  3550. if (mask != 0) return sz_false_k;
  3551. a += 64, b += 64, length -= 64;
  3552. }
  3553. if (length) {
  3554. mask = _sz_u64_mask_until(length);
  3555. a_vec.zmm = _mm512_maskz_loadu_epi8(mask, a);
  3556. b_vec.zmm = _mm512_maskz_loadu_epi8(mask, b);
  3557. // Reuse the same `mask` variable to find the bit that doesn't match
  3558. mask = _mm512_mask_cmpneq_epi8_mask(mask, a_vec.zmm, b_vec.zmm);
  3559. return (sz_bool_t)(mask == 0);
  3560. }
  3561. else
  3562. return sz_true_k;
  3563. }
  3564. SZ_PUBLIC void sz_fill_avx512(sz_ptr_t target, sz_size_t length, sz_u8_t value) {
  3565. for (; length >= 64; target += 64, length -= 64) _mm512_storeu_epi8(target, _mm512_set1_epi8(value));
  3566. // At this point the length is guaranteed to be under 64.
  3567. _mm512_mask_storeu_epi8(target, _sz_u64_mask_until(length), _mm512_set1_epi8(value));
  3568. }
  3569. SZ_PUBLIC void sz_copy_avx512(sz_ptr_t target, sz_cptr_t source, sz_size_t length) {
  3570. for (; length >= 64; target += 64, source += 64, length -= 64)
  3571. _mm512_storeu_epi8(target, _mm512_loadu_epi8(source));
  3572. // At this point the length is guaranteed to be under 64.
  3573. __mmask64 mask = _sz_u64_mask_until(length);
  3574. _mm512_mask_storeu_epi8(target, mask, _mm512_maskz_loadu_epi8(mask, source));
  3575. }
  3576. SZ_PUBLIC void sz_move_avx512(sz_ptr_t target, sz_cptr_t source, sz_size_t length) {
  3577. if (target < source || target >= source + length) {
  3578. for (; length >= 64; target += 64, source += 64, length -= 64)
  3579. _mm512_storeu_epi8(target, _mm512_loadu_epi8(source));
  3580. // At this point the length is guaranteed to be under 64.
  3581. __mmask64 mask = _sz_u64_mask_until(length);
  3582. _mm512_mask_storeu_epi8(target, mask, _mm512_maskz_loadu_epi8(mask, source));
  3583. }
  3584. else {
  3585. // Jump to the end and walk backwards.
  3586. for (target += length, source += length; length >= 64; length -= 64)
  3587. _mm512_storeu_epi8(target -= 64, _mm512_loadu_epi8(source -= 64));
  3588. // At this point the length is guaranteed to be under 64.
  3589. __mmask64 mask = _sz_u64_mask_until(length);
  3590. _mm512_mask_storeu_epi8(target - length, mask, _mm512_maskz_loadu_epi8(mask, source - length));
  3591. }
  3592. }
  3593. SZ_PUBLIC sz_cptr_t sz_find_byte_avx512(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n) {
  3594. __mmask64 mask;
  3595. sz_u512_vec_t h_vec, n_vec;
  3596. n_vec.zmm = _mm512_set1_epi8(n[0]);
  3597. while (h_length >= 64) {
  3598. h_vec.zmm = _mm512_loadu_epi8(h);
  3599. mask = _mm512_cmpeq_epi8_mask(h_vec.zmm, n_vec.zmm);
  3600. if (mask) return h + sz_u64_ctz(mask);
  3601. h += 64, h_length -= 64;
  3602. }
  3603. if (h_length) {
  3604. mask = _sz_u64_mask_until(h_length);
  3605. h_vec.zmm = _mm512_maskz_loadu_epi8(mask, h);
  3606. // Reuse the same `mask` variable to find the bit that doesn't match
  3607. mask = _mm512_mask_cmpeq_epu8_mask(mask, h_vec.zmm, n_vec.zmm);
  3608. if (mask) return h + sz_u64_ctz(mask);
  3609. }
  3610. return SZ_NULL_CHAR;
  3611. }
  3612. SZ_PUBLIC sz_cptr_t sz_find_avx512(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) {
  3613. // This almost never fires, but it's better to be safe than sorry.
  3614. if (h_length < n_length || !n_length) return SZ_NULL_CHAR;
  3615. if (n_length == 1) return sz_find_byte_avx512(h, h_length, n);
  3616. // Pick the parts of the needle that are worth comparing.
  3617. sz_size_t offset_first, offset_mid, offset_last;
  3618. _sz_locate_needle_anomalies(n, n_length, &offset_first, &offset_mid, &offset_last);
  3619. // Broadcast those characters into ZMM registers.
  3620. __mmask64 matches;
  3621. __mmask64 mask;
  3622. sz_u512_vec_t h_first_vec, h_mid_vec, h_last_vec, n_first_vec, n_mid_vec, n_last_vec;
  3623. n_first_vec.zmm = _mm512_set1_epi8(n[offset_first]);
  3624. n_mid_vec.zmm = _mm512_set1_epi8(n[offset_mid]);
  3625. n_last_vec.zmm = _mm512_set1_epi8(n[offset_last]);
  3626. // Scan through the string.
  3627. for (; h_length >= n_length + 64; h += 64, h_length -= 64) {
  3628. h_first_vec.zmm = _mm512_loadu_epi8(h + offset_first);
  3629. h_mid_vec.zmm = _mm512_loadu_epi8(h + offset_mid);
  3630. h_last_vec.zmm = _mm512_loadu_epi8(h + offset_last);
  3631. matches = _kand_mask64(_kand_mask64( // Intersect the masks
  3632. _mm512_cmpeq_epi8_mask(h_first_vec.zmm, n_first_vec.zmm),
  3633. _mm512_cmpeq_epi8_mask(h_mid_vec.zmm, n_mid_vec.zmm)),
  3634. _mm512_cmpeq_epi8_mask(h_last_vec.zmm, n_last_vec.zmm));
  3635. while (matches) {
  3636. int potential_offset = sz_u64_ctz(matches);
  3637. if (n_length <= 3 || sz_equal_avx512(h + potential_offset, n, n_length)) return h + potential_offset;
  3638. matches &= matches - 1;
  3639. }
  3640. // TODO: If the last character contains a bad byte, we can reposition the start of the next iteration.
  3641. // This will be very helpful for very long needles.
  3642. }
  3643. // The "tail" of the function uses masked loads to process the remaining bytes.
  3644. {
  3645. mask = _sz_u64_mask_until(h_length - n_length + 1);
  3646. h_first_vec.zmm = _mm512_maskz_loadu_epi8(mask, h + offset_first);
  3647. h_mid_vec.zmm = _mm512_maskz_loadu_epi8(mask, h + offset_mid);
  3648. h_last_vec.zmm = _mm512_maskz_loadu_epi8(mask, h + offset_last);
  3649. matches = _kand_mask64(_kand_mask64( // Intersect the masks
  3650. _mm512_cmpeq_epi8_mask(h_first_vec.zmm, n_first_vec.zmm),
  3651. _mm512_cmpeq_epi8_mask(h_mid_vec.zmm, n_mid_vec.zmm)),
  3652. _mm512_cmpeq_epi8_mask(h_last_vec.zmm, n_last_vec.zmm));
  3653. while (matches) {
  3654. int potential_offset = sz_u64_ctz(matches);
  3655. if (n_length <= 3 || sz_equal_avx512(h + potential_offset, n, n_length)) return h + potential_offset;
  3656. matches &= matches - 1;
  3657. }
  3658. }
  3659. return SZ_NULL_CHAR;
  3660. }
  3661. SZ_PUBLIC sz_cptr_t sz_rfind_byte_avx512(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n) {
  3662. __mmask64 mask;
  3663. sz_u512_vec_t h_vec, n_vec;
  3664. n_vec.zmm = _mm512_set1_epi8(n[0]);
  3665. while (h_length >= 64) {
  3666. h_vec.zmm = _mm512_loadu_epi8(h + h_length - 64);
  3667. mask = _mm512_cmpeq_epi8_mask(h_vec.zmm, n_vec.zmm);
  3668. if (mask) return h + h_length - 1 - sz_u64_clz(mask);
  3669. h_length -= 64;
  3670. }
  3671. if (h_length) {
  3672. mask = _sz_u64_mask_until(h_length);
  3673. h_vec.zmm = _mm512_maskz_loadu_epi8(mask, h);
  3674. // Reuse the same `mask` variable to find the bit that doesn't match
  3675. mask = _mm512_mask_cmpeq_epu8_mask(mask, h_vec.zmm, n_vec.zmm);
  3676. if (mask) return h + 64 - sz_u64_clz(mask) - 1;
  3677. }
  3678. return SZ_NULL_CHAR;
  3679. }
  3680. SZ_PUBLIC sz_cptr_t sz_rfind_avx512(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) {
  3681. // This almost never fires, but it's better to be safe than sorry.
  3682. if (h_length < n_length || !n_length) return SZ_NULL_CHAR;
  3683. if (n_length == 1) return sz_rfind_byte_avx512(h, h_length, n);
  3684. // Pick the parts of the needle that are worth comparing.
  3685. sz_size_t offset_first, offset_mid, offset_last;
  3686. _sz_locate_needle_anomalies(n, n_length, &offset_first, &offset_mid, &offset_last);
  3687. // Broadcast those characters into ZMM registers.
  3688. __mmask64 mask;
  3689. __mmask64 matches;
  3690. sz_u512_vec_t h_first_vec, h_mid_vec, h_last_vec, n_first_vec, n_mid_vec, n_last_vec;
  3691. n_first_vec.zmm = _mm512_set1_epi8(n[offset_first]);
  3692. n_mid_vec.zmm = _mm512_set1_epi8(n[offset_mid]);
  3693. n_last_vec.zmm = _mm512_set1_epi8(n[offset_last]);
  3694. // Scan through the string.
  3695. sz_cptr_t h_reversed;
  3696. for (; h_length >= n_length + 64; h_length -= 64) {
  3697. h_reversed = h + h_length - n_length - 64 + 1;
  3698. h_first_vec.zmm = _mm512_loadu_epi8(h_reversed + offset_first);
  3699. h_mid_vec.zmm = _mm512_loadu_epi8(h_reversed + offset_mid);
  3700. h_last_vec.zmm = _mm512_loadu_epi8(h_reversed + offset_last);
  3701. matches = _kand_mask64(_kand_mask64( // Intersect the masks
  3702. _mm512_cmpeq_epi8_mask(h_first_vec.zmm, n_first_vec.zmm),
  3703. _mm512_cmpeq_epi8_mask(h_mid_vec.zmm, n_mid_vec.zmm)),
  3704. _mm512_cmpeq_epi8_mask(h_last_vec.zmm, n_last_vec.zmm));
  3705. while (matches) {
  3706. int potential_offset = sz_u64_clz(matches);
  3707. if (n_length <= 3 || sz_equal_avx512(h + h_length - n_length - potential_offset, n, n_length))
  3708. return h + h_length - n_length - potential_offset;
  3709. sz_assert((matches & ((sz_u64_t)1 << (63 - potential_offset))) != 0 &&
  3710. "The bit must be set before we squash it");
  3711. matches &= ~((sz_u64_t)1 << (63 - potential_offset));
  3712. }
  3713. }
  3714. // The "tail" of the function uses masked loads to process the remaining bytes.
  3715. {
  3716. mask = _sz_u64_mask_until(h_length - n_length + 1);
  3717. h_first_vec.zmm = _mm512_maskz_loadu_epi8(mask, h + offset_first);
  3718. h_mid_vec.zmm = _mm512_maskz_loadu_epi8(mask, h + offset_mid);
  3719. h_last_vec.zmm = _mm512_maskz_loadu_epi8(mask, h + offset_last);
  3720. matches = _kand_mask64(_kand_mask64( // Intersect the masks
  3721. _mm512_cmpeq_epi8_mask(h_first_vec.zmm, n_first_vec.zmm),
  3722. _mm512_cmpeq_epi8_mask(h_mid_vec.zmm, n_mid_vec.zmm)),
  3723. _mm512_cmpeq_epi8_mask(h_last_vec.zmm, n_last_vec.zmm));
  3724. while (matches) {
  3725. int potential_offset = sz_u64_clz(matches);
  3726. if (n_length <= 3 || sz_equal_avx512(h + 64 - potential_offset - 1, n, n_length))
  3727. return h + 64 - potential_offset - 1;
  3728. sz_assert((matches & ((sz_u64_t)1 << (63 - potential_offset))) != 0 &&
  3729. "The bit must be set before we squash it");
  3730. matches &= ~((sz_u64_t)1 << (63 - potential_offset));
  3731. }
  3732. }
  3733. return SZ_NULL_CHAR;
  3734. }
  3735. SZ_INTERNAL sz_size_t _sz_edit_distance_skewed_diagonals_upto65k_avx512( //
  3736. sz_cptr_t shorter, sz_size_t shorter_length, //
  3737. sz_cptr_t longer, sz_size_t longer_length, //
  3738. sz_size_t bound, sz_memory_allocator_t *alloc) {
  3739. // Simplify usage in higher-level libraries, where wrapping custom allocators may be troublesome.
  3740. sz_memory_allocator_t global_alloc;
  3741. if (!alloc) {
  3742. sz_memory_allocator_init_default(&global_alloc);
  3743. alloc = &global_alloc;
  3744. }
  3745. // TODO: Generalize!
  3746. sz_size_t max_length = 256u * 256u;
  3747. sz_assert(!bound && "For bounded search the method should only evaluate one band of the matrix.");
  3748. sz_assert(shorter_length == longer_length && "The method hasn't been generalized to different length inputs yet.");
  3749. sz_assert(shorter_length < max_length && "The length must fit into 16-bit integer. Otherwise use serial variant.");
  3750. sz_unused(longer_length && bound && max_length);
  3751. // We are going to store 3 diagonals of the matrix.
  3752. // The length of the longest (main) diagonal would be `n = (shorter_length + 1)`.
  3753. sz_size_t n = shorter_length + 1;
3754. // Unlike the serial version, we also want to avoid reverse-order iteration over the shorter string.
  3755. // So let's allocate a bit more memory and reverse-export our shorter string into that buffer.
  3756. sz_size_t buffer_length = sizeof(sz_u16_t) * n * 3 + shorter_length;
  3757. sz_u16_t *distances = (sz_u16_t *)alloc->allocate(buffer_length, alloc->handle);
  3758. if (!distances) return SZ_SIZE_MAX;
  3759. sz_u16_t *previous_distances = distances;
  3760. sz_u16_t *current_distances = previous_distances + n;
  3761. sz_u16_t *next_distances = current_distances + n;
  3762. sz_ptr_t shorter_reversed = (sz_ptr_t)(next_distances + n);
  3763. // Export the reversed string into the buffer.
  3764. for (sz_size_t i = 0; i != shorter_length; ++i) shorter_reversed[i] = shorter[shorter_length - 1 - i];
  3765. // Initialize the first two diagonals:
  3766. previous_distances[0] = 0;
  3767. current_distances[0] = current_distances[1] = 1;
  3768. // Using ZMM registers, we can process 32x 16-bit values at once,
  3769. // storing 16 bytes of each string in YMM registers.
  3770. sz_u512_vec_t insertions_vec, deletions_vec, substitutions_vec, next_vec;
  3771. sz_u512_vec_t ones_u16_vec;
  3772. ones_u16_vec.zmm = _mm512_set1_epi16(1);
  3773. // This is a mixed-precision implementation, using 8-bit representations for part of the operations.
3774. // Even there, in case `SZ_USE_X86_AVX2=0`, let's use the `sz_u512_vec_t` type, addressing the first YMM halves.
  3775. sz_u512_vec_t shorter_vec, longer_vec;
  3776. sz_u512_vec_t ones_u8_vec;
  3777. ones_u8_vec.ymms[0] = _mm256_set1_epi8(1);
  3778. // Progress through the upper triangle of the Levenshtein matrix.
  3779. sz_size_t next_skew_diagonal_index = 2;
  3780. for (; next_skew_diagonal_index != n; ++next_skew_diagonal_index) {
  3781. sz_size_t const next_skew_diagonal_length = next_skew_diagonal_index + 1;
  3782. for (sz_size_t i = 0; i + 2 < next_skew_diagonal_length;) {
  3783. sz_size_t remaining_length = next_skew_diagonal_length - i - 2;
  3784. sz_size_t register_length = remaining_length < 32 ? remaining_length : 32;
  3785. sz_u32_t remaining_length_mask = _bzhi_u32(0xFFFFFFFFu, register_length);
  3786. longer_vec.ymms[0] = _mm256_maskz_loadu_epi8(remaining_length_mask, longer + i);
  3787. // Our original code addressed the shorter string `[next_skew_diagonal_index - i - 2]` for growing `i`.
  3788. // If the `shorter` string was reversed, the `[next_skew_diagonal_index - i - 2]` would
  3789. // be equal to `[shorter_length - 1 - next_skew_diagonal_index + i + 2]`.
  3790. // Which simplified would be equal to `[shorter_length - next_skew_diagonal_index + i + 1]`.
  3791. shorter_vec.ymms[0] = _mm256_maskz_loadu_epi8(
  3792. remaining_length_mask, shorter_reversed + shorter_length - next_skew_diagonal_index + i + 1);
  3793. // For substitutions, perform the equality comparison using AVX2 instead of AVX-512
  3794. // to get the result as a vector, instead of a bitmask. Adding 1 to every scalar we can overflow
  3795. // transforming from {0xFF, 0} values to {0, 1} values - exactly what we need. Then - upcast to 16-bit.
  3796. substitutions_vec.zmm = _mm512_cvtepi8_epi16( //
  3797. _mm256_add_epi8(_mm256_cmpeq_epi8(longer_vec.ymms[0], shorter_vec.ymms[0]), ones_u8_vec.ymms[0]));
  3798. substitutions_vec.zmm = _mm512_add_epi16( //
  3799. substitutions_vec.zmm, _mm512_maskz_loadu_epi16(remaining_length_mask, previous_distances + i));
  3800. // For insertions and deletions, on modern hardware, it's faster to issue two separate loads,
  3801. // than rotate the bytes in the ZMM register.
  3802. insertions_vec.zmm = _mm512_maskz_loadu_epi16(remaining_length_mask, current_distances + i);
  3803. deletions_vec.zmm = _mm512_maskz_loadu_epi16(remaining_length_mask, current_distances + i + 1);
  3804. // First get the minimum of insertions and deletions.
  3805. next_vec.zmm = _mm512_add_epi16(_mm512_min_epu16(insertions_vec.zmm, deletions_vec.zmm), ones_u16_vec.zmm);
  3806. next_vec.zmm = _mm512_min_epu16(next_vec.zmm, substitutions_vec.zmm);
  3807. _mm512_mask_storeu_epi16(next_distances + i + 1, remaining_length_mask, next_vec.zmm);
  3808. i += register_length;
  3809. }
  3810. // Don't forget to populate the first row and the fiest column of the Levenshtein matrix.
  3811. next_distances[0] = next_distances[next_skew_diagonal_length - 1] = next_skew_diagonal_index;
  3812. // Perform a circular rotarion of those buffers, to reuse the memory.
  3813. sz_u16_t *temporary = previous_distances;
  3814. previous_distances = current_distances;
  3815. current_distances = next_distances;
  3816. next_distances = temporary;
  3817. }
    // By now we've scanned through the upper triangle of the matrix, where each subsequent iteration results in a
    // larger diagonal. From now onwards, we will be shrinking. Instead of adding a value equal to the skewed diagonal
    // index on either side, we will be cropping those values out.
    sz_size_t total_diagonals = n + n - 1;
    for (; next_skew_diagonal_index != total_diagonals; ++next_skew_diagonal_index) {
        sz_size_t const next_skew_diagonal_length = total_diagonals - next_skew_diagonal_index;
        for (sz_size_t i = 0; i != next_skew_diagonal_length;) {
            sz_size_t remaining_length = next_skew_diagonal_length - i;
            sz_size_t register_length = remaining_length < 32 ? remaining_length : 32;
            sz_u32_t remaining_length_mask = _bzhi_u32(0xFFFFFFFFu, register_length);
            longer_vec.ymms[0] =
                _mm256_maskz_loadu_epi8(remaining_length_mask, longer + next_skew_diagonal_index - n + i);
            // Our original code addressed the shorter string `[shorter_length - 1 - i]` for growing `i`.
            // If the `shorter` string was reversed, the `[shorter_length - 1 - i]` would
            // be equal to `[shorter_length - 1 - shorter_length + 1 + i]`.
            // Which, simplified, would be equal to just `[i]`. Beautiful!
            shorter_vec.ymms[0] = _mm256_maskz_loadu_epi8(remaining_length_mask, shorter_reversed + i);
            // For substitutions, perform the equality comparison using AVX2 instead of AVX-512
            // to get the result as a vector, instead of a bitmask. Then combine it with the accumulated
            // substitution costs.
            substitutions_vec.zmm = _mm512_cvtepi8_epi16( //
                _mm256_add_epi8(_mm256_cmpeq_epi8(longer_vec.ymms[0], shorter_vec.ymms[0]), ones_u8_vec.ymms[0]));
            substitutions_vec.zmm = _mm512_add_epi16( //
                substitutions_vec.zmm, _mm512_maskz_loadu_epi16(remaining_length_mask, previous_distances + i));
            // For insertions and deletions, on modern hardware, it's faster to issue two separate loads
            // than to rotate the bytes in the ZMM register.
            insertions_vec.zmm = _mm512_maskz_loadu_epi16(remaining_length_mask, current_distances + i);
            deletions_vec.zmm = _mm512_maskz_loadu_epi16(remaining_length_mask, current_distances + i + 1);
            // First get the minimum of insertions and deletions.
            next_vec.zmm = _mm512_add_epi16(_mm512_min_epu16(insertions_vec.zmm, deletions_vec.zmm), ones_u16_vec.zmm);
            next_vec.zmm = _mm512_min_epu16(next_vec.zmm, substitutions_vec.zmm);
            _mm512_mask_storeu_epi16(next_distances + i, remaining_length_mask, next_vec.zmm);
            i += register_length;
        }
        // Perform a circular rotation of those buffers, to reuse the memory, this time, with a shift,
        // dropping the first element in the current array.
        sz_u16_t *temporary = previous_distances;
        previous_distances = current_distances + 1;
        current_distances = next_distances;
        next_distances = temporary;
    }

    // Cache the scalar before the `free` call.
    sz_size_t result = current_distances[0];
    alloc->free(distances, buffer_length, alloc->handle);
    return result;
}
SZ_INTERNAL sz_size_t sz_edit_distance_avx512(     //
    sz_cptr_t shorter, sz_size_t shorter_length,   //
    sz_cptr_t longer, sz_size_t longer_length,     //
    sz_size_t bound, sz_memory_allocator_t *alloc) {

    if (shorter_length == longer_length && !bound && shorter_length && shorter_length < 256u * 256u)
        return _sz_edit_distance_skewed_diagonals_upto65k_avx512(shorter, shorter_length, longer, longer_length, bound,
                                                                 alloc);
    else
        return sz_edit_distance_serial(shorter, shorter_length, longer, longer_length, bound, alloc);
}
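// A hypothetical usage sketch (not part of the header): computing the unbounded distance between
// two equal-length strings with the default allocator. On allocation failure the AVX-512 path
// above returns `SZ_SIZE_MAX`.
//
//     sz_size_t distance = sz_edit_distance_avx512("kitten", 6, "sitten", 6, 0, NULL);
//     // distance == 1, as a single substitution turns "kitten" into "sitten"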
#pragma clang attribute pop
#pragma GCC pop_options

#pragma GCC push_options
#pragma GCC target("avx", "avx512f", "avx512vl", "avx512bw", "avx512dq", "bmi", "bmi2")
#pragma clang attribute push(__attribute__((target("avx,avx512f,avx512vl,avx512bw,avx512dq,bmi,bmi2"))), \
                             apply_to = function)

SZ_PUBLIC void sz_hashes_avx512(sz_cptr_t start, sz_size_t length, sz_size_t window_length, sz_size_t step, //
                                sz_hash_callback_t callback, void *callback_handle) {

    if (length < window_length || !window_length) return;
    if (length < 4 * window_length) {
        sz_hashes_serial(start, length, window_length, step, callback, callback_handle);
        return;
    }

    // Using AVX-512, we can perform 8 long integer multiplications and additions within one ZMM register -
    // enough for 4 overlapping windows, each hashed with 2 different bases.
    // So let's slice the entire string into 4 overlapping windows, to slide over them in parallel.
    sz_size_t const max_hashes = length - window_length + 1;
    sz_size_t const min_hashes_per_thread = max_hashes / 4; // At most one sequence can overlap between 2 threads.
    sz_u8_t const *text_first = (sz_u8_t const *)start;
    sz_u8_t const *text_second = text_first + min_hashes_per_thread;
    sz_u8_t const *text_third = text_first + min_hashes_per_thread * 2;
    sz_u8_t const *text_fourth = text_first + min_hashes_per_thread * 3;
    sz_u8_t const *text_end = text_first + length;

    // Broadcast the global constants into the registers.
    // Both high and low hashes will work with the same prime and golden ratio.
    sz_u512_vec_t prime_vec, golden_ratio_vec;
    prime_vec.zmm = _mm512_set1_epi64(SZ_U64_MAX_PRIME);
    golden_ratio_vec.zmm = _mm512_set1_epi64(11400714819323198485ull);

    // Prepare the `base ^ (window_length - 1) % prime` values, that we are going to use
    // to subtract the outgoing characters.
    sz_u64_t prime_power_low = 1, prime_power_high = 1;
    for (sz_size_t i = 0; i + 1 < window_length; ++i)
        prime_power_low = (prime_power_low * 31ull) % SZ_U64_MAX_PRIME,
        prime_power_high = (prime_power_high * 257ull) % SZ_U64_MAX_PRIME;

    // We will be evaluating 4 offsets at a time with 2 different hash functions.
    // We can fit all those 8 state variables in each of the following ZMM registers.
    sz_u512_vec_t base_vec, prime_power_vec, shift_vec;
    base_vec.zmm = _mm512_set_epi64(31ull, 31ull, 31ull, 31ull, 257ull, 257ull, 257ull, 257ull);
    shift_vec.zmm = _mm512_set_epi64(0ull, 0ull, 0ull, 0ull, 77ull, 77ull, 77ull, 77ull);
    prime_power_vec.zmm = _mm512_set_epi64(prime_power_low, prime_power_low, prime_power_low, prime_power_low,
                                           prime_power_high, prime_power_high, prime_power_high, prime_power_high);

    // Compute the initial hash values for every one of the four windows.
    sz_u512_vec_t hash_vec, chars_vec;
    hash_vec.zmm = _mm512_setzero_si512();
    for (sz_u8_t const *prefix_end = text_first + window_length; text_first < prefix_end;
         ++text_first, ++text_second, ++text_third, ++text_fourth) {

        // 1. Multiply the hashes by the base.
        hash_vec.zmm = _mm512_mullo_epi64(hash_vec.zmm, base_vec.zmm);

        // 2. Load the four characters from `text_first`, `text_first + min_hashes_per_thread`,
        //    `text_first + min_hashes_per_thread * 2`, `text_first + min_hashes_per_thread * 3`...
        chars_vec.zmm = _mm512_set_epi64(text_fourth[0], text_third[0], text_second[0], text_first[0], //
                                         text_fourth[0], text_third[0], text_second[0], text_first[0]);
        chars_vec.zmm = _mm512_add_epi8(chars_vec.zmm, shift_vec.zmm);

        // 3. Add the incoming characters.
        hash_vec.zmm = _mm512_add_epi64(hash_vec.zmm, chars_vec.zmm);

        // 4. Compute the modulo. Assuming there are only 59 values between our prime
        //    and the 2^64 value, we can simply compute the modulo by conditionally subtracting the prime.
        hash_vec.zmm = _mm512_mask_blend_epi8(_mm512_cmpgt_epi64_mask(hash_vec.zmm, prime_vec.zmm), hash_vec.zmm,
                                              _mm512_sub_epi64(hash_vec.zmm, prime_vec.zmm));
    }
    // 5. Compute the hash mix, that will be used to index into the fingerprint.
    //    This includes a serial step at the end.
    sz_u512_vec_t hash_mix_vec;
    hash_mix_vec.zmm = _mm512_mullo_epi64(hash_vec.zmm, golden_ratio_vec.zmm);
    hash_mix_vec.ymms[0] = _mm256_xor_si256(_mm512_extracti64x4_epi64(hash_mix_vec.zmm, 1), //
                                            _mm512_extracti64x4_epi64(hash_mix_vec.zmm, 0));

    callback((sz_cptr_t)text_first, window_length, hash_mix_vec.u64s[0], callback_handle);
    callback((sz_cptr_t)text_second, window_length, hash_mix_vec.u64s[1], callback_handle);
    callback((sz_cptr_t)text_third, window_length, hash_mix_vec.u64s[2], callback_handle);
    callback((sz_cptr_t)text_fourth, window_length, hash_mix_vec.u64s[3], callback_handle);

    // Now repeat that operation for the remaining characters, discarding the older characters.
    // Note: the masking below assumes `step` is a power of two.
    sz_size_t cycle = 1;
    sz_size_t step_mask = step - 1;
    for (; text_fourth != text_end; ++text_first, ++text_second, ++text_third, ++text_fourth, ++cycle) {

        // 0. Load again the four characters we are dropping, shift them, and subtract.
        chars_vec.zmm = _mm512_set_epi64(text_fourth[-window_length], text_third[-window_length],
                                         text_second[-window_length], text_first[-window_length], //
                                         text_fourth[-window_length], text_third[-window_length],
                                         text_second[-window_length], text_first[-window_length]);
        chars_vec.zmm = _mm512_add_epi8(chars_vec.zmm, shift_vec.zmm);
        hash_vec.zmm = _mm512_sub_epi64(hash_vec.zmm, _mm512_mullo_epi64(chars_vec.zmm, prime_power_vec.zmm));

        // 1. Multiply the hashes by the base.
        hash_vec.zmm = _mm512_mullo_epi64(hash_vec.zmm, base_vec.zmm);

        // 2. Load the four characters from `text_first`, `text_first + min_hashes_per_thread`,
        //    `text_first + min_hashes_per_thread * 2`, `text_first + min_hashes_per_thread * 3`.
        chars_vec.zmm = _mm512_set_epi64(text_fourth[0], text_third[0], text_second[0], text_first[0], //
                                         text_fourth[0], text_third[0], text_second[0], text_first[0]);
        chars_vec.zmm = _mm512_add_epi8(chars_vec.zmm, shift_vec.zmm);

        // ... and prefetch the next four characters into Level 2 or higher.
        _mm_prefetch((sz_cptr_t)text_fourth + 1, _MM_HINT_T1);
        _mm_prefetch((sz_cptr_t)text_third + 1, _MM_HINT_T1);
        _mm_prefetch((sz_cptr_t)text_second + 1, _MM_HINT_T1);
        _mm_prefetch((sz_cptr_t)text_first + 1, _MM_HINT_T1);

        // 3. Add the incoming characters.
        hash_vec.zmm = _mm512_add_epi64(hash_vec.zmm, chars_vec.zmm);

        // 4. Compute the modulo. Assuming there are only 59 values between our prime
        //    and the 2^64 value, we can simply compute the modulo by conditionally subtracting the prime.
        hash_vec.zmm = _mm512_mask_blend_epi8(_mm512_cmpgt_epi64_mask(hash_vec.zmm, prime_vec.zmm), hash_vec.zmm,
                                              _mm512_sub_epi64(hash_vec.zmm, prime_vec.zmm));

        // 5. Compute the hash mix, that will be used to index into the fingerprint.
        //    This includes a serial step at the end.
        hash_mix_vec.zmm = _mm512_mullo_epi64(hash_vec.zmm, golden_ratio_vec.zmm);
        hash_mix_vec.ymms[0] = _mm256_xor_si256(_mm512_extracti64x4_epi64(hash_mix_vec.zmm, 1), //
                                                _mm512_castsi512_si256(hash_mix_vec.zmm));

        if ((cycle & step_mask) == 0) {
            callback((sz_cptr_t)text_first, window_length, hash_mix_vec.u64s[0], callback_handle);
            callback((sz_cptr_t)text_second, window_length, hash_mix_vec.u64s[1], callback_handle);
            callback((sz_cptr_t)text_third, window_length, hash_mix_vec.u64s[2], callback_handle);
            callback((sz_cptr_t)text_fourth, window_length, hash_mix_vec.u64s[3], callback_handle);
        }
    }
}
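// For reference, a simplified scalar sketch of the rolling-hash recurrence that every one of the
// eight 64-bit lanes above maintains (illustrative only; `base` is 31 or 257, `power` is
// `base^(window_length - 1) % prime`, and the characters are pre-shifted for the high hash):
//
//     if (i >= window_length) hash -= text[i - window_length] * power; // retire the outgoing character
//     hash = hash * base + text[i];                                    // admit the incoming character
//     if (hash > prime) hash -= prime;                                 // cheap conditional "modulo"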
#pragma clang attribute pop
#pragma GCC pop_options

#pragma GCC push_options
#pragma GCC target("avx", "avx512f", "avx512vl", "avx512bw", "avx512vbmi", "bmi", "bmi2", "gfni")
#pragma clang attribute push(__attribute__((target("avx,avx512f,avx512vl,avx512bw,avx512vbmi,bmi,bmi2,gfni"))), \
                             apply_to = function)

SZ_PUBLIC sz_cptr_t sz_find_charset_avx512(sz_cptr_t text, sz_size_t length, sz_charset_t const *filter) {

    sz_size_t load_length;
    __mmask32 load_mask, matches_mask;
    // To store the set in the register we need just 256 bits, but the `VPERMB` instruction
    // we are going to invoke is surprisingly cheaper on ZMM registers.
    sz_u512_vec_t text_vec, filter_vec;
    filter_vec.ymms[0] = _mm256_loadu_epi64(&filter->_u64s[0]);

    // We are going to view the `filter` at 8-bit word granularity.
    sz_u512_vec_t filter_slice_offsets_vec;
    sz_u512_vec_t filter_slice_vec;
    sz_u512_vec_t offset_within_slice_vec;
    sz_u512_vec_t mask_in_filter_slice_vec;
    sz_u512_vec_t matches_vec;

    while (length) {
        // For every byte:
        // 1. Find the corresponding word in the set.
        // 2. Produce a bitmask to check against that word.
        load_length = sz_min_of_two(length, 32);
        load_mask = _sz_u64_mask_until(load_length);
        text_vec.ymms[0] = _mm256_maskz_loadu_epi8(load_mask, text);

        // To shift right every byte by 3 bits we can use the GF2 affine transformations.
        // https://wunkolo.github.io/post/2020/11/gf2p8affineqb-int8-shifting/
        // After the next line, all 8-bit offsets in the `filter_slice_offsets_vec` should be under 32.
        filter_slice_offsets_vec.ymms[0] =
            _mm256_gf2p8affine_epi64_epi8(text_vec.ymms[0], _mm256_set1_epi64x(0x0102040810204080ull << (3 * 8)), 0);

        // After the next line, `filter_slice_vec` will contain the right word from the set,
        // needed to check the presence of the byte in the set.
        filter_slice_vec.ymms[0] = _mm256_permutexvar_epi8(filter_slice_offsets_vec.ymms[0], filter_vec.ymms[0]);

        // After the next line, all 8-bit offsets in the `offset_within_slice_vec` should be under 8.
        offset_within_slice_vec.ymms[0] = _mm256_and_si256(text_vec.ymms[0], _mm256_set1_epi64x(0x0707070707070707ull));

        // Instead of performing one more Galois Field operation, we can upcast to 16-bit integers,
        // and perform the shift and intersection there.
        filter_slice_vec.zmm = _mm512_cvtepi8_epi16(filter_slice_vec.ymms[0]);
        offset_within_slice_vec.zmm = _mm512_cvtepi8_epi16(offset_within_slice_vec.ymms[0]);
        mask_in_filter_slice_vec.zmm = _mm512_sllv_epi16(_mm512_set1_epi16(1), offset_within_slice_vec.zmm);
        matches_vec.zmm = _mm512_and_si512(filter_slice_vec.zmm, mask_in_filter_slice_vec.zmm);

        matches_mask = _mm512_mask_cmpneq_epi16_mask(load_mask, matches_vec.zmm, _mm512_setzero_si512());
        if (matches_mask) {
            int offset = sz_u32_ctz(matches_mask);
            return text + offset;
        }
        else { text += load_length, length -= load_length; }
    }

    return SZ_NULL_CHAR;
}
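// For reference, the scalar membership test that each of the 32 lanes above evaluates,
// matching the bitset layout of `sz_charset_t` (illustrative only):
//
//     sz_u8_t c = (sz_u8_t)text[i];
//     sz_bool_t is_match = (sz_bool_t)((filter->_u8s[c >> 3] & (1u << (c & 7u))) != 0);
//
// The GF2P8AFFINEQB trick computes `c >> 3` for all bytes at once, VPERMB fetches the
// corresponding 8-bit slices of the set, and the 16-bit variable shift builds `1 << (c & 7)`.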
SZ_PUBLIC sz_cptr_t sz_rfind_charset_avx512(sz_cptr_t text, sz_size_t length, sz_charset_t const *filter) {

    sz_size_t load_length;
    __mmask32 load_mask, matches_mask;
    // To store the set in the register we need just 256 bits, but the `VPERMB` instruction
    // we are going to invoke is surprisingly cheaper on ZMM registers.
    sz_u512_vec_t text_vec, filter_vec;
    filter_vec.ymms[0] = _mm256_loadu_epi64(&filter->_u64s[0]);

    // We are going to view the `filter` at 8-bit word granularity.
    sz_u512_vec_t filter_slice_offsets_vec;
    sz_u512_vec_t filter_slice_vec;
    sz_u512_vec_t offset_within_slice_vec;
    sz_u512_vec_t mask_in_filter_slice_vec;
    sz_u512_vec_t matches_vec;

    while (length) {
        // For every byte:
        // 1. Find the corresponding word in the set.
        // 2. Produce a bitmask to check against that word.
        load_length = sz_min_of_two(length, 32);
        load_mask = _sz_u64_mask_until(load_length);
        text_vec.ymms[0] = _mm256_maskz_loadu_epi8(load_mask, text + length - load_length);

        // To shift right every byte by 3 bits we can use the GF2 affine transformations.
        // https://wunkolo.github.io/post/2020/11/gf2p8affineqb-int8-shifting/
        // After the next line, all 8-bit offsets in the `filter_slice_offsets_vec` should be under 32.
        filter_slice_offsets_vec.ymms[0] =
            _mm256_gf2p8affine_epi64_epi8(text_vec.ymms[0], _mm256_set1_epi64x(0x0102040810204080ull << (3 * 8)), 0);

        // After the next line, `filter_slice_vec` will contain the right word from the set,
        // needed to check the presence of the byte in the set.
        filter_slice_vec.ymms[0] = _mm256_permutexvar_epi8(filter_slice_offsets_vec.ymms[0], filter_vec.ymms[0]);

        // After the next line, all 8-bit offsets in the `offset_within_slice_vec` should be under 8.
        offset_within_slice_vec.ymms[0] = _mm256_and_si256(text_vec.ymms[0], _mm256_set1_epi64x(0x0707070707070707ull));

        // Instead of performing one more Galois Field operation, we can upcast to 16-bit integers,
        // and perform the shift and intersection there.
        filter_slice_vec.zmm = _mm512_cvtepi8_epi16(filter_slice_vec.ymms[0]);
        offset_within_slice_vec.zmm = _mm512_cvtepi8_epi16(offset_within_slice_vec.ymms[0]);
        mask_in_filter_slice_vec.zmm = _mm512_sllv_epi16(_mm512_set1_epi16(1), offset_within_slice_vec.zmm);
        matches_vec.zmm = _mm512_and_si512(filter_slice_vec.zmm, mask_in_filter_slice_vec.zmm);

        matches_mask = _mm512_mask_cmpneq_epi16_mask(load_mask, matches_vec.zmm, _mm512_setzero_si512());
        if (matches_mask) {
            int offset = sz_u32_clz(matches_mask);
            return text + length - load_length + 32 - offset - 1;
        }
        else { length -= load_length; }
    }

    return SZ_NULL_CHAR;
}
/**
 *  Computes the Needleman-Wunsch alignment score between two strings.
 *  The method uses 32-bit integers to accumulate the running score for every cell in the matrix.
 *  Assuming the costs of substitutions can be arbitrary signed 8-bit integers, the method is expected to be used
 *  on strings not exceeding 2^24 length, or 16.7 million characters.
 *
 *  Unlike the `_sz_edit_distance_skewed_diagonals_upto65k_avx512` method, this one uses signed integers to store
 *  the accumulated score. Moreover, its primary bottleneck is the latency of gathering the substitution costs
 *  from the substitution matrix. If we use the diagonal order, we will be comparing a slice of the first string with
 *  a slice of the second. If we stick to the conventional horizontal order, we will be comparing one character against
 *  a slice, which is much easier to optimize. In that case we are sampling costs not from arbitrary parts of
 *  a 256 x 256 matrix, but from a single row!
 */
SZ_INTERNAL sz_ssize_t _sz_alignment_score_wagner_fisher_upto17m_avx512( //
    sz_cptr_t shorter, sz_size_t shorter_length,                         //
    sz_cptr_t longer, sz_size_t longer_length,                           //
    sz_error_cost_t const *subs, sz_error_cost_t gap, sz_memory_allocator_t *alloc) {

    // If one of the strings is empty - the edit distance is equal to the length of the other one.
    if (longer_length == 0) return (sz_ssize_t)shorter_length * gap;
    if (shorter_length == 0) return (sz_ssize_t)longer_length * gap;

    // Let's make sure that we use the amount of memory proportional to the
    // number of elements in the shorter string, not the longer.
    if (shorter_length > longer_length) {
        sz_pointer_swap((void **)&longer_length, (void **)&shorter_length);
        sz_pointer_swap((void **)&longer, (void **)&shorter);
    }

    // Simplify usage in higher-level libraries, where wrapping custom allocators may be troublesome.
    sz_memory_allocator_t global_alloc;
    if (!alloc) {
        sz_memory_allocator_init_default(&global_alloc);
        alloc = &global_alloc;
    }

    sz_size_t const max_length = 256ull * 256ull * 256ull;
    sz_size_t const n = longer_length + 1;
    sz_assert(n < max_length && "The length must fit into a 24-bit integer. Otherwise use the serial variant.");
    sz_unused(longer_length && max_length);

    sz_size_t buffer_length = sizeof(sz_i32_t) * n * 2;
    sz_i32_t *distances = (sz_i32_t *)alloc->allocate(buffer_length, alloc->handle);
    if (!distances) return SZ_SSIZE_MAX; // Guard against allocation failures.
    sz_i32_t *previous_distances = distances;
    sz_i32_t *current_distances = previous_distances + n;

    // Initialize the first row of the Levenshtein matrix with `iota`.
    for (sz_size_t idx_longer = 0; idx_longer != n; ++idx_longer)
        previous_distances[idx_longer] = (sz_ssize_t)idx_longer * gap;

    /// Contains up to 64 consecutive characters from the longer string.
    sz_u512_vec_t longer_vec;
    sz_u512_vec_t cost_deletion_vec, cost_substitution_vec, lookup_substitution_vec, current_vec;
    sz_u512_vec_t row_first_subs_vec, row_second_subs_vec, row_third_subs_vec, row_fourth_subs_vec;
    sz_u512_vec_t shuffled_first_subs_vec, shuffled_second_subs_vec, shuffled_third_subs_vec, shuffled_fourth_subs_vec;

    // Prepare constants and masks. The top bit (0x80) of a byte tells us whether it falls into the third
    // or fourth 64-byte quarter of the 256-byte substitution row; the next bit (0x40) - whether it's in
    // the second or fourth quarter.
    sz_u512_vec_t is_third_or_fourth_vec, is_second_or_fourth_vec, gap_vec;
    {
        char is_third_or_fourth_check, is_second_or_fourth_check;
        *(sz_u8_t *)&is_third_or_fourth_check = 0x80, *(sz_u8_t *)&is_second_or_fourth_check = 0x40;
        is_third_or_fourth_vec.zmm = _mm512_set1_epi8(is_third_or_fourth_check);
        is_second_or_fourth_vec.zmm = _mm512_set1_epi8(is_second_or_fourth_check);
        gap_vec.zmm = _mm512_set1_epi32(gap);
    }
    sz_u8_t const *shorter_unsigned = (sz_u8_t const *)shorter;
    for (sz_size_t idx_shorter = 0; idx_shorter != shorter_length; ++idx_shorter) {
        sz_i32_t last_in_row = current_distances[0] = (sz_ssize_t)(idx_shorter + 1) * gap;

        // Load one row of the substitution matrix into four ZMM registers.
        sz_error_cost_t const *row_subs = subs + shorter_unsigned[idx_shorter] * 256u;
        row_first_subs_vec.zmm = _mm512_loadu_epi8(row_subs + 64 * 0);
        row_second_subs_vec.zmm = _mm512_loadu_epi8(row_subs + 64 * 1);
        row_third_subs_vec.zmm = _mm512_loadu_epi8(row_subs + 64 * 2);
        row_fourth_subs_vec.zmm = _mm512_loadu_epi8(row_subs + 64 * 3);

        // In the serial version we have one forward pass, that computes the deletion,
        // insertion, and substitution costs at once.
        //
        //     for (sz_size_t idx_longer = 0; idx_longer < longer_length; ++idx_longer) {
        //         sz_ssize_t cost_deletion = previous_distances[idx_longer + 1] + gap;
        //         sz_ssize_t cost_insertion = current_distances[idx_longer] + gap;
        //         sz_ssize_t cost_substitution = previous_distances[idx_longer] + row_subs[longer_unsigned[idx_longer]];
        //         current_distances[idx_longer + 1] = sz_min_of_three(cost_deletion, cost_insertion, cost_substitution);
        //     }
        //
        // Given the complexity of handling the data-dependency between consecutive insertion cost computations
        // within a Levenshtein matrix, the simplest design would be to vectorize every kind of cost computation
        // separately:
        //     1. Compute substitution costs for up to 64 characters at once, upcasting from 8-bit integers to 32.
        //     2. Compute the pairwise minimum with deletion costs.
        //     3. Inclusive prefix minimum computation to combine with insertion costs.
        // Proceeding with substitutions:
        for (sz_size_t idx_longer = 0; idx_longer < longer_length; idx_longer += 64) {
            sz_size_t register_length = sz_min_of_two(longer_length - idx_longer, 64);
            __mmask64 mask = _sz_u64_mask_until(register_length);
            longer_vec.zmm = _mm512_maskz_loadu_epi8(mask, longer + idx_longer);

            // Blend the `row_(first|second|third|fourth)_subs_vec` into `current_vec`, picking the right source
            // for every character in `longer_vec`. Before that, we need to permute the substitution vectors.
            // Only the bottom 6 bits of a byte are used in VPERMB, so we don't even need to mask.
            shuffled_first_subs_vec.zmm = _mm512_maskz_permutexvar_epi8(mask, longer_vec.zmm, row_first_subs_vec.zmm);
            shuffled_second_subs_vec.zmm = _mm512_maskz_permutexvar_epi8(mask, longer_vec.zmm, row_second_subs_vec.zmm);
            shuffled_third_subs_vec.zmm = _mm512_maskz_permutexvar_epi8(mask, longer_vec.zmm, row_third_subs_vec.zmm);
            shuffled_fourth_subs_vec.zmm = _mm512_maskz_permutexvar_epi8(mask, longer_vec.zmm, row_fourth_subs_vec.zmm);

            // To blend we can invoke three `_mm512_cmplt_epu8_mask`, but we can also achieve the same using
            // the AND logical operation, checking the top two bits of every byte.
            // Continuing this thought, we can use the VPTESTMB instruction to output the mask after the AND.
            __mmask64 is_third_or_fourth = _mm512_mask_test_epi8_mask(mask, longer_vec.zmm, is_third_or_fourth_vec.zmm);
            __mmask64 is_second_or_fourth =
                _mm512_mask_test_epi8_mask(mask, longer_vec.zmm, is_second_or_fourth_vec.zmm);
            lookup_substitution_vec.zmm = _mm512_mask_blend_epi8(
                is_third_or_fourth,
                // Choose between the first and the second.
                _mm512_mask_blend_epi8(is_second_or_fourth, shuffled_first_subs_vec.zmm, shuffled_second_subs_vec.zmm),
                // Choose between the third and the fourth.
                _mm512_mask_blend_epi8(is_second_or_fourth, shuffled_third_subs_vec.zmm, shuffled_fourth_subs_vec.zmm));

            // First, sign-extend the lower and upper 32 bytes to 16-bit integers.
            __m512i current_0_31_vec = _mm512_cvtepi8_epi16(_mm512_extracti64x4_epi64(lookup_substitution_vec.zmm, 0));
            __m512i current_32_63_vec = _mm512_cvtepi8_epi16(_mm512_extracti64x4_epi64(lookup_substitution_vec.zmm, 1));

            // Now extend those 16-bit integers to 32-bit.
            // This isn't free, same as the subsequent store, so we only want to do that for the populated lanes.
            // To minimize the number of loads and stores, we can combine our substitution costs with the previous
            // distances, containing the deletion costs.
            {
                cost_substitution_vec.zmm = _mm512_maskz_loadu_epi32(mask, previous_distances + idx_longer);
                cost_substitution_vec.zmm = _mm512_add_epi32(
                    cost_substitution_vec.zmm, _mm512_cvtepi16_epi32(_mm512_extracti64x4_epi64(current_0_31_vec, 0)));
                cost_deletion_vec.zmm = _mm512_maskz_loadu_epi32(mask, previous_distances + 1 + idx_longer);
                cost_deletion_vec.zmm = _mm512_add_epi32(cost_deletion_vec.zmm, gap_vec.zmm);
                current_vec.zmm = _mm512_max_epi32(cost_substitution_vec.zmm, cost_deletion_vec.zmm);

                // Inclusive prefix minimum computation to combine with insertion costs.
                // Simply disabling this operation results in a 5x performance improvement, meaning
                // that this operation is responsible for 80% of the total runtime.
                //
                //     for (sz_size_t idx_longer = 0; idx_longer < longer_length; ++idx_longer) {
                //         current_distances[idx_longer + 1] =
                //             sz_max_of_two(current_distances[idx_longer] + gap, current_distances[idx_longer + 1]);
                //     }
                //
                // To perform the same operation in vectorized form, we need to perform a tree-like reduction,
                // that will involve multiple steps. It's quite expensive and should be first tested in the
                // "experimental" section.
                //
                // Another approach might be loop unrolling:
                //
                //     current_vec.i32s[0] = last_in_row = sz_i32_max_of_two(current_vec.i32s[0], last_in_row + gap);
                //     current_vec.i32s[1] = last_in_row = sz_i32_max_of_two(current_vec.i32s[1], last_in_row + gap);
                //     current_vec.i32s[2] = last_in_row = sz_i32_max_of_two(current_vec.i32s[2], last_in_row + gap);
                //
                // ... yet this approach is also quite expensive.
                for (int i = 0; i != 16; ++i)
                    current_vec.i32s[i] = last_in_row = sz_max_of_two(current_vec.i32s[i], last_in_row + gap);
                _mm512_mask_storeu_epi32(current_distances + idx_longer + 1, mask, current_vec.zmm);
            }
            // Export the values from 16 to 31.
            if (register_length > 16) {
                mask = _kshiftri_mask64(mask, 16);
                cost_substitution_vec.zmm = _mm512_maskz_loadu_epi32(mask, previous_distances + idx_longer + 16);
                cost_substitution_vec.zmm = _mm512_add_epi32(
                    cost_substitution_vec.zmm, _mm512_cvtepi16_epi32(_mm512_extracti64x4_epi64(current_0_31_vec, 1)));
                cost_deletion_vec.zmm = _mm512_maskz_loadu_epi32(mask, previous_distances + 1 + idx_longer + 16);
                cost_deletion_vec.zmm = _mm512_add_epi32(cost_deletion_vec.zmm, gap_vec.zmm);
                current_vec.zmm = _mm512_max_epi32(cost_substitution_vec.zmm, cost_deletion_vec.zmm);

                // Aggregate running insertion costs within the register.
                for (int i = 0; i != 16; ++i)
                    current_vec.i32s[i] = last_in_row = sz_max_of_two(current_vec.i32s[i], last_in_row + gap);
                _mm512_mask_storeu_epi32(current_distances + idx_longer + 1 + 16, mask, current_vec.zmm);
            }

            // Export the values from 32 to 47.
            if (register_length > 32) {
                mask = _kshiftri_mask64(mask, 16);
                cost_substitution_vec.zmm = _mm512_maskz_loadu_epi32(mask, previous_distances + idx_longer + 32);
                cost_substitution_vec.zmm = _mm512_add_epi32(
                    cost_substitution_vec.zmm, _mm512_cvtepi16_epi32(_mm512_extracti64x4_epi64(current_32_63_vec, 0)));
                cost_deletion_vec.zmm = _mm512_maskz_loadu_epi32(mask, previous_distances + 1 + idx_longer + 32);
                cost_deletion_vec.zmm = _mm512_add_epi32(cost_deletion_vec.zmm, gap_vec.zmm);
                current_vec.zmm = _mm512_max_epi32(cost_substitution_vec.zmm, cost_deletion_vec.zmm);

                // Aggregate running insertion costs within the register.
                for (int i = 0; i != 16; ++i)
                    current_vec.i32s[i] = last_in_row = sz_max_of_two(current_vec.i32s[i], last_in_row + gap);
                _mm512_mask_storeu_epi32(current_distances + idx_longer + 1 + 32, mask, current_vec.zmm);
            }

            // Export the values from 48 to 63.
            if (register_length > 48) {
                mask = _kshiftri_mask64(mask, 16);
                cost_substitution_vec.zmm = _mm512_maskz_loadu_epi32(mask, previous_distances + idx_longer + 48);
                cost_substitution_vec.zmm = _mm512_add_epi32(
                    cost_substitution_vec.zmm, _mm512_cvtepi16_epi32(_mm512_extracti64x4_epi64(current_32_63_vec, 1)));
                cost_deletion_vec.zmm = _mm512_maskz_loadu_epi32(mask, previous_distances + 1 + idx_longer + 48);
                cost_deletion_vec.zmm = _mm512_add_epi32(cost_deletion_vec.zmm, gap_vec.zmm);
                current_vec.zmm = _mm512_max_epi32(cost_substitution_vec.zmm, cost_deletion_vec.zmm);

                // Aggregate running insertion costs within the register.
                for (int i = 0; i != 16; ++i)
                    current_vec.i32s[i] = last_in_row = sz_max_of_two(current_vec.i32s[i], last_in_row + gap);
                _mm512_mask_storeu_epi32(current_distances + idx_longer + 1 + 48, mask, current_vec.zmm);
            }
        }

        // Swap the `previous_distances` and `current_distances` pointers.
        sz_pointer_swap((void **)&previous_distances, (void **)&current_distances);
    }

    // Cache the scalar before the `free` call.
    sz_ssize_t result = previous_distances[longer_length];
    alloc->free(distances, buffer_length, alloc->handle);
    return result;
}
SZ_INTERNAL sz_ssize_t sz_alignment_score_avx512( //
    sz_cptr_t shorter, sz_size_t shorter_length,  //
    sz_cptr_t longer, sz_size_t longer_length,    //
    sz_error_cost_t const *subs, sz_error_cost_t gap, sz_memory_allocator_t *alloc) {

    if (sz_max_of_two(shorter_length, longer_length) < (256ull * 256ull * 256ull))
        return _sz_alignment_score_wagner_fisher_upto17m_avx512(shorter, shorter_length, longer, longer_length, subs,
                                                                gap, alloc);
    else
        return sz_alignment_score_serial(shorter, shorter_length, longer, longer_length, subs, gap, alloc);
}
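// A hypothetical usage sketch (not part of the header): with a substitution matrix of zeros on
// the diagonal and -1 elsewhere, and a gap penalty of -1, the score is the negated Levenshtein
// distance. The `costs` table below is a caller-supplied assumption.
//
//     sz_error_cost_t costs[256 * 256];
//     for (int i = 0; i != 256; ++i)
//         for (int j = 0; j != 256; ++j) costs[i * 256 + j] = (sz_error_cost_t)(i == j ? 0 : -1);
//     sz_ssize_t score = sz_alignment_score_avx512("kitten", 6, "sitting", 7, costs, -1, NULL);
//     // score == -3, matching an edit distance of 3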
#pragma clang attribute pop
#pragma GCC pop_options
#endif
#pragma endregion

/*  @brief  Implementation of the string search algorithms using the Arm NEON instruction set, available on 64-bit
 *          Arm processors. Implements: {substring search, character search, character set search} x {forward, reverse}.
 */
#pragma region ARM NEON

#if SZ_USE_ARM_NEON

/**
 *  @brief  Helper structure to simplify work with 128-bit registers.
 */
typedef union sz_u128_vec_t {
    uint8x16_t u8x16;
    uint16x8_t u16x8;
    uint32x4_t u32x4;
    uint64x2_t u64x2;
    sz_u64_t u64s[2];
    sz_u32_t u32s[4];
    sz_u16_t u16s[8];
    sz_u8_t u8s[16];
} sz_u128_vec_t;

SZ_INTERNAL sz_u64_t vreinterpretq_u8_u4(uint8x16_t vec) {
    // Use `vshrn` to produce a bitmask, similar to `movemask` in SSE.
    // https://community.arm.com/arm-community-blogs/b/infrastructure-solutions-blog/posts/porting-x86-vector-bitmask-optimizations-to-arm-neon
    return vget_lane_u64(vreinterpret_u64_u8(vshrn_n_u16(vreinterpretq_u16_u8(vec), 4)), 0) & 0x8888888888888888ull;
}
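// The returned word packs every byte-lane of the 128-bit comparison result into 4 bits,
// so the offset of the first set lane is `sz_u64_ctz(matches) / 4`, and of the last one -
// `15 - sz_u64_clz(matches) / 4`. A minimal sketch of how the functions below consume it:
//
//     sz_u64_t matches = vreinterpretq_u8_u4(vceqq_u8(haystack_vec.u8x16, needle_vec.u8x16));
//     if (matches) return haystack + sz_u64_ctz(matches) / 4;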
SZ_PUBLIC sz_cptr_t sz_find_byte_neon(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n) {
    sz_u64_t matches;
    sz_u128_vec_t h_vec, n_vec, matches_vec;
    n_vec.u8x16 = vld1q_dup_u8((sz_u8_t const *)n);
    while (h_length >= 16) {
        h_vec.u8x16 = vld1q_u8((sz_u8_t const *)h);
        matches_vec.u8x16 = vceqq_u8(h_vec.u8x16, n_vec.u8x16);
        // In Arm NEON we don't have a `movemask` to combine with `ctz` and get the offset of the match.
        // But the `vshrn`-based 4-bit-per-lane bitmask above is a cheap substitute for locating the first match.
        matches = vreinterpretq_u8_u4(matches_vec.u8x16);
        if (matches) return h + sz_u64_ctz(matches) / 4;
        h += 16, h_length -= 16;
    }
    return sz_find_byte_serial(h, h_length, n);
}

SZ_PUBLIC sz_cptr_t sz_rfind_byte_neon(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n) {
    sz_u64_t matches;
    sz_u128_vec_t h_vec, n_vec, matches_vec;
    n_vec.u8x16 = vld1q_dup_u8((sz_u8_t const *)n);
    while (h_length >= 16) {
        h_vec.u8x16 = vld1q_u8((sz_u8_t const *)h + h_length - 16);
        matches_vec.u8x16 = vceqq_u8(h_vec.u8x16, n_vec.u8x16);
        matches = vreinterpretq_u8_u4(matches_vec.u8x16);
        if (matches) return h + h_length - 1 - sz_u64_clz(matches) / 4;
        h_length -= 16;
    }
    return sz_rfind_byte_serial(h, h_length, n);
}
SZ_PUBLIC sz_u64_t _sz_find_charset_neon_register(sz_u128_vec_t h_vec, uint8x16_t set_top_vec_u8x16,
                                                  uint8x16_t set_bottom_vec_u8x16) {

    // Once we've read the characters in the haystack, we want to
    // compare them against our bitset. The serial version of that code
    // would look like: `(set_->_u8s[c >> 3] & (1u << (c & 7u))) != 0`.
    uint8x16_t byte_index_vec = vshrq_n_u8(h_vec.u8x16, 3);
    uint8x16_t byte_mask_vec = vshlq_u8(vdupq_n_u8(1), vreinterpretq_s8_u8(vandq_u8(h_vec.u8x16, vdupq_n_u8(7))));
    uint8x16_t matches_top_vec = vqtbl1q_u8(set_top_vec_u8x16, byte_index_vec);
    // The table lookup instruction in NEON replies to out-of-bound requests with zeros.
    // The values in `byte_index_vec` all fall in [0; 32). So for values under 16, subtracting 16 will underflow
    // and map into the interval [240, 256). Meaning that those will be populated with zeros and we can safely
    // merge `matches_top_vec` and `matches_bottom_vec` with a bitwise OR.
    uint8x16_t matches_bottom_vec = vqtbl1q_u8(set_bottom_vec_u8x16, vsubq_u8(byte_index_vec, vdupq_n_u8(16)));
    uint8x16_t matches_vec = vorrq_u8(matches_top_vec, matches_bottom_vec);
    // Instead of a pure `vandq_u8`, we can immediately broadcast a match presence across each 8-bit word.
    matches_vec = vtstq_u8(matches_vec, byte_mask_vec);
    return vreinterpretq_u8_u4(matches_vec);
}
SZ_PUBLIC sz_cptr_t sz_find_neon(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) {

    // This almost never fires, but it's better to be safe than sorry.
    if (h_length < n_length || !n_length) return SZ_NULL_CHAR;
    if (n_length == 1) return sz_find_byte_neon(h, h_length, n);

    // Scan through the string.
    // Considering how tiny the Arm NEON registers are, we should avoid internal branches at all costs.
    // That's why, for smaller needles, we use different loops.
    if (n_length == 2) {
        // Broadcast the needle characters into SIMD registers.
        sz_u64_t matches;
        sz_u128_vec_t h_first_vec, h_last_vec, n_first_vec, n_last_vec, matches_vec;
        // Dealing with 16-bit values, we can load 2 overlapping registers at a time and compare 16 possible
        // offsets in a single loop iteration.
        n_first_vec.u8x16 = vld1q_dup_u8((sz_u8_t const *)&n[0]);
        n_last_vec.u8x16 = vld1q_dup_u8((sz_u8_t const *)&n[1]);
        for (; h_length >= 17; h += 16, h_length -= 16) {
            h_first_vec.u8x16 = vld1q_u8((sz_u8_t const *)(h + 0));
            h_last_vec.u8x16 = vld1q_u8((sz_u8_t const *)(h + 1));
            matches_vec.u8x16 =
                vandq_u8(vceqq_u8(h_first_vec.u8x16, n_first_vec.u8x16), vceqq_u8(h_last_vec.u8x16, n_last_vec.u8x16));
            matches = vreinterpretq_u8_u4(matches_vec.u8x16);
            if (matches) return h + sz_u64_ctz(matches) / 4;
        }
    }
    else if (n_length == 3) {
        // Broadcast the needle characters into SIMD registers.
        sz_u64_t matches;
        sz_u128_vec_t h_first_vec, h_mid_vec, h_last_vec, n_first_vec, n_mid_vec, n_last_vec, matches_vec;
        // Comparing 24-bit values is a bummer. Being lazy, I went with the same approach
        // as when searching for strings over 4 characters long. I only avoid the last comparison.
        n_first_vec.u8x16 = vld1q_dup_u8((sz_u8_t const *)&n[0]);
        n_mid_vec.u8x16 = vld1q_dup_u8((sz_u8_t const *)&n[1]);
        n_last_vec.u8x16 = vld1q_dup_u8((sz_u8_t const *)&n[2]);
        for (; h_length >= 18; h += 16, h_length -= 16) {
            h_first_vec.u8x16 = vld1q_u8((sz_u8_t const *)(h + 0));
            h_mid_vec.u8x16 = vld1q_u8((sz_u8_t const *)(h + 1));
            h_last_vec.u8x16 = vld1q_u8((sz_u8_t const *)(h + 2));
            matches_vec.u8x16 = vandq_u8(                           //
                vandq_u8(                                           //
                    vceqq_u8(h_first_vec.u8x16, n_first_vec.u8x16), //
                    vceqq_u8(h_mid_vec.u8x16, n_mid_vec.u8x16)),
                vceqq_u8(h_last_vec.u8x16, n_last_vec.u8x16));
            matches = vreinterpretq_u8_u4(matches_vec.u8x16);
            if (matches) return h + sz_u64_ctz(matches) / 4;
        }
    }
    else {
        // Pick the parts of the needle that are worth comparing.
        sz_size_t offset_first, offset_mid, offset_last;
        _sz_locate_needle_anomalies(n, n_length, &offset_first, &offset_mid, &offset_last);
        // Broadcast those characters into SIMD registers.
        sz_u64_t matches;
        sz_u128_vec_t h_first_vec, h_mid_vec, h_last_vec, n_first_vec, n_mid_vec, n_last_vec, matches_vec;
        n_first_vec.u8x16 = vld1q_dup_u8((sz_u8_t const *)&n[offset_first]);
        n_mid_vec.u8x16 = vld1q_dup_u8((sz_u8_t const *)&n[offset_mid]);
        n_last_vec.u8x16 = vld1q_dup_u8((sz_u8_t const *)&n[offset_last]);
        // Walk through the string.
        for (; h_length >= n_length + 16; h += 16, h_length -= 16) {
            h_first_vec.u8x16 = vld1q_u8((sz_u8_t const *)(h + offset_first));
            h_mid_vec.u8x16 = vld1q_u8((sz_u8_t const *)(h + offset_mid));
            h_last_vec.u8x16 = vld1q_u8((sz_u8_t const *)(h + offset_last));
            matches_vec.u8x16 = vandq_u8(                           //
                vandq_u8(                                           //
                    vceqq_u8(h_first_vec.u8x16, n_first_vec.u8x16), //
                    vceqq_u8(h_mid_vec.u8x16, n_mid_vec.u8x16)),
                vceqq_u8(h_last_vec.u8x16, n_last_vec.u8x16));
            matches = vreinterpretq_u8_u4(matches_vec.u8x16);
            while (matches) {
                int potential_offset = sz_u64_ctz(matches) / 4;
                if (sz_equal(h + potential_offset, n, n_length)) return h + potential_offset;
                matches &= matches - 1;
            }
        }
    }
    return sz_find_serial(h, h_length, n, n_length);
}
SZ_PUBLIC sz_cptr_t sz_rfind_neon(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) {

    // This almost never fires, but it's better to be safe than sorry.
    if (h_length < n_length || !n_length) return SZ_NULL_CHAR;
    if (n_length == 1) return sz_rfind_byte_neon(h, h_length, n);

    // Pick the parts of the needle that are worth comparing.
    sz_size_t offset_first, offset_mid, offset_last;
    _sz_locate_needle_anomalies(n, n_length, &offset_first, &offset_mid, &offset_last);

    // Will contain 4 bits per character.
    sz_u64_t matches;
    sz_u128_vec_t h_first_vec, h_mid_vec, h_last_vec, n_first_vec, n_mid_vec, n_last_vec, matches_vec;
    n_first_vec.u8x16 = vld1q_dup_u8((sz_u8_t const *)&n[offset_first]);
    n_mid_vec.u8x16 = vld1q_dup_u8((sz_u8_t const *)&n[offset_mid]);
    n_last_vec.u8x16 = vld1q_dup_u8((sz_u8_t const *)&n[offset_last]);

    sz_cptr_t h_reversed;
    for (; h_length >= n_length + 16; h_length -= 16) {
        h_reversed = h + h_length - n_length - 16 + 1;
        h_first_vec.u8x16 = vld1q_u8((sz_u8_t const *)(h_reversed + offset_first));
        h_mid_vec.u8x16 = vld1q_u8((sz_u8_t const *)(h_reversed + offset_mid));
        h_last_vec.u8x16 = vld1q_u8((sz_u8_t const *)(h_reversed + offset_last));
        matches_vec.u8x16 = vandq_u8(                           //
            vandq_u8(                                           //
                vceqq_u8(h_first_vec.u8x16, n_first_vec.u8x16), //
                vceqq_u8(h_mid_vec.u8x16, n_mid_vec.u8x16)),
            vceqq_u8(h_last_vec.u8x16, n_last_vec.u8x16));
        matches = vreinterpretq_u8_u4(matches_vec.u8x16);
        while (matches) {
            int potential_offset = sz_u64_clz(matches) / 4;
            if (sz_equal(h + h_length - n_length - potential_offset, n, n_length))
                return h + h_length - n_length - potential_offset;
            sz_assert((matches & (1ull << (63 - potential_offset * 4))) != 0 &&
                      "The bit must be set before we squash it");
            matches &= ~(1ull << (63 - potential_offset * 4));
        }
    }

    return sz_rfind_serial(h, h_length, n, n_length);
}
SZ_PUBLIC sz_cptr_t sz_find_charset_neon(sz_cptr_t h, sz_size_t h_length, sz_charset_t const *set) {
    sz_u64_t matches;
    sz_u128_vec_t h_vec;
    uint8x16_t set_top_vec_u8x16 = vld1q_u8(&set->_u8s[0]);
    uint8x16_t set_bottom_vec_u8x16 = vld1q_u8(&set->_u8s[16]);

    for (; h_length >= 16; h += 16, h_length -= 16) {
        h_vec.u8x16 = vld1q_u8((sz_u8_t const *)(h));
        matches = _sz_find_charset_neon_register(h_vec, set_top_vec_u8x16, set_bottom_vec_u8x16);
        if (matches) return h + sz_u64_ctz(matches) / 4;
    }
    return sz_find_charset_serial(h, h_length, set);
}

SZ_PUBLIC sz_cptr_t sz_rfind_charset_neon(sz_cptr_t h, sz_size_t h_length, sz_charset_t const *set) {
    sz_u64_t matches;
    sz_u128_vec_t h_vec;
    uint8x16_t set_top_vec_u8x16 = vld1q_u8(&set->_u8s[0]);
    uint8x16_t set_bottom_vec_u8x16 = vld1q_u8(&set->_u8s[16]);

    // Check `sz_find_charset_neon` for explanations.
    for (; h_length >= 16; h_length -= 16) {
        h_vec.u8x16 = vld1q_u8((sz_u8_t const *)(h) + h_length - 16);
        matches = _sz_find_charset_neon_register(h_vec, set_top_vec_u8x16, set_bottom_vec_u8x16);
        if (matches) return h + h_length - 1 - sz_u64_clz(matches) / 4;
    }
    return sz_rfind_charset_serial(h, h_length, set);
}

#endif // Arm Neon
#pragma endregion
/*  @brief  Pick the right implementation for the string search algorithms.
 */
#pragma region Compile-Time Dispatching

SZ_PUBLIC sz_u64_t sz_hash(sz_cptr_t ins, sz_size_t length) { return sz_hash_serial(ins, length); }
SZ_PUBLIC void sz_tolower(sz_cptr_t ins, sz_size_t length, sz_ptr_t outs) { sz_tolower_serial(ins, length, outs); }
SZ_PUBLIC void sz_toupper(sz_cptr_t ins, sz_size_t length, sz_ptr_t outs) { sz_toupper_serial(ins, length, outs); }
SZ_PUBLIC void sz_toascii(sz_cptr_t ins, sz_size_t length, sz_ptr_t outs) { sz_toascii_serial(ins, length, outs); }
SZ_PUBLIC sz_bool_t sz_isascii(sz_cptr_t ins, sz_size_t length) { return sz_isascii_serial(ins, length); }

SZ_PUBLIC void sz_hashes_fingerprint(sz_cptr_t start, sz_size_t length, sz_size_t window_length, sz_ptr_t fingerprint,
                                     sz_size_t fingerprint_bytes) {

    sz_bool_t fingerprint_length_is_power_of_two = (sz_bool_t)((fingerprint_bytes & (fingerprint_bytes - 1)) == 0);
    sz_string_view_t fingerprint_buffer = {fingerprint, fingerprint_bytes};

    // There are several issues related to the fingerprinting algorithm.
    // First, the memory traversal order is important.
    // https://blog.stuffedcow.net/2015/08/pagewalk-coherence/

    // In most cases the fingerprint length will be a power of two.
    if (fingerprint_length_is_power_of_two == sz_false_k)
        sz_hashes(start, length, window_length, 1, _sz_hashes_fingerprint_non_pow2_callback, &fingerprint_buffer);
    else
        sz_hashes(start, length, window_length, 1, _sz_hashes_fingerprint_pow2_callback, &fingerprint_buffer);
}
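// As a reminder, `x & (x - 1)` clears the lowest set bit of `x`, so the expression above is zero
// only when at most one bit is set - i.e., when `fingerprint_bytes` is a power of two.
// A couple of illustrative values: 64 (0b1000000) takes the `pow2` path; 100 (0b1100100) doesn't.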
#if !SZ_DYNAMIC_DISPATCH

SZ_DYNAMIC sz_bool_t sz_equal(sz_cptr_t a, sz_cptr_t b, sz_size_t length) {
#if SZ_USE_X86_AVX512
    return sz_equal_avx512(a, b, length);
#else
    return sz_equal_serial(a, b, length);
#endif
}

SZ_DYNAMIC sz_ordering_t sz_order(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length) {
#if SZ_USE_X86_AVX512
    return sz_order_avx512(a, a_length, b, b_length);
#else
    return sz_order_serial(a, a_length, b, b_length);
#endif
}

SZ_DYNAMIC void sz_copy(sz_ptr_t target, sz_cptr_t source, sz_size_t length) {
#if SZ_USE_X86_AVX512
    sz_copy_avx512(target, source, length);
#elif SZ_USE_X86_AVX2
    sz_copy_avx2(target, source, length);
#else
    sz_copy_serial(target, source, length);
#endif
}

SZ_DYNAMIC void sz_move(sz_ptr_t target, sz_cptr_t source, sz_size_t length) {
#if SZ_USE_X86_AVX512
    sz_move_avx512(target, source, length);
#elif SZ_USE_X86_AVX2
    sz_move_avx2(target, source, length);
#else
    sz_move_serial(target, source, length);
#endif
}

SZ_DYNAMIC void sz_fill(sz_ptr_t target, sz_size_t length, sz_u8_t value) {
#if SZ_USE_X86_AVX512
    sz_fill_avx512(target, length, value);
#elif SZ_USE_X86_AVX2
    sz_fill_avx2(target, length, value);
#else
    sz_fill_serial(target, length, value);
#endif
}

SZ_DYNAMIC sz_cptr_t sz_find_byte(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle) {
#if SZ_USE_X86_AVX512
    return sz_find_byte_avx512(haystack, h_length, needle);
#elif SZ_USE_X86_AVX2
    return sz_find_byte_avx2(haystack, h_length, needle);
#elif SZ_USE_ARM_NEON
    return sz_find_byte_neon(haystack, h_length, needle);
#else
    return sz_find_byte_serial(haystack, h_length, needle);
#endif
}

SZ_DYNAMIC sz_cptr_t sz_rfind_byte(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle) {
#if SZ_USE_X86_AVX512
    return sz_rfind_byte_avx512(haystack, h_length, needle);
#elif SZ_USE_X86_AVX2
    return sz_rfind_byte_avx2(haystack, h_length, needle);
#elif SZ_USE_ARM_NEON
    return sz_rfind_byte_neon(haystack, h_length, needle);
#else
    return sz_rfind_byte_serial(haystack, h_length, needle);
#endif
}

SZ_DYNAMIC sz_cptr_t sz_find(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle, sz_size_t n_length) {
#if SZ_USE_X86_AVX512
    return sz_find_avx512(haystack, h_length, needle, n_length);
#elif SZ_USE_X86_AVX2
    return sz_find_avx2(haystack, h_length, needle, n_length);
#elif SZ_USE_ARM_NEON
    return sz_find_neon(haystack, h_length, needle, n_length);
#else
    return sz_find_serial(haystack, h_length, needle, n_length);
#endif
}

SZ_DYNAMIC sz_cptr_t sz_rfind(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle, sz_size_t n_length) {
#if SZ_USE_X86_AVX512
    return sz_rfind_avx512(haystack, h_length, needle, n_length);
#elif SZ_USE_X86_AVX2
    return sz_rfind_avx2(haystack, h_length, needle, n_length);
#elif SZ_USE_ARM_NEON
    return sz_rfind_neon(haystack, h_length, needle, n_length);
#else
    return sz_rfind_serial(haystack, h_length, needle, n_length);
#endif
}

SZ_DYNAMIC sz_cptr_t sz_find_charset(sz_cptr_t text, sz_size_t length, sz_charset_t const *set) {
#if SZ_USE_X86_AVX512
    return sz_find_charset_avx512(text, length, set);
#elif SZ_USE_ARM_NEON
    return sz_find_charset_neon(text, length, set);
#else
    return sz_find_charset_serial(text, length, set);
#endif
}

SZ_DYNAMIC sz_cptr_t sz_rfind_charset(sz_cptr_t text, sz_size_t length, sz_charset_t const *set) {
#if SZ_USE_X86_AVX512
    return sz_rfind_charset_avx512(text, length, set);
#elif SZ_USE_ARM_NEON
    return sz_rfind_charset_neon(text, length, set);
#else
    return sz_rfind_charset_serial(text, length, set);
#endif
}

SZ_DYNAMIC sz_size_t sz_hamming_distance( //
    sz_cptr_t a, sz_size_t a_length,      //
    sz_cptr_t b, sz_size_t b_length,      //
    sz_size_t bound) {
    return sz_hamming_distance_serial(a, a_length, b, b_length, bound);
}

SZ_DYNAMIC sz_size_t sz_hamming_distance_utf8( //
    sz_cptr_t a, sz_size_t a_length,           //
    sz_cptr_t b, sz_size_t b_length,           //
    sz_size_t bound) {
    return sz_hamming_distance_utf8_serial(a, a_length, b, b_length, bound);
}

SZ_DYNAMIC sz_size_t sz_edit_distance( //
    sz_cptr_t a, sz_size_t a_length,   //
    sz_cptr_t b, sz_size_t b_length,   //
    sz_size_t bound, sz_memory_allocator_t *alloc) {
#if SZ_USE_X86_AVX512
    return sz_edit_distance_avx512(a, a_length, b, b_length, bound, alloc);
#else
    return sz_edit_distance_serial(a, a_length, b, b_length, bound, alloc);
#endif
}

SZ_DYNAMIC sz_size_t sz_edit_distance_utf8( //
    sz_cptr_t a, sz_size_t a_length,        //
    sz_cptr_t b, sz_size_t b_length,        //
    sz_size_t bound, sz_memory_allocator_t *alloc) {
    return _sz_edit_distance_wagner_fisher_serial(a, a_length, b, b_length, bound, sz_true_k, alloc);
}

SZ_DYNAMIC sz_ssize_t sz_alignment_score(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length,
                                         sz_error_cost_t const *subs, sz_error_cost_t gap,
                                         sz_memory_allocator_t *alloc) {
#if SZ_USE_X86_AVX512
    return sz_alignment_score_avx512(a, a_length, b, b_length, subs, gap, alloc);
#else
    return sz_alignment_score_serial(a, a_length, b, b_length, subs, gap, alloc);
#endif
}

SZ_DYNAMIC void sz_hashes(sz_cptr_t text, sz_size_t length, sz_size_t window_length, sz_size_t window_step, //
                          sz_hash_callback_t callback, void *callback_handle) {
#if SZ_USE_X86_AVX512
    sz_hashes_avx512(text, length, window_length, window_step, callback, callback_handle);
#elif SZ_USE_X86_AVX2
    sz_hashes_avx2(text, length, window_length, window_step, callback, callback_handle);
#else
    sz_hashes_serial(text, length, window_length, window_step, callback, callback_handle);
#endif
}

SZ_DYNAMIC sz_cptr_t sz_find_char_from(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) {
    sz_charset_t set;
    sz_charset_init(&set);
    for (; n_length; ++n, --n_length) sz_charset_add(&set, *n);
    return sz_find_charset(h, h_length, &set);
}

SZ_DYNAMIC sz_cptr_t sz_find_char_not_from(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) {
    sz_charset_t set;
    sz_charset_init(&set);
    for (; n_length; ++n, --n_length) sz_charset_add(&set, *n);
    sz_charset_invert(&set);
    return sz_find_charset(h, h_length, &set);
}

SZ_DYNAMIC sz_cptr_t sz_rfind_char_from(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) {
    sz_charset_t set;
    sz_charset_init(&set);
    for (; n_length; ++n, --n_length) sz_charset_add(&set, *n);
    return sz_rfind_charset(h, h_length, &set);
}

SZ_DYNAMIC sz_cptr_t sz_rfind_char_not_from(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) {
    sz_charset_t set;
    sz_charset_init(&set);
    for (; n_length; ++n, --n_length) sz_charset_add(&set, *n);
    sz_charset_invert(&set);
    return sz_rfind_charset(h, h_length, &set);
}
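// A hypothetical usage sketch (not part of the header): the four helpers above mirror the
// `strpbrk`-style APIs, building a 256-bit set once and reusing the vectorized charset search.
//
//     sz_cptr_t match = sz_find_char_from("stringzilla", 11, "aeiou", 5);
//     // match points at the 'i' in "stringzilla"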
SZ_DYNAMIC void sz_generate(sz_cptr_t alphabet, sz_size_t alphabet_size, sz_ptr_t result, sz_size_t result_length,
                            sz_random_generator_t generator, void *generator_user_data) {
    sz_generate_serial(alphabet, alphabet_size, result, result_length, generator, generator_user_data);
}

#endif
#pragma endregion

#ifdef __cplusplus
#pragma GCC diagnostic pop
}
#endif // __cplusplus

#endif // STRINGZILLA_H_