Vous ne pouvez pas sélectionner plus de 25 sujets Les noms de sujets doivent commencer par une lettre ou un nombre, peuvent contenir des tirets ('-') et peuvent comporter jusqu'à 35 caractères.

stringzilla.hpp 167KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
72778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327
73278327932803281328232833284328532863287328832893290329132923293329432953296329732983299330033013302330333043305330633073308330933103311331233133314331533163317331833193320332133223323332433253326332733283329333033313332333333343335333633373338333933403341334233433344334533463347334833493350335133523353335433553356335733583359336033613362336333643365336633673368336933703371337233733374337533763377337833793380338133823383338433853386338733883389339033913392339333943395339633973398339934003401340234033404340534063407340834093410341134123413341434153416341734183419342034213422342334243425342634273428342934303431343234333434343534363437343834393440344134423443344434453446344734483449345034513452345334543455345634573458345934603461346234633464346534663467346834693470347134723473347434753476347734783479348034813482348334843485348634873488348934903491349234933494349534963497349834993500350135023503350435053506350735083509351035113512351335143515351635173518351935203521352235233524352535263527352835293530353135323533353435353536353735383539354035413542354335443545354635473548354935503551355235533554355535563557355835593560356135623563356435653566356735683569357035713572357335743575357635773578357935803581358235833584358535863587358835893590359135923593359435953596359735983599360036013602360336043605360636073608360936103611361236133614361536163617361836193620362136223623362436253626362736283629363036313632363336343635363636373638363936403641364236433644364536463647364836493650365136523653365436553656365736583659366036613662366336643665366636673668366936703671367236733674367536763677367836793680368136823683368436853686368736883689369036913692369336943695369636973698369937003701370237033704370537063707370837093710371137123713371437153716371737183719372037213722372337243725372637273728372937303731373237333734373537363737373837393740374137423743374437453746374737483749375037513752375337543755375637573758375937603761376237633764376537663767376837693770377137723773377437753776377
73778377937803781378237833784378537863787378837893790379137923793379437953796379737983799380038013802380338043805380638073808380938103811381238133814381538163817381838193820382138223823382438253826382738283829383038313832383338343835383638373838
  1. /**
  2. * @brief StringZilla C++ wrapper improving over the performance of `std::string_view` and `std::string`,
  3. * mostly for substring search, adding approximate matching functionality, and C++23 functionality
  4. * to a C++11 compatible implementation.
  5. *
  6. * This implementation is aiming to be compatible with C++11, while implementing the C++23 functionality.
  7. * By default, it includes C++ STL headers, but that can be avoided to minimize compilation overhead.
  8. * https://artificial-mind.net/projects/compile-health/
  9. *
  10. * @see StringZilla: https://github.com/ashvardanian/StringZilla/blob/main/README.md
  11. * @see C++ Standard String: https://en.cppreference.com/w/cpp/header/string
  12. *
  13. * @file stringzilla.hpp
  14. * @author Ash Vardanian
  15. */
  16. #ifndef STRINGZILLA_HPP_
  17. #define STRINGZILLA_HPP_
  18. /**
  19. * @brief When set to 0 (the default), the library will include the C++ STL headers and implement
  20. * automatic conversion from and to `std::string_view` and `std::basic_string<any_allocator>`.
  21. */
  22. #ifndef SZ_AVOID_STL
  23. #define SZ_AVOID_STL (0) // true or false
  24. #endif
  25. /* We need to detect the version of the C++ language we are compiled with.
  26. * This will affect recent features like `operator<=>` and tests against STL.
  27. */
  28. #define SZ_DETECT_CPP_23 (__cplusplus >= 202101L)
  29. #define SZ_DETECT_CPP20 (__cplusplus >= 202002L)
  30. #define SZ_DETECT_CPP_17 (__cplusplus >= 201703L)
  31. #define SZ_DETECT_CPP14 (__cplusplus >= 201402L)
  32. #define SZ_DETECT_CPP_11 (__cplusplus >= 201103L)
  33. #define SZ_DETECT_CPP_98 (__cplusplus >= 199711L)
  34. /**
  35. * @brief The `constexpr` keyword has different applicability scope in different C++ versions.
  36. * Useful for STL conversion operators, as several `std::string` members are `constexpr` in C++20.
  37. */
  38. #if SZ_DETECT_CPP20
  39. #define sz_constexpr_if_cpp20 constexpr
  40. #else
  41. #define sz_constexpr_if_cpp20
  42. #endif
  43. #if !SZ_AVOID_STL
  44. #include <array>
  45. #include <bitset>
  46. #include <string>
  47. #include <vector>
  48. #if SZ_DETECT_CPP_17 && __cpp_lib_string_view
  49. #include <string_view>
  50. #endif
  51. #endif
  52. #include <cassert> // `assert`
  53. #include <cstddef> // `std::size_t`
  54. #include <cstdint> // `std::int8_t`
  55. #include <iosfwd> // `std::basic_ostream`
  56. #include <stdexcept> // `std::out_of_range`
  57. #include <utility> // `std::swap`
  58. #include <stringzilla/stringzilla.h>
  59. namespace ashvardanian {
  60. namespace stringzilla {
  // Forward declarations of the class templates referenced by the aliases below.
  template <typename char_type_>
  class basic_charset;

  template <typename char_type_>
  class basic_string_slice;

  template <typename, typename>
  class basic_string;

  // A slice over mutable characters and a slice over immutable characters.
  using string_span = basic_string_slice<char>;
  using string_view = basic_string_slice<const char>;

  // A plain C array of `length` characters; no null terminator is implied by the type.
  template <std::size_t length>
  using carray = char[length];
  71. #pragma region Character Sets
  72. /**
  73. * @brief The concatenation of the `ascii_lowercase` and `ascii_uppercase`. This value is not locale-dependent.
  74. * https://docs.python.org/3/library/string.html#string.ascii_letters
  75. */
  76. inline carray<52> const &ascii_letters() noexcept {
  77. static carray<52> const all = {
  78. //
  79. 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r',
  80. 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J',
  81. 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
  82. };
  83. return all;
  84. }
  85. /**
  86. * @brief The lowercase letters "abcdefghijklmnopqrstuvwxyz". This value is not locale-dependent.
  87. * https://docs.python.org/3/library/string.html#string.ascii_lowercase
  88. */
  89. inline carray<26> const &ascii_lowercase() noexcept {
  90. static carray<26> const all = {
  91. //
  92. 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
  93. 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
  94. };
  95. return all;
  96. }
  97. /**
  98. * @brief The uppercase letters "ABCDEFGHIJKLMNOPQRSTUVWXYZ". This value is not locale-dependent.
  99. * https://docs.python.org/3/library/string.html#string.ascii_uppercase
  100. */
  101. inline carray<26> const &ascii_uppercase() noexcept {
  102. static carray<26> const all = {
  103. //
  104. 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
  105. 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
  106. };
  107. return all;
  108. }
  109. /**
  110. * @brief ASCII characters which are considered printable.
  111. * A combination of `digits`, `ascii_letters`, `punctuation`, and `whitespace`.
  112. * https://docs.python.org/3/library/string.html#string.printable
  113. */
  114. inline carray<100> const &ascii_printables() noexcept {
  115. static carray<100> const all = {
  116. //
  117. '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j',
  118. 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'A', 'B', 'C', 'D',
  119. 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
  120. 'Y', 'Z', '!', '"', '#', '$', '%', '&', '\'', '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '<',
  121. '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`', '{', '|', '}', '~', ' ', '\t', '\n', '\r', '\f', '\v',
  122. };
  123. return all;
  124. }
  125. /**
  126. * @brief Non-printable ASCII control characters.
  127. * Includes all codes from 0 to 31 and 127.
  128. */
  129. inline carray<33> const &ascii_controls() noexcept {
  130. static carray<33> const all = {
  131. //
  132. 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
  133. 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 127,
  134. };
  135. return all;
  136. }
  137. /**
  138. * @brief The digits "0123456789".
  139. * https://docs.python.org/3/library/string.html#string.digits
  140. */
  141. inline carray<10> const &digits() noexcept {
  142. static carray<10> const all = {'0', '1', '2', '3', '4', '5', '6', '7', '8', '9'};
  143. return all;
  144. }
  145. /**
  146. * @brief The letters "0123456789abcdefABCDEF".
  147. * https://docs.python.org/3/library/string.html#string.hexdigits
  148. */
  149. inline carray<22> const &hexdigits() noexcept {
  150. static carray<22> const all = {
  151. //
  152. '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', //
  153. 'a', 'b', 'c', 'd', 'e', 'f', 'A', 'B', 'C', 'D', 'E', 'F',
  154. };
  155. return all;
  156. }
  157. /**
  158. * @brief The letters "01234567".
  159. * https://docs.python.org/3/library/string.html#string.octdigits
  160. */
  161. inline carray<8> const &octdigits() noexcept {
  162. static carray<8> const all = {'0', '1', '2', '3', '4', '5', '6', '7'};
  163. return all;
  164. }
  165. /**
  166. * @brief ASCII characters considered punctuation characters in the C locale:
  167. * !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~.
  168. * https://docs.python.org/3/library/string.html#string.punctuation
  169. */
  170. inline carray<32> const &punctuation() noexcept {
  171. static carray<32> const all = {
  172. //
  173. '!', '"', '#', '$', '%', '&', '\'', '(', ')', '*', '+', ',', '-', '.', '/', ':',
  174. ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`', '{', '|', '}', '~',
  175. };
  176. return all;
  177. }
  178. /**
  179. * @brief ASCII characters that are considered whitespace.
  180. * This includes space, tab, linefeed, return, formfeed, and vertical tab.
  181. * https://docs.python.org/3/library/string.html#string.whitespace
  182. */
  183. inline carray<6> const &whitespaces() noexcept {
  184. static carray<6> const all = {' ', '\t', '\n', '\r', '\f', '\v'};
  185. return all;
  186. }
  187. /**
  188. * @brief ASCII characters that are considered line delimiters.
  189. * https://docs.python.org/3/library/stdtypes.html#str.splitlines
  190. */
  191. inline carray<8> const &newlines() noexcept {
  192. static carray<8> const all = {'\n', '\r', '\f', '\v', '\x1C', '\x1D', '\x1E', '\x85'};
  193. return all;
  194. }
  195. /**
  196. * @brief ASCII characters forming the BASE64 encoding alphabet.
  197. */
  198. inline carray<64> const &base64() noexcept {
  199. static carray<64> const all = {
  200. //
  201. 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
  202. 'w', 'x', 'y', 'z', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R',
  203. 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/',
  204. };
  205. return all;
  206. }
  207. /**
  208. * @brief A set of characters represented as a bitset with 256 slots.
  209. */
  210. template <typename char_type_ = char>
  211. class basic_charset {
  212. sz_charset_t bitset_;
  213. public:
  214. using char_type = char_type_;
  215. basic_charset() noexcept {
  216. // ! Instead of relying on the `sz_charset_init`, we have to reimplement it to support `constexpr`.
  217. bitset_._u64s[0] = 0, bitset_._u64s[1] = 0, bitset_._u64s[2] = 0, bitset_._u64s[3] = 0;
  218. }
  219. explicit basic_charset(std::initializer_list<char_type> chars) noexcept : basic_charset() {
  220. // ! Instead of relying on the `sz_charset_add(&bitset_, c)`, we have to reimplement it to support `constexpr`.
  221. for (auto c : chars) bitset_._u64s[sz_bitcast(sz_u8_t, c) >> 6] |= (1ull << (sz_bitcast(sz_u8_t, c) & 63u));
  222. }
  223. template <std::size_t count_characters>
  224. explicit basic_charset(char_type const (&chars)[count_characters]) noexcept : basic_charset() {
  225. static_assert(count_characters > 0, "Character array cannot be empty");
  226. for (std::size_t i = 0; i < count_characters - 1; ++i) { // count_characters - 1 to exclude the null terminator
  227. char_type c = chars[i];
  228. bitset_._u64s[sz_bitcast(sz_u8_t, c) >> 6] |= (1ull << (sz_bitcast(sz_u8_t, c) & 63u));
  229. }
  230. }
  231. template <std::size_t count_characters>
  232. explicit basic_charset(std::array<char_type, count_characters> const &chars) noexcept : basic_charset() {
  233. static_assert(count_characters > 0, "Character array cannot be empty");
  234. for (std::size_t i = 0; i < count_characters - 1; ++i) { // count_characters - 1 to exclude the null terminator
  235. char_type c = chars[i];
  236. bitset_._u64s[sz_bitcast(sz_u8_t, c) >> 6] |= (1ull << (sz_bitcast(sz_u8_t, c) & 63u));
  237. }
  238. }
  239. basic_charset(basic_charset const &other) noexcept : bitset_(other.bitset_) {}
  240. basic_charset &operator=(basic_charset const &other) noexcept {
  241. bitset_ = other.bitset_;
  242. return *this;
  243. }
  244. basic_charset operator|(basic_charset other) const noexcept {
  245. basic_charset result = *this;
  246. result.bitset_._u64s[0] |= other.bitset_._u64s[0], result.bitset_._u64s[1] |= other.bitset_._u64s[1],
  247. result.bitset_._u64s[2] |= other.bitset_._u64s[2], result.bitset_._u64s[3] |= other.bitset_._u64s[3];
  248. return *this;
  249. }
  250. inline basic_charset &add(char_type c) noexcept {
  251. sz_charset_add(&bitset_, sz_bitcast(sz_u8_t, c));
  252. return *this;
  253. }
  254. inline sz_charset_t &raw() noexcept { return bitset_; }
  255. inline sz_charset_t const &raw() const noexcept { return bitset_; }
  256. inline bool contains(char_type c) const noexcept { return sz_charset_contains(&bitset_, sz_bitcast(sz_u8_t, c)); }
  257. inline basic_charset inverted() const noexcept {
  258. basic_charset result = *this;
  259. sz_charset_invert(&result.bitset_);
  260. return result;
  261. }
  262. };
  263. using char_set = basic_charset<char>;
  264. inline char_set ascii_letters_set() { return char_set {ascii_letters()}; }
  265. inline char_set ascii_lowercase_set() { return char_set {ascii_lowercase()}; }
  266. inline char_set ascii_uppercase_set() { return char_set {ascii_uppercase()}; }
  267. inline char_set ascii_printables_set() { return char_set {ascii_printables()}; }
  268. inline char_set ascii_controls_set() { return char_set {ascii_controls()}; }
  269. inline char_set digits_set() { return char_set {digits()}; }
  270. inline char_set hexdigits_set() { return char_set {hexdigits()}; }
  271. inline char_set octdigits_set() { return char_set {octdigits()}; }
  272. inline char_set punctuation_set() { return char_set {punctuation()}; }
  273. inline char_set whitespaces_set() { return char_set {whitespaces()}; }
  274. inline char_set newlines_set() { return char_set {newlines()}; }
  275. inline char_set base64_set() { return char_set {base64()}; }
  276. #pragma endregion
  277. #pragma region Ranges of Search Matches
  // Empty tag types. The two `*_overlaps_type` tags select how far a matcher advances after a match;
  // `end_sentinel_type` presumably marks the end of a range of matches - its use is outside of this section.
  struct end_sentinel_type {};
  struct include_overlaps_type {};
  struct exclude_overlaps_type {};

  #if SZ_DETECT_CPP_17
  // Ready-made tag objects; inline variables are only available starting with C++17.
  inline static constexpr end_sentinel_type end_sentinel {};
  inline static constexpr include_overlaps_type include_overlaps {};
  inline static constexpr exclude_overlaps_type exclude_overlaps {};
  #endif
  286. /**
  287. * @brief Zero-cost wrapper around the `.find` member function of string-like classes.
  288. */
  289. template <typename string_type_, typename overlaps_type = include_overlaps_type>
  290. struct matcher_find {
  291. using size_type = typename string_type_::size_type;
  292. string_type_ needle_;
  293. matcher_find(string_type_ needle = {}) noexcept : needle_(needle) {}
  294. size_type needle_length() const noexcept { return needle_.length(); }
  295. size_type operator()(string_type_ haystack) const noexcept { return haystack.find(needle_); }
  296. size_type skip_length() const noexcept {
  297. // TODO: Apply Galil rule to match repetitive patterns in strictly linear time.
  298. return std::is_same<overlaps_type, include_overlaps_type>() ? 1 : needle_.length();
  299. }
  300. };
  301. /**
  302. * @brief Zero-cost wrapper around the `.rfind` member function of string-like classes.
  303. */
  304. template <typename string_type_, typename overlaps_type = include_overlaps_type>
  305. struct matcher_rfind {
  306. using size_type = typename string_type_::size_type;
  307. string_type_ needle_;
  308. matcher_rfind(string_type_ needle = {}) noexcept : needle_(needle) {}
  309. size_type needle_length() const noexcept { return needle_.length(); }
  310. size_type operator()(string_type_ haystack) const noexcept { return haystack.rfind(needle_); }
  311. size_type skip_length() const noexcept {
  312. // TODO: Apply Galil rule to match repetitive patterns in strictly linear time.
  313. return std::is_same<overlaps_type, include_overlaps_type>() ? 1 : needle_.length();
  314. }
  315. };
  /**
   *  @brief  Zero-cost wrapper around the `.find_first_of` member function of string-like classes.
   *
   *  @tparam text_type_  String-like haystack type providing `size_type` and `.find_first_of()`.
   *  @tparam set_type_   Type of the accepted characters, passed to `.find_first_of()` as is.
   */
  template <typename text_type_, typename set_type_ = text_type_>
  struct matcher_find_first_of {
      using size_type = typename text_type_::size_type;

      set_type_ needles_;

      /** @brief Offset of the first character of @p haystack found among the needles. */
      size_type operator()(text_type_ haystack) const noexcept {
          size_type const offset = haystack.find_first_of(needles_);
          return offset;
      }

      /** @brief Every match is exactly one character long. */
      constexpr size_type needle_length() const noexcept { return size_type(1); }

      /** @brief The search resumes one character after the previous match. */
      constexpr size_type skip_length() const noexcept { return size_type(1); }
  };
  /**
   *  @brief  Zero-cost wrapper around the `.find_last_of` member function of string-like classes.
   *
   *  @tparam text_type_  String-like haystack type providing `size_type` and `.find_last_of()`.
   *  @tparam set_type_   Type of the accepted characters, passed to `.find_last_of()` as is.
   */
  template <typename text_type_, typename set_type_ = text_type_>
  struct matcher_find_last_of {
      using size_type = typename text_type_::size_type;

      set_type_ needles_;

      /** @brief Offset of the last character of @p haystack found among the needles. */
      size_type operator()(text_type_ haystack) const noexcept {
          size_type const offset = haystack.find_last_of(needles_);
          return offset;
      }

      /** @brief Every match is exactly one character long. */
      constexpr size_type needle_length() const noexcept { return size_type(1); }

      /** @brief The step between two consecutive searches is one character. */
      constexpr size_type skip_length() const noexcept { return size_type(1); }
  };
  /**
   *  @brief  Zero-cost wrapper around the `.find_first_not_of` member function of string-like classes.
   *
   *  @tparam text_type_  String-like haystack type providing `size_type` and `.find_first_not_of()`.
   *  @tparam set_type_   Type of the rejected characters, passed to `.find_first_not_of()` as is.
   */
  template <typename text_type_, typename set_type_ = text_type_>
  struct matcher_find_first_not_of {
      using size_type = typename text_type_::size_type;

      set_type_ needles_;

      /** @brief Offset of the first character of @p haystack that is absent from the needles. */
      size_type operator()(text_type_ haystack) const noexcept {
          size_type const offset = haystack.find_first_not_of(needles_);
          return offset;
      }

      /** @brief Every match is exactly one character long. */
      constexpr size_type needle_length() const noexcept { return size_type(1); }

      /** @brief The search resumes one character after the previous match. */
      constexpr size_type skip_length() const noexcept { return size_type(1); }
  };
  /**
   *  @brief  Zero-cost wrapper around the `.find_last_not_of` member function of string-like classes.
   *
   *  @tparam text_type_  String-like haystack type providing `size_type` and `.find_last_not_of()`.
   *  @tparam set_type_   Type of the rejected characters, passed to `.find_last_not_of()` as is.
   */
  template <typename text_type_, typename set_type_ = text_type_>
  struct matcher_find_last_not_of {
      using size_type = typename text_type_::size_type;

      set_type_ needles_;

      /** @brief Offset of the last character of @p haystack that is absent from the needles. */
      size_type operator()(text_type_ haystack) const noexcept {
          size_type const offset = haystack.find_last_not_of(needles_);
          return offset;
      }

      /** @brief Every match is exactly one character long. */
      constexpr size_type needle_length() const noexcept { return size_type(1); }

      /** @brief The step between two consecutive searches is one character. */
      constexpr size_type skip_length() const noexcept { return size_type(1); }
  };
  360. /**
  361. * @brief A range of string slices representing the matches of a substring search.
  362. * Compatible with C++23 ranges, C++11 string views, and of course, StringZilla.
  363. * Similar to a pair of `boost::algorithm::find_iterator`.
  364. */
  365. template <typename string_type_, typename matcher_type_>
  366. class range_matches {
  367. public:
  368. using string_type = string_type_;
  369. using matcher_type = matcher_type_;
  370. private:
  371. matcher_type matcher_;
  372. string_type haystack_;
  373. public:
  374. using size_type = std::size_t;
  375. using difference_type = std::ptrdiff_t;
  376. using value_type = string_type;
  377. using pointer = string_type; // Needed for compatibility with STL container constructors.
  378. using reference = string_type; // Needed for compatibility with STL container constructors.
  379. range_matches(string_type haystack, matcher_type needle) noexcept : matcher_(needle), haystack_(haystack) {}
  380. class iterator {
  381. matcher_type matcher_;
  382. string_type remaining_;
  383. public:
  384. using iterator_category = std::forward_iterator_tag;
  385. using difference_type = std::ptrdiff_t;
  386. using value_type = string_type;
  387. using pointer = string_type; // Needed for compatibility with STL container constructors.
  388. using reference = string_type; // Needed for compatibility with STL container constructors.
  389. iterator(string_type haystack, matcher_type matcher) noexcept : matcher_(matcher), remaining_(haystack) {
  390. auto position = matcher_(remaining_);
  391. remaining_.remove_prefix(position != string_type::npos ? position : remaining_.size());
  392. }
  393. pointer operator->() const noexcept = delete;
  394. value_type operator*() const noexcept { return remaining_.substr(0, matcher_.needle_length()); }
  395. iterator &operator++() noexcept {
  396. remaining_.remove_prefix(matcher_.skip_length());
  397. auto position = matcher_(remaining_);
  398. remaining_.remove_prefix(position != string_type::npos ? position : remaining_.size());
  399. return *this;
  400. }
  401. iterator operator++(int) noexcept {
  402. iterator temp = *this;
  403. ++(*this);
  404. return temp;
  405. }
  406. // Assumes both iterators point to the same underlying string.
  407. bool operator!=(iterator const &other) const noexcept { return remaining_.data() != other.remaining_.data(); }
  408. bool operator==(iterator const &other) const noexcept { return remaining_.data() == other.remaining_.data(); }
  409. bool operator!=(end_sentinel_type) const noexcept { return !remaining_.empty(); }
  410. bool operator==(end_sentinel_type) const noexcept { return remaining_.empty(); }
  411. };
  412. iterator begin() const noexcept { return {haystack_, matcher_}; }
  413. iterator end() const noexcept { return {string_type {haystack_.data() + haystack_.size(), 0ull}, matcher_}; }
  414. size_type size() const noexcept { return static_cast<size_type>(ssize()); }
  415. difference_type ssize() const noexcept { return std::distance(begin(), end()); }
  416. bool empty() const noexcept { return begin() == end_sentinel_type {}; }
  417. bool include_overlaps() const noexcept { return matcher_.skip_length() < matcher_.needle_length(); }
  418. /**
  419. * @brief Copies the matches into a container.
  420. */
  421. template <typename container_>
  422. void to(container_ &container) {
  423. for (auto match : *this) { container.push_back(match); }
  424. }
  425. /**
  426. * @brief Copies the matches into a consumed container, returning it at the end.
  427. */
  428. template <typename container_>
  429. container_ to() {
  430. return container_ {begin(), end()};
  431. }
  432. };
  433. /**
  434. * @brief A range of string slices representing the matches of a @b reverse-order substring search.
  435. * Compatible with C++23 ranges, C++11 string views, and of course, StringZilla.
  436. * Similar to a pair of `boost::algorithm::find_iterator`.
  437. */
  438. template <typename string_type_, typename matcher_type_>
  439. class range_rmatches {
  440. public:
  441. using string_type = string_type_;
  442. using matcher_type = matcher_type_;
  443. using size_type = std::size_t;
  444. using difference_type = std::ptrdiff_t;
  445. using value_type = string_type;
  446. using pointer = string_type; // Needed for compatibility with STL container constructors.
  447. using reference = string_type; // Needed for compatibility with STL container constructors.
  448. private:
  449. matcher_type matcher_;
  450. string_type haystack_;
  451. public:
  452. range_rmatches(string_type haystack, matcher_type needle) : matcher_(needle), haystack_(haystack) {}
  453. class iterator {
  454. matcher_type matcher_;
  455. string_type remaining_;
  456. public:
  457. using iterator_category = std::forward_iterator_tag;
  458. using difference_type = std::ptrdiff_t;
  459. using value_type = string_type;
  460. using pointer = string_type; // Needed for compatibility with STL container constructors.
  461. using reference = string_type; // Needed for compatibility with STL container constructors.
  462. iterator(string_type haystack, matcher_type matcher) noexcept : matcher_(matcher), remaining_(haystack) {
  463. auto position = matcher_(remaining_);
  464. remaining_.remove_suffix(position != string_type::npos
  465. ? remaining_.size() - position - matcher_.needle_length()
  466. : remaining_.size());
  467. }
  468. pointer operator->() const noexcept = delete;
  469. value_type operator*() const noexcept {
  470. return remaining_.substr(remaining_.size() - matcher_.needle_length());
  471. }
  472. iterator &operator++() noexcept {
  473. remaining_.remove_suffix(matcher_.skip_length());
  474. auto position = matcher_(remaining_);
  475. remaining_.remove_suffix(position != string_type::npos
  476. ? remaining_.size() - position - matcher_.needle_length()
  477. : remaining_.size());
  478. return *this;
  479. }
  480. iterator operator++(int) noexcept {
  481. iterator temp = *this;
  482. ++(*this);
  483. return temp;
  484. }
  485. // Assumes both iterators point to the same underlying string.
  486. // This has to be `.data() + .size()`, to be compatible with `std::string_view` on MSVC.
  487. bool operator!=(iterator const &other) const noexcept {
  488. return remaining_.data() + remaining_.size() != other.remaining_.data() + other.remaining_.size();
  489. }
  490. bool operator==(iterator const &other) const noexcept {
  491. return remaining_.data() + remaining_.size() == other.remaining_.data() + other.remaining_.size();
  492. }
  493. bool operator!=(end_sentinel_type) const noexcept { return !remaining_.empty(); }
  494. bool operator==(end_sentinel_type) const noexcept { return remaining_.empty(); }
  495. };
  496. iterator begin() const noexcept { return {haystack_, matcher_}; }
  497. iterator end() const noexcept { return {string_type {haystack_.data(), 0ull}, matcher_}; }
  498. size_type size() const noexcept { return static_cast<size_type>(ssize()); }
  499. difference_type ssize() const noexcept { return std::distance(begin(), end()); }
  500. bool empty() const noexcept { return begin() == end_sentinel_type {}; }
  501. bool include_overlaps() const noexcept { return matcher_.skip_length() < matcher_.needle_length(); }
  502. /**
  503. * @brief Copies the matches into a container.
  504. */
  505. template <typename container_>
  506. void to(container_ &container) {
  507. for (auto match : *this) { container.push_back(match); }
  508. }
  509. /**
  510. * @brief Copies the matches into a consumed container, returning it at the end.
  511. */
  512. template <typename container_>
  513. container_ to() {
  514. return container_ {begin(), end()};
  515. }
  516. };
  517. /**
  518. * @brief A range of string slices for different splits of the data.
  519. * Compatible with C++23 ranges, C++11 string views, and of course, StringZilla.
  520. * Similar to a pair of `boost::algorithm::split_iterator`.
  521. *
  522. * In some sense, represents the inverse operation to `range_matches`, as it reports not the search matches
  523. * but the data between them. Meaning that for `N` search matches, there will be `N+1` elements in the range.
  524. * Unlike ::range_matches, this range can't be empty. It also can't report overlapping intervals.
  525. */
  526. template <typename string_type_, typename matcher_type_>
  527. class range_splits {
  528. public:
  529. using string_type = string_type_;
  530. using matcher_type = matcher_type_;
  531. using size_type = std::size_t;
  532. using difference_type = std::ptrdiff_t;
  533. using value_type = string_type;
  534. using pointer = string_type; // Needed for compatibility with STL container constructors.
  535. using reference = string_type; // Needed for compatibility with STL container constructors.
  536. private:
  537. matcher_type matcher_;
  538. string_type haystack_;
  539. public:
  540. range_splits(string_type haystack, matcher_type needle) noexcept : matcher_(needle), haystack_(haystack) {}
  541. class iterator {
  542. matcher_type matcher_;
  543. string_type remaining_;
  544. std::size_t length_within_remaining_;
  545. bool reached_tail_;
  546. public:
  547. using iterator_category = std::forward_iterator_tag;
  548. using difference_type = std::ptrdiff_t;
  549. using value_type = string_type;
  550. using pointer = string_type; // Needed for compatibility with STL container constructors.
  551. using reference = string_type; // Needed for compatibility with STL container constructors.
  552. iterator(string_type haystack, matcher_type matcher) noexcept : matcher_(matcher), remaining_(haystack) {
  553. auto position = matcher_(remaining_);
  554. length_within_remaining_ = position != string_type::npos ? position : remaining_.size();
  555. reached_tail_ = false;
  556. }
  557. iterator(string_type haystack, matcher_type matcher, end_sentinel_type) noexcept
  558. : matcher_(matcher), remaining_(haystack), length_within_remaining_(0), reached_tail_(true) {}
  559. pointer operator->() const noexcept = delete;
  560. value_type operator*() const noexcept { return remaining_.substr(0, length_within_remaining_); }
  561. iterator &operator++() noexcept {
  562. remaining_.remove_prefix(length_within_remaining_);
  563. reached_tail_ = remaining_.empty();
  564. remaining_.remove_prefix(matcher_.needle_length() * !reached_tail_);
  565. auto position = matcher_(remaining_);
  566. length_within_remaining_ = position != string_type::npos ? position : remaining_.size();
  567. return *this;
  568. }
  569. iterator operator++(int) noexcept {
  570. iterator temp = *this;
  571. ++(*this);
  572. return temp;
  573. }
  574. bool operator!=(iterator const &other) const noexcept {
  575. return (remaining_.begin() != other.remaining_.begin()) || (reached_tail_ != other.reached_tail_);
  576. }
  577. bool operator==(iterator const &other) const noexcept {
  578. return (remaining_.begin() == other.remaining_.begin()) && (reached_tail_ == other.reached_tail_);
  579. }
  580. bool operator!=(end_sentinel_type) const noexcept { return !remaining_.empty() || !reached_tail_; }
  581. bool operator==(end_sentinel_type) const noexcept { return remaining_.empty() && reached_tail_; }
  582. bool is_last() const noexcept { return remaining_.size() == length_within_remaining_; }
  583. };
  584. iterator begin() const noexcept { return {haystack_, matcher_}; }
  585. iterator end() const noexcept { return {string_type {haystack_.end(), 0}, matcher_, end_sentinel_type {}}; }
  586. size_type size() const noexcept { return static_cast<size_type>(ssize()); }
  587. difference_type ssize() const noexcept { return std::distance(begin(), end()); }
  588. constexpr bool empty() const noexcept { return false; }
  589. /**
  590. * @brief Copies the matches into a container.
  591. */
  592. template <typename container_>
  593. void to(container_ &container) {
  594. for (auto match : *this) { container.push_back(match); }
  595. }
  596. /**
  597. * @brief Copies the matches into a consumed container, returning it at the end.
  598. */
  599. template <typename container_>
  600. container_ to(container_ &&container = {}) {
  601. for (auto match : *this) { container.push_back(match); }
  602. return std::move(container);
  603. }
  604. };
  605. /**
  606. * @brief A range of string slices for different splits of the data in @b reverse-order.
  607. * Compatible with C++23 ranges, C++11 string views, and of course, StringZilla.
  608. * Similar to a pair of `boost::algorithm::split_iterator`.
  609. *
  610. * In some sense, represents the inverse operation to `range_matches`, as it reports not the search matches
  611. * but the data between them. Meaning that for `N` search matches, there will be `N+1` elements in the range.
  612. * Unlike ::range_matches, this range can't be empty. It also can't report overlapping intervals.
  613. */
  614. template <typename string_type_, typename matcher_type_>
  615. class range_rsplits {
  616. public:
  617. using string_type = string_type_;
  618. using matcher_type = matcher_type_;
  619. using size_type = std::size_t;
  620. using difference_type = std::ptrdiff_t;
  621. using value_type = string_type;
  622. using pointer = string_type; // Needed for compatibility with STL container constructors.
  623. using reference = string_type; // Needed for compatibility with STL container constructors.
  624. private:
  625. matcher_type matcher_;
  626. string_type haystack_;
  627. public:
  628. range_rsplits(string_type haystack, matcher_type needle) noexcept : matcher_(needle), haystack_(haystack) {}
  629. class iterator {
  630. matcher_type matcher_;
  631. string_type remaining_;
  632. std::size_t length_within_remaining_;
  633. bool reached_tail_;
  634. public:
  635. using iterator_category = std::forward_iterator_tag;
  636. using difference_type = std::ptrdiff_t;
  637. using value_type = string_type;
  638. using pointer = string_type; // Needed for compatibility with STL container constructors.
  639. using reference = string_type; // Needed for compatibility with STL container constructors.
  640. iterator(string_type haystack, matcher_type matcher) noexcept : matcher_(matcher), remaining_(haystack) {
  641. auto position = matcher_(remaining_);
  642. length_within_remaining_ = position != string_type::npos
  643. ? remaining_.size() - position - matcher_.needle_length()
  644. : remaining_.size();
  645. reached_tail_ = false;
  646. }
  647. iterator(string_type haystack, matcher_type matcher, end_sentinel_type) noexcept
  648. : matcher_(matcher), remaining_(haystack), length_within_remaining_(0), reached_tail_(true) {}
  649. pointer operator->() const noexcept = delete;
  650. value_type operator*() const noexcept {
  651. return remaining_.substr(remaining_.size() - length_within_remaining_);
  652. }
  653. iterator &operator++() noexcept {
  654. remaining_.remove_suffix(length_within_remaining_);
  655. reached_tail_ = remaining_.empty();
  656. remaining_.remove_suffix(matcher_.needle_length() * !reached_tail_);
  657. auto position = matcher_(remaining_);
  658. length_within_remaining_ = position != string_type::npos
  659. ? remaining_.size() - position - matcher_.needle_length()
  660. : remaining_.size();
  661. return *this;
  662. }
  663. iterator operator++(int) noexcept {
  664. iterator temp = *this;
  665. ++(*this);
  666. return temp;
  667. }
  668. bool operator!=(iterator const &other) const noexcept {
  669. return (remaining_.end() != other.remaining_.end()) || (reached_tail_ != other.reached_tail_);
  670. }
  671. bool operator==(iterator const &other) const noexcept {
  672. return (remaining_.end() == other.remaining_.end()) && (reached_tail_ == other.reached_tail_);
  673. }
  674. bool operator!=(end_sentinel_type) const noexcept { return !remaining_.empty() || !reached_tail_; }
  675. bool operator==(end_sentinel_type) const noexcept { return remaining_.empty() && reached_tail_; }
  676. bool is_last() const noexcept { return remaining_.size() == length_within_remaining_; }
  677. };
  678. iterator begin() const noexcept { return {haystack_, matcher_}; }
  679. iterator end() const noexcept { return {{haystack_.data(), 0ull}, matcher_, end_sentinel_type {}}; }
  680. size_type size() const noexcept { return static_cast<size_type>(ssize()); }
  681. difference_type ssize() const noexcept { return std::distance(begin(), end()); }
  682. constexpr bool empty() const noexcept { return false; }
  683. /**
  684. * @brief Copies the matches into a container.
  685. */
  686. template <typename container_>
  687. void to(container_ &container) {
  688. for (auto match : *this) { container.push_back(match); }
  689. }
  690. /**
  691. * @brief Copies the matches into a consumed container, returning it at the end.
  692. */
  693. template <typename container_>
  694. container_ to(container_ &&container = {}) {
  695. for (auto match : *this) { container.push_back(match); }
  696. return std::move(container);
  697. }
  698. };
  699. /**
  700. * @brief Find all potentially @b overlapping inclusions of a needle substring.
  701. * @tparam string A string-like type, ideally a view, like StringZilla or STL `string_view`.
  702. */
  703. template <typename string>
  704. range_matches<string, matcher_find<string, include_overlaps_type>> find_all(string const &h, string const &n,
  705. include_overlaps_type = {}) noexcept {
  706. return {h, n};
  707. }
  708. /**
  709. * @brief Find all potentially @b overlapping inclusions of a needle substring in @b reverse order.
  710. * @tparam string A string-like type, ideally a view, like StringZilla or STL `string_view`.
  711. */
  712. template <typename string>
  713. range_rmatches<string, matcher_rfind<string, include_overlaps_type>> rfind_all(string const &h, string const &n,
  714. include_overlaps_type = {}) noexcept {
  715. return {h, n};
  716. }
  717. /**
  718. * @brief Find all @b non-overlapping inclusions of a needle substring.
  719. * @tparam string A string-like type, ideally a view, like StringZilla or STL `string_view`.
  720. */
  721. template <typename string>
  722. range_matches<string, matcher_find<string, exclude_overlaps_type>> find_all(string const &h, string const &n,
  723. exclude_overlaps_type) noexcept {
  724. return {h, n};
  725. }
  726. /**
  727. * @brief Find all @b non-overlapping inclusions of a needle substring in @b reverse order.
  728. * @tparam string A string-like type, ideally a view, like StringZilla or STL `string_view`.
  729. */
  730. template <typename string>
  731. range_rmatches<string, matcher_rfind<string, exclude_overlaps_type>> rfind_all(string const &h, string const &n,
  732. exclude_overlaps_type) noexcept {
  733. return {h, n};
  734. }
  735. /**
  736. * @brief Find all inclusions of characters from the second string.
  737. * @tparam string A string-like type, ideally a view, like StringZilla or STL `string_view`.
  738. */
  739. template <typename string>
  740. range_matches<string, matcher_find_first_of<string>> find_all_characters(string const &h, string const &n) noexcept {
  741. return {h, n};
  742. }
  743. /**
  744. * @brief Find all inclusions of characters from the second string in @b reverse order.
  745. * @tparam string A string-like type, ideally a view, like StringZilla or STL `string_view`.
  746. */
  747. template <typename string>
  748. range_rmatches<string, matcher_find_last_of<string>> rfind_all_characters(string const &h, string const &n) noexcept {
  749. return {h, n};
  750. }
  751. /**
  752. * @brief Find all characters except the ones in the second string.
  753. * @tparam string A string-like type, ideally a view, like StringZilla or STL `string_view`.
  754. */
  755. template <typename string>
  756. range_matches<string, matcher_find_first_not_of<string>> find_all_other_characters(string const &h,
  757. string const &n) noexcept {
  758. return {h, n};
  759. }
  760. /**
  761. * @brief Find all characters except the ones in the second string in @b reverse order.
  762. * @tparam string A string-like type, ideally a view, like StringZilla or STL `string_view`.
  763. */
  764. template <typename string>
  765. range_rmatches<string, matcher_find_last_not_of<string>> rfind_all_other_characters(string const &h,
  766. string const &n) noexcept {
  767. return {h, n};
  768. }
  769. /**
  770. * @brief Splits a string around every @b non-overlapping inclusion of the second string.
  771. * @tparam string A string-like type, ideally a view, like StringZilla or STL `string_view`.
  772. */
  773. template <typename string>
  774. range_splits<string, matcher_find<string, exclude_overlaps_type>> split(string const &h, string const &n) noexcept {
  775. return {h, n};
  776. }
  777. /**
  778. * @brief Splits a string around every @b non-overlapping inclusion of the second string in @b reverse order.
  779. * @tparam string A string-like type, ideally a view, like StringZilla or STL `string_view`.
  780. */
  781. template <typename string>
  782. range_rsplits<string, matcher_rfind<string, exclude_overlaps_type>> rsplit(string const &h, string const &n) noexcept {
  783. return {h, n};
  784. }
  785. /**
  786. * @brief Splits a string around every character from the second string.
  787. * @tparam string A string-like type, ideally a view, like StringZilla or STL `string_view`.
  788. */
  789. template <typename string>
  790. range_splits<string, matcher_find_first_of<string>> split_characters(string const &h, string const &n) noexcept {
  791. return {h, n};
  792. }
  793. /**
  794. * @brief Splits a string around every character from the second string in @b reverse order.
  795. * @tparam string A string-like type, ideally a view, like StringZilla or STL `string_view`.
  796. */
  797. template <typename string>
  798. range_rsplits<string, matcher_find_last_of<string>> rsplit_characters(string const &h, string const &n) noexcept {
  799. return {h, n};
  800. }
  801. /**
  802. * @brief Splits a string around every character except the ones from the second string.
  803. * @tparam string A string-like type, ideally a view, like StringZilla or STL `string_view`.
  804. */
  805. template <typename string>
  806. range_splits<string, matcher_find_first_not_of<string>> split_other_characters(string const &h,
  807. string const &n) noexcept {
  808. return {h, n};
  809. }
  810. /**
  811. * @brief Splits a string around every character except the ones from the second string in @b reverse order.
  812. * @tparam string A string-like type, ideally a view, like StringZilla or STL `string_view`.
  813. */
  814. template <typename string>
  815. range_rsplits<string, matcher_find_last_not_of<string>> rsplit_other_characters(string const &h,
  816. string const &n) noexcept {
  817. return {h, n};
  818. }
  819. /** @brief Helper function using `std::advance` iterator and return it back. */
  820. template <typename iterator_type, typename distance_type>
  821. iterator_type advanced(iterator_type &&it, distance_type n) {
  822. std::advance(it, n);
  823. return it;
  824. }
  825. /** @brief Helper function using `range_length` to compute the unsigned distance. */
  826. template <typename iterator_type>
  827. std::size_t range_length(iterator_type first, iterator_type last) {
  828. return static_cast<std::size_t>(std::distance(first, last));
  829. }
  830. #pragma endregion
  831. #pragma region Global Operations with Dynamic Memory
  832. template <typename allocator_type_>
  833. static void *_call_allocate(sz_size_t n, void *allocator_state) noexcept {
  834. return reinterpret_cast<allocator_type_ *>(allocator_state)->allocate(n);
  835. }
  836. template <typename allocator_type_>
  837. static void _call_free(void *ptr, sz_size_t n, void *allocator_state) noexcept {
  838. return reinterpret_cast<allocator_type_ *>(allocator_state)->deallocate(reinterpret_cast<char *>(ptr), n);
  839. }
  840. template <typename generator_type_>
  841. static sz_u64_t _call_random_generator(void *state) noexcept {
  842. generator_type_ &generator = *reinterpret_cast<generator_type_ *>(state);
  843. return generator();
  844. }
  845. template <typename allocator_type_, typename allocator_callback_>
  846. static bool _with_alloc(allocator_type_ &allocator, allocator_callback_ &&callback) noexcept {
  847. sz_memory_allocator_t alloc;
  848. alloc.allocate = &_call_allocate<allocator_type_>;
  849. alloc.free = &_call_free<allocator_type_>;
  850. alloc.handle = &allocator;
  851. return callback(alloc);
  852. }
  853. template <typename allocator_type_, typename allocator_callback_>
  854. static bool _with_alloc(allocator_callback_ &&callback) noexcept {
  855. allocator_type_ allocator;
  856. return _with_alloc(allocator, std::forward<allocator_callback_>(callback));
  857. }
  858. #pragma endregion
  859. #pragma region Helper Template Classes
  860. /**
  861. * @brief A result of split a string once, containing the string slice ::before,
  862. * the ::match itself, and the slice ::after.
  863. */
  864. template <typename string_>
  865. struct string_partition_result {
  866. string_ before;
  867. string_ match;
  868. string_ after;
  869. };
  870. /**
  871. * @brief A reverse iterator for mutable and immutable character buffers.
  872. * Replaces `std::reverse_iterator` to avoid including `<iterator>`.
  873. */
  874. template <typename value_type_>
  875. class reversed_iterator_for {
  876. public:
  877. using iterator_category = std::random_access_iterator_tag;
  878. using value_type = value_type_;
  879. using difference_type = std::ptrdiff_t;
  880. using pointer = value_type_ *;
  881. using reference = value_type_ &;
  882. reversed_iterator_for(pointer ptr) noexcept : ptr_(ptr) {}
  883. reference operator*() const noexcept { return *ptr_; }
  884. bool operator==(reversed_iterator_for const &other) const noexcept { return ptr_ == other.ptr_; }
  885. bool operator!=(reversed_iterator_for const &other) const noexcept { return ptr_ != other.ptr_; }
  886. reference operator[](difference_type n) const noexcept { return *(*this + n); }
  887. reversed_iterator_for operator+(difference_type n) const noexcept { return reversed_iterator_for(ptr_ - n); }
  888. reversed_iterator_for operator-(difference_type n) const noexcept { return reversed_iterator_for(ptr_ + n); }
  889. difference_type operator-(reversed_iterator_for const &other) const noexcept { return other.ptr_ - ptr_; }
  890. reversed_iterator_for &operator++() noexcept {
  891. --ptr_;
  892. return *this;
  893. }
  894. reversed_iterator_for operator++(int) const noexcept {
  895. reversed_iterator_for temp = *this;
  896. --ptr_;
  897. return temp;
  898. }
  899. reversed_iterator_for &operator--() const noexcept {
  900. ++ptr_;
  901. return *this;
  902. }
  903. reversed_iterator_for operator--(int) const noexcept {
  904. reversed_iterator_for temp = *this;
  905. ++ptr_;
  906. return temp;
  907. }
  908. private:
  909. value_type_ *ptr_;
  910. };
  911. /**
  912. * @brief An "expression template" for lazy concatenation of strings using the `operator|`.
  913. */
  914. template <typename first_type, typename second_type>
  915. struct concatenation {
  916. using value_type = typename first_type::value_type;
  917. using pointer = value_type *;
  918. using const_pointer = value_type const *;
  919. using size_type = typename first_type::size_type;
  920. using difference_type = typename first_type::difference_type;
  921. first_type first;
  922. second_type second;
  923. std::size_t size() const noexcept { return first.size() + second.size(); }
  924. std::size_t length() const noexcept { return first.size() + second.size(); }
  925. size_type copy(pointer destination) const noexcept {
  926. first.copy(destination);
  927. second.copy(destination + first.length());
  928. return first.length() + second.length();
  929. }
  930. size_type copy(pointer destination, size_type length) const noexcept {
  931. auto first_length = std::min(first.length(), length);
  932. auto second_length = std::min(second.length(), length - first_length);
  933. first.copy(destination, first_length);
  934. second.copy(destination + first_length, second_length);
  935. return first_length + second_length;
  936. }
  937. template <typename last_type>
  938. concatenation<concatenation<first_type, second_type>, last_type> operator|(last_type &&last) const {
  939. return {*this, last};
  940. }
  941. };
  942. #pragma endregion
  943. #pragma region String Views/Spans
  944. /**
  945. * @brief A string slice (view/span) class implementing a superset of C++23 functionality
  946. * with much faster SIMD-accelerated substring search and approximate matching.
  947. * Constructors are `constexpr` enabling `_sz` literals.
  948. *
  949. * @tparam char_type_ The character type, usually `char const` or `char`. Must be a single byte long.
  950. */
  951. template <typename char_type_>
  952. class basic_string_slice {
  953. static_assert(sizeof(char_type_) == 1, "Characters must be a single byte long");
  954. static_assert(std::is_reference<char_type_>::value == false, "Characters can't be references");
  955. using char_type = char_type_;
  956. using mutable_char_type = typename std::remove_const<char_type_>::type;
  957. using immutable_char_type = typename std::add_const<char_type_>::type;
  958. char_type *start_;
  959. std::size_t length_;
  960. public:
  961. // STL compatibility
  962. using traits_type = std::char_traits<char_type_>;
  963. using value_type = mutable_char_type;
  964. using pointer = char_type *;
  965. using const_pointer = immutable_char_type *;
  966. using reference = char_type &;
  967. using const_reference = immutable_char_type &;
  968. using const_iterator = const_pointer;
  969. using iterator = pointer;
  970. using reverse_iterator = reversed_iterator_for<char_type>;
  971. using const_reverse_iterator = reversed_iterator_for<immutable_char_type>;
  972. using size_type = std::size_t;
  973. using difference_type = std::ptrdiff_t;
  974. // Non-STL type definitions
  975. using string_slice = basic_string_slice<char_type>;
  976. using string_span = basic_string_slice<mutable_char_type>;
  977. using string_view = basic_string_slice<immutable_char_type>;
  978. using partition_type = string_partition_result<string_slice>;
  979. /** @brief Special value for missing matches.
  980. *
  981. * We take the largest 63-bit unsigned integer on 64-bit machines.
  982. * We take the largest 31-bit unsigned integer on 32-bit machines.
  983. */
  984. static constexpr size_type npos = SZ_SSIZE_MAX;
  985. #pragma region Constructors and STL Utilities
  986. constexpr basic_string_slice() noexcept : start_(nullptr), length_(0) {}
  987. constexpr basic_string_slice(pointer c_string) noexcept
  988. : start_(c_string), length_(null_terminated_length(c_string)) {}
  989. constexpr basic_string_slice(pointer c_string, size_type length) noexcept : start_(c_string), length_(length) {}
  990. sz_constexpr_if_cpp20 basic_string_slice(basic_string_slice const &other) noexcept = default;
  991. sz_constexpr_if_cpp20 basic_string_slice &operator=(basic_string_slice const &other) noexcept = default;
  992. basic_string_slice(std::nullptr_t) = delete;
  993. /** @brief Exchanges the view with that of the `other`. */
  994. void swap(string_slice &other) noexcept { std::swap(start_, other.start_), std::swap(length_, other.length_); }
  995. #if !SZ_AVOID_STL
  996. template <typename sfinae_ = char_type, typename std::enable_if<std::is_const<sfinae_>::value, int>::type = 0>
  997. sz_constexpr_if_cpp20 basic_string_slice(std::string const &other) noexcept
  998. : basic_string_slice(other.data(), other.size()) {}
  999. template <typename sfinae_ = char_type, typename std::enable_if<!std::is_const<sfinae_>::value, int>::type = 0>
  1000. sz_constexpr_if_cpp20 basic_string_slice(std::string &other) noexcept
  1001. : basic_string_slice(&other[0], other.size()) {} // The `.data()` has mutable variant only since C++17
  1002. template <typename sfinae_ = char_type, typename std::enable_if<std::is_const<sfinae_>::value, int>::type = 0>
  1003. sz_constexpr_if_cpp20 string_slice &operator=(std::string const &other) noexcept {
  1004. return assign({other.data(), other.size()});
  1005. }
  1006. template <typename sfinae_ = char_type, typename std::enable_if<!std::is_const<sfinae_>::value, int>::type = 0>
  1007. sz_constexpr_if_cpp20 string_slice &operator=(std::string &other) noexcept {
  1008. return assign({other.data(), other.size()});
  1009. }
  1010. operator std::string() const { return {data(), size()}; }
  1011. /**
  1012. * @brief Formatted output function for compatibility with STL's `std::basic_ostream`.
  1013. * @throw `std::ios_base::failure` if an exception occurred during output.
  1014. */
  1015. template <typename stream_traits>
  1016. friend std::basic_ostream<value_type, stream_traits> &operator<<(std::basic_ostream<value_type, stream_traits> &os,
  1017. string_slice const &str) noexcept(false) {
  1018. return os.write(str.data(), str.size());
  1019. }
  1020. #if SZ_DETECT_CPP_17 && __cpp_lib_string_view
/** @brief Constructs an immutable view from an `std::string_view`. Participates only if `char_type` is constant. */
template <typename sfinae_ = char_type, typename std::enable_if<std::is_const<sfinae_>::value, int>::type = 0>
sz_constexpr_if_cpp20 basic_string_slice(std::string_view const &other) noexcept
: basic_string_slice(other.data(), other.size()) {}
  1024. template <typename sfinae_ = char_type, typename std::enable_if<std::is_const<sfinae_>::value, int>::type = 0>
  1025. sz_constexpr_if_cpp20 string_slice &operator=(std::string_view const &other) noexcept {
  1026. return assign({other.data(), other.size()});
  1027. }
  1028. operator std::string_view() const noexcept { return {data(), size()}; }
  1029. #endif
  1030. #endif
  1031. #pragma endregion
  1032. #pragma region Iterators and Element Access
/** @brief Iterator built from the address of the first character. */
iterator begin() const noexcept { return iterator(start_); }
/** @brief Iterator built from the address one past the last character. */
iterator end() const noexcept { return iterator(start_ + length_); }
/** @brief Constant iterator built from the address of the first character. */
const_iterator cbegin() const noexcept { return const_iterator(start_); }
/** @brief Constant iterator built from the address one past the last character. */
const_iterator cend() const noexcept { return const_iterator(start_ + length_); }
/**
* @brief Reverse iterator built from the address of the last character.
* NOTE(review): presumably `reverse_iterator` dereferences the very address it stores, unlike
* `std::reverse_iterator`, which looks one element back - confirm against its definition.
*/
reverse_iterator rbegin() const noexcept { return reverse_iterator(start_ + length_ - 1); }
/** @brief Reverse iterator built from the address one before the first character. */
reverse_iterator rend() const noexcept { return reverse_iterator(start_ - 1); }
/** @brief Constant counterpart of `rbegin`. */
const_reverse_iterator crbegin() const noexcept { return const_reverse_iterator(start_ + length_ - 1); }
/** @brief Constant counterpart of `rend`. */
const_reverse_iterator crend() const noexcept { return const_reverse_iterator(start_ - 1); }
/** @brief Accesses the character at offset `pos`. No bounds checking is performed. */
reference operator[](size_type pos) const noexcept { return start_[pos]; }
/**
* @brief Accesses the character at offset `pos`.
* @warning Unlike STL's `at`, this performs no bounds checking and never throws.
*/
reference at(size_type pos) const noexcept { return start_[pos]; }
/** @brief Accesses the first character. No emptiness check is performed. */
reference front() const noexcept { return start_[0]; }
/** @brief Accesses the last character. No emptiness check is performed. */
reference back() const noexcept { return start_[length_ - 1]; }
/** @brief Pointer to the first character of the view. */
pointer data() const noexcept { return start_; }
/** @brief Number of characters in the view, as a signed integer. */
difference_type ssize() const noexcept { return static_cast<difference_type>(length_); }
/** @brief Number of characters in the view. */
size_type size() const noexcept { return length_; }
/** @brief Number of characters in the view, same as `size()`. */
size_type length() const noexcept { return length_; }
/** @brief The largest reported length, one less than `npos`. */
size_type max_size() const noexcept { return npos - 1; }
/** @brief Checks if the view contains no characters. */
bool empty() const noexcept { return length_ == 0; }
  1051. #pragma endregion
  1052. #pragma region Slicing
  1053. #pragma region Safe and Signed Extensions
  1054. /**
  1055. * @brief Equivalent to Python's `"abc"[-3:-1]`. Exception-safe, unlike STL's `substr`.
  1056. * Supports signed and unsigned intervals.
  1057. */
  1058. string_slice operator[](std::initializer_list<difference_type> signed_offsets) const noexcept {
  1059. assert(signed_offsets.size() == 2 && "operator[] can't take more than 2 offsets");
  1060. return sub(signed_offsets.begin()[0], signed_offsets.begin()[1]);
  1061. }
  1062. /**
  1063. * @brief Signed alternative to `at()`. Handy if you often write `str[str.size() - 2]`.
  1064. * @warning The behavior is @b undefined if the position is beyond bounds.
  1065. */
  1066. reference sat(difference_type signed_offset) const noexcept {
  1067. size_type pos = (signed_offset < 0) ? size() + signed_offset : signed_offset;
  1068. assert(pos < size() && "string_slice::sat(i) out of bounds");
  1069. return start_[pos];
  1070. }
  1071. /**
  1072. * @brief The opposite operation to `remove_prefix`, that does no bounds checking.
  1073. * @warning The behavior is @b undefined if `n > size()`.
  1074. */
  1075. string_slice front(size_type n) const noexcept {
  1076. assert(n <= size() && "string_slice::front(n) out of bounds");
  1077. return {start_, n};
  1078. }
  1079. /**
  1080. * @brief The opposite operation to `remove_prefix`, that does no bounds checking.
  1081. * @warning The behavior is @b undefined if `n > size()`.
  1082. */
  1083. string_slice back(size_type n) const noexcept {
  1084. assert(n <= size() && "string_slice::back(n) out of bounds");
  1085. return {start_ + length_ - n, n};
  1086. }
  1087. /**
  1088. * @brief Equivalent to Python's `"abc"[-3:-1]`. Exception-safe, unlike STL's `substr`.
  1089. * Supports signed and unsigned intervals.
  1090. */
  1091. string_slice sub(difference_type signed_start_offset, difference_type signed_end_offset = npos) const noexcept {
  1092. sz_size_t normalized_offset, normalized_length;
  1093. sz_ssize_clamp_interval(length_, signed_start_offset, signed_end_offset, &normalized_offset,
  1094. &normalized_length);
  1095. return string_slice(start_ + normalized_offset, normalized_length);
  1096. }
  1097. /**
  1098. * @brief Exports this entire view. Not an STL function, but useful for concatenations.
  1099. * The STL variant expects at least two arguments.
  1100. */
  1101. size_type copy(value_type *destination) const noexcept {
  1102. sz_copy((sz_ptr_t)destination, start_, length_);
  1103. return length_;
  1104. }
  1105. #pragma endregion
  1106. #pragma region STL Style
  1107. /**
  1108. * @brief Removes the first `n` characters from the view.
  1109. * @warning The behavior is @b undefined if `n > size()`.
  1110. */
  1111. void remove_prefix(size_type n) noexcept { assert(n <= size()), start_ += n, length_ -= n; }
  1112. /**
  1113. * @brief Removes the last `n` characters from the view.
  1114. * @warning The behavior is @b undefined if `n > size()`.
  1115. */
  1116. void remove_suffix(size_type n) noexcept { assert(n <= size()), length_ -= n; }
  1117. /** @brief Added for STL compatibility. */
  1118. string_slice substr() const noexcept { return *this; }
  1119. /**
  1120. * @brief Return a slice of this view after first `skip` bytes.
  1121. * @throws `std::out_of_range` if `skip > size()`.
  1122. * @see `sub` for a cleaner exception-less alternative.
  1123. */
  1124. string_slice substr(size_type skip) const noexcept(false) {
  1125. if (skip > size()) throw std::out_of_range("string_slice::substr");
  1126. return string_slice(start_ + skip, length_ - skip);
  1127. }
  1128. /**
  1129. * @brief Return a slice of this view after first `skip` bytes, taking at most `count` bytes.
  1130. * @throws `std::out_of_range` if `skip > size()`.
  1131. * @see `sub` for a cleaner exception-less alternative.
  1132. */
  1133. string_slice substr(size_type skip, size_type count) const noexcept(false) {
  1134. if (skip > size()) throw std::out_of_range("string_slice::substr");
  1135. return string_slice(start_ + skip, sz_min_of_two(count, length_ - skip));
  1136. }
  1137. /**
  1138. * @brief Exports a slice of this view after first `skip` bytes, taking at most `count` bytes.
  1139. * @throws `std::out_of_range` if `skip > size()`.
  1140. * @see `sub` for a cleaner exception-less alternative.
  1141. */
  1142. size_type copy(value_type *destination, size_type count, size_type skip = 0) const noexcept(false) {
  1143. if (skip > size()) throw std::out_of_range("string_slice::copy");
  1144. count = sz_min_of_two(count, length_ - skip);
  1145. sz_copy((sz_ptr_t)destination, start_ + skip, count);
  1146. return count;
  1147. }
  1148. #pragma endregion
  1149. #pragma endregion
  1150. #pragma region Comparisons
  1151. #pragma region Whole String Comparisons
  1152. /**
  1153. * @brief Compares two strings lexicographically. If prefix matches, lengths are compared.
  1154. * @return 0 if equal, negative if `*this` is less than `other`, positive if `*this` is greater than `other`.
  1155. */
  1156. int compare(string_view other) const noexcept {
  1157. return (int)sz_order(start_, length_, other.start_, other.length_);
  1158. }
  1159. /**
  1160. * @brief Compares two strings lexicographically. If prefix matches, lengths are compared.
  1161. * Equivalent to `substr(pos1, count1).compare(other)`.
  1162. * @return 0 if equal, negative if `*this` is less than `other`, positive if `*this` is greater than `other`.
  1163. * @throw `std::out_of_range` if `pos1 > size()`.
  1164. */
  1165. int compare(size_type pos1, size_type count1, string_view other) const noexcept(false) {
  1166. return substr(pos1, count1).compare(other);
  1167. }
  1168. /**
  1169. * @brief Compares two strings lexicographically. If prefix matches, lengths are compared.
  1170. * Equivalent to `substr(pos1, count1).compare(other.substr(pos2, count2))`.
  1171. * @return 0 if equal, negative if `*this` is less than `other`, positive if `*this` is greater than `other`.
  1172. * @throw `std::out_of_range` if `pos1 > size()` or if `pos2 > other.size()`.
  1173. */
  1174. int compare(size_type pos1, size_type count1, string_view other, size_type pos2, size_type count2) const
  1175. noexcept(false) {
  1176. return substr(pos1, count1).compare(other.substr(pos2, count2));
  1177. }
  1178. /**
  1179. * @brief Compares two strings lexicographically. If prefix matches, lengths are compared.
  1180. * @return 0 if equal, negative if `*this` is less than `other`, positive if `*this` is greater than `other`.
  1181. */
  1182. int compare(const_pointer other) const noexcept { return compare(string_view(other)); }
  1183. /**
  1184. * @brief Compares two strings lexicographically. If prefix matches, lengths are compared.
  1185. * Equivalent to substr(pos1, count1).compare(other).
  1186. * @return 0 if equal, negative if `*this` is less than `other`, positive if `*this` is greater than `other`.
  1187. * @throw `std::out_of_range` if `pos1 > size()`.
  1188. */
  1189. int compare(size_type pos1, size_type count1, const_pointer other) const noexcept(false) {
  1190. return substr(pos1, count1).compare(string_view(other));
  1191. }
  1192. /**
  1193. * @brief Compares two strings lexicographically. If prefix matches, lengths are compared.
  1194. * Equivalent to `substr(pos1, count1).compare({s, count2})`.
  1195. * @return 0 if equal, negative if `*this` is less than `other`, positive if `*this` is greater than `other`.
  1196. * @throw `std::out_of_range` if `pos1 > size()`.
  1197. */
  1198. int compare(size_type pos1, size_type count1, const_pointer other, size_type count2) const noexcept(false) {
  1199. return substr(pos1, count1).compare(string_view(other, count2));
  1200. }
  1201. /** @brief Checks if the string is equal to the other string. */
  1202. bool operator==(string_view other) const noexcept {
  1203. return size() == other.size() && sz_equal(data(), other.data(), other.size()) == sz_true_k;
  1204. }
  1205. /** @brief Checks if the string is equal to a concatenation of two strings. */
  1206. bool operator==(concatenation<string_view, string_view> const &other) const noexcept {
  1207. return size() == other.size() && sz_equal(data(), other.first.data(), other.first.size()) == sz_true_k &&
  1208. sz_equal(data() + other.first.size(), other.second.data(), other.second.size()) == sz_true_k;
  1209. }
  1210. #if SZ_DETECT_CPP20
  1211. /** @brief Computes the lexicographic ordering between this and the ::other string. */
  1212. std::strong_ordering operator<=>(string_view other) const noexcept {
  1213. std::strong_ordering orders[3] {std::strong_ordering::less, std::strong_ordering::equal,
  1214. std::strong_ordering::greater};
  1215. return orders[compare(other) + 1];
  1216. }
  1217. #else
  1218. /** @brief Checks if the string is not equal to the other string. */
  1219. bool operator!=(string_view other) const noexcept { return !operator==(other); }
  1220. /** @brief Checks if the string is lexicographically smaller than the other string. */
  1221. bool operator<(string_view other) const noexcept { return compare(other) == sz_less_k; }
  1222. /** @brief Checks if the string is lexicographically equal or smaller than the other string. */
  1223. bool operator<=(string_view other) const noexcept { return compare(other) != sz_greater_k; }
  1224. /** @brief Checks if the string is lexicographically greater than the other string. */
  1225. bool operator>(string_view other) const noexcept { return compare(other) == sz_greater_k; }
  1226. /** @brief Checks if the string is lexicographically equal or greater than the other string. */
  1227. bool operator>=(string_view other) const noexcept { return compare(other) != sz_less_k; }
  1228. #endif
  1229. #pragma endregion
  1230. #pragma region Prefix and Suffix Comparisons
  1231. /** @brief Checks if the string starts with the other string. */
  1232. bool starts_with(string_view other) const noexcept {
  1233. return length_ >= other.length_ && sz_equal(start_, other.start_, other.length_) == sz_true_k;
  1234. }
  1235. /** @brief Checks if the string starts with the other string. */
  1236. bool starts_with(const_pointer other) const noexcept {
  1237. auto other_length = null_terminated_length(other);
  1238. return length_ >= other_length && sz_equal(start_, other, other_length) == sz_true_k;
  1239. }
  1240. /** @brief Checks if the string starts with the other character. */
  1241. bool starts_with(value_type other) const noexcept { return length_ && start_[0] == other; }
  1242. /** @brief Checks if the string ends with the other string. */
  1243. bool ends_with(string_view other) const noexcept {
  1244. return length_ >= other.length_ &&
  1245. sz_equal(start_ + length_ - other.length_, other.start_, other.length_) == sz_true_k;
  1246. }
  1247. /** @brief Checks if the string ends with the other string. */
  1248. bool ends_with(const_pointer other) const noexcept {
  1249. auto other_length = null_terminated_length(other);
  1250. return length_ >= other_length && sz_equal(start_ + length_ - other_length, other, other_length) == sz_true_k;
  1251. }
  1252. /** @brief Checks if the string ends with the other character. */
  1253. bool ends_with(value_type other) const noexcept { return length_ && start_[length_ - 1] == other; }
  1254. /** @brief Python-like convenience function, dropping the matching prefix. */
  1255. string_slice remove_prefix(string_view other) const noexcept {
  1256. return starts_with(other) ? string_slice {start_ + other.length_, length_ - other.length_} : *this;
  1257. }
  1258. /** @brief Python-like convenience function, dropping the matching suffix. */
  1259. string_slice remove_suffix(string_view other) const noexcept {
  1260. return ends_with(other) ? string_slice {start_, length_ - other.length_} : *this;
  1261. }
  1262. #pragma endregion
  1263. #pragma endregion
  1264. #pragma region Matching Substrings
/** @brief Checks if `find` locates the `other` substring anywhere in this string. */
bool contains(string_view other) const noexcept { return find(other) != npos; }
/** @brief Checks if `find` locates the `character` anywhere in this string. */
bool contains(value_type character) const noexcept { return find(character) != npos; }
/**
* @brief Checks if `find` locates `other` anywhere in this string.
* NOTE(review): presumably `other` is NULL-terminated and is implicitly converted
* into a view by the `find` overload being selected - confirm.
*/
bool contains(const_pointer other) const noexcept { return find(other) != npos; }
  1268. #pragma region Returning offsets
  1269. /**
  1270. * @brief Find the first occurrence of a substring, skipping the first `skip` characters.
  1271. * The behavior is @b undefined if `skip > size()`.
  1272. * @return The offset of the first character of the match, or `npos` if not found.
  1273. */
  1274. size_type find(string_view other, size_type skip = 0) const noexcept {
  1275. auto ptr = sz_find(start_ + skip, length_ - skip, other.start_, other.length_);
  1276. return ptr ? ptr - start_ : npos;
  1277. }
  1278. /**
  1279. * @brief Find the first occurrence of a character, skipping the first `skip` characters.
  1280. * The behavior is @b undefined if `skip > size()`.
  1281. * @return The offset of the match, or `npos` if not found.
  1282. */
  1283. size_type find(value_type character, size_type skip = 0) const noexcept {
  1284. auto ptr = sz_find_byte(start_ + skip, length_ - skip, &character);
  1285. return ptr ? ptr - start_ : npos;
  1286. }
  1287. /**
  1288. * @brief Find the first occurrence of a substring, skipping the first `skip` characters.
  1289. * The behavior is @b undefined if `skip > size()`.
  1290. * @return The offset of the first character of the match, or `npos` if not found.
  1291. */
  1292. size_type find(const_pointer other, size_type pos, size_type count) const noexcept {
  1293. return find(string_view(other, count), pos);
  1294. }
  1295. /**
  1296. * @brief Find the last occurrence of a substring.
  1297. * @return The offset of the first character of the match, or `npos` if not found.
  1298. */
  1299. size_type rfind(string_view other) const noexcept {
  1300. auto ptr = sz_rfind(start_, length_, other.start_, other.length_);
  1301. return ptr ? ptr - start_ : npos;
  1302. }
  1303. /**
  1304. * @brief Find the last occurrence of a substring, within first `until` characters.
  1305. * @return The offset of the first character of the match, or `npos` if not found.
  1306. */
  1307. size_type rfind(string_view other, size_type until) const noexcept(false) {
  1308. return until + other.size() < length_ ? substr(0, until + other.size()).rfind(other) : rfind(other);
  1309. }
  1310. /**
  1311. * @brief Find the last occurrence of a character.
  1312. * @return The offset of the match, or `npos` if not found.
  1313. */
  1314. size_type rfind(value_type character) const noexcept {
  1315. auto ptr = sz_rfind_byte(start_, length_, &character);
  1316. return ptr ? ptr - start_ : npos;
  1317. }
  1318. /**
  1319. * @brief Find the last occurrence of a character, within first `until` characters.
  1320. * @return The offset of the match, or `npos` if not found.
  1321. */
  1322. size_type rfind(value_type character, size_type until) const noexcept {
  1323. return until < length_ ? substr(0, until + 1).rfind(character) : rfind(character);
  1324. }
  1325. /**
  1326. * @brief Find the last occurrence of a substring, within first `until` characters.
  1327. * @return The offset of the first character of the match, or `npos` if not found.
  1328. */
  1329. size_type rfind(const_pointer other, size_type until, size_type count) const noexcept {
  1330. return rfind(string_view(other, count), until);
  1331. }
/** @brief Find the first occurrence of a character from a set. Same as `find_first_of(set)`. */
size_type find(char_set set) const noexcept { return find_first_of(set); }
/** @brief Find the last occurrence of a character from a set. Same as `find_last_of(set)`. */
size_type rfind(char_set set) const noexcept { return find_last_of(set); }
  1336. #pragma endregion
  1337. #pragma region Returning Partitions
  1338. /** @brief Split the string into three parts, before the match, the match itself, and after it. */
  1339. partition_type partition(string_view pattern) const noexcept { return partition_(pattern, pattern.length()); }
  1340. /** @brief Split the string into three parts, before the match, the match itself, and after it. */
  1341. partition_type partition(char_set pattern) const noexcept { return partition_(pattern, 1); }
  1342. /** @brief Split the string into three parts, before the @b last match, the last match itself, and after it. */
  1343. partition_type rpartition(string_view pattern) const noexcept { return rpartition_(pattern, pattern.length()); }
  1344. /** @brief Split the string into three parts, before the @b last match, the last match itself, and after it. */
  1345. partition_type rpartition(char_set pattern) const noexcept { return rpartition_(pattern, 1); }
  1346. #pragma endregion
  1347. #pragma endregion
  1348. #pragma region Matching Character Sets
// `isascii` is a macro in MSVC headers
/** @brief Checks that `find_first_not_of(set)` finds no character outside of the `set`. */
bool contains_only(char_set set) const noexcept { return find_first_not_of(set) == npos; }
/** @brief Checks if the string is not empty and contains only characters from `ascii_letters_set()`. */
bool is_alpha() const noexcept { return !empty() && contains_only(ascii_letters_set()); }
/** @brief Checks if the string is not empty and contains only characters from the letters and digits sets. */
bool is_alnum() const noexcept { return !empty() && contains_only(ascii_letters_set() | digits_set()); }
/** @brief Checks if the string is empty or contains only characters from the controls and printables sets. */
bool is_ascii() const noexcept { return empty() || contains_only(ascii_controls_set() | ascii_printables_set()); }
/** @brief Checks if the string is not empty and contains only characters from `digits_set()`. */
bool is_digit() const noexcept { return !empty() && contains_only(digits_set()); }
/** @brief Checks if the string is not empty and contains only characters from `ascii_lowercase_set()`. */
bool is_lower() const noexcept { return !empty() && contains_only(ascii_lowercase_set()); }
/** @brief Checks if the string is not empty and contains only characters from `whitespaces_set()`. */
bool is_space() const noexcept { return !empty() && contains_only(whitespaces_set()); }
/** @brief Checks if the string is not empty and contains only characters from `ascii_uppercase_set()`. */
bool is_upper() const noexcept { return !empty() && contains_only(ascii_uppercase_set()); }
/** @brief Checks if the string is empty or contains only characters from `ascii_printables_set()`. */
bool is_printable() const noexcept { return empty() || contains_only(ascii_printables_set()); }
  1359. #pragma region Character Set Arguments
  1360. /**
  1361. * @brief Find the first occurrence of a character from a set.
  1362. * @param skip Number of characters to skip before the search.
  1363. * @warning The behavior is @b undefined if `skip > size()`.
  1364. */
  1365. size_type find_first_of(char_set set, size_type skip = 0) const noexcept {
  1366. auto ptr = sz_find_charset(start_ + skip, length_ - skip, &set.raw());
  1367. return ptr ? ptr - start_ : npos;
  1368. }
  1369. /**
  1370. * @brief Find the first occurrence of a character outside a set.
  1371. * @param skip The number of first characters to be skipped.
  1372. * @warning The behavior is @b undefined if `skip > size()`.
  1373. */
  1374. size_type find_first_not_of(char_set set, size_type skip = 0) const noexcept {
  1375. return find_first_of(set.inverted(), skip);
  1376. }
  1377. /**
  1378. * @brief Find the last occurrence of a character from a set.
  1379. */
  1380. size_type find_last_of(char_set set) const noexcept {
  1381. auto ptr = sz_rfind_charset(start_, length_, &set.raw());
  1382. return ptr ? ptr - start_ : npos;
  1383. }
  1384. /**
  1385. * @brief Find the last occurrence of a character outside a set.
  1386. */
  1387. size_type find_last_not_of(char_set set) const noexcept { return find_last_of(set.inverted()); }
  1388. /**
  1389. * @brief Find the last occurrence of a character from a set.
  1390. * @param until The offset of the last character to be considered.
  1391. */
  1392. size_type find_last_of(char_set set, size_type until) const noexcept {
  1393. auto len = sz_min_of_two(until + 1, length_);
  1394. auto ptr = sz_rfind_charset(start_, len, &set.raw());
  1395. return ptr ? ptr - start_ : npos;
  1396. }
  1397. /**
  1398. * @brief Find the last occurrence of a character outside a set.
  1399. * @param until The offset of the last character to be considered.
  1400. */
  1401. size_type find_last_not_of(char_set set, size_type until) const noexcept {
  1402. return find_last_of(set.inverted(), until);
  1403. }
  1404. #pragma endregion
  1405. #pragma region String Arguments
  1406. /**
  1407. * @brief Find the first occurrence of a character from a ::set.
  1408. * @param skip The number of first characters to be skipped.
  1409. */
  1410. size_type find_first_of(string_view other, size_type skip = 0) const noexcept {
  1411. return find_first_of(other.as_set(), skip);
  1412. }
  1413. /**
  1414. * @brief Find the first occurrence of a character outside a ::set.
  1415. * @param skip The number of first characters to be skipped.
  1416. */
  1417. size_type find_first_not_of(string_view other, size_type skip = 0) const noexcept {
  1418. return find_first_not_of(other.as_set(), skip);
  1419. }
  1420. /**
  1421. * @brief Find the last occurrence of a character from a ::set.
  1422. * @param until The offset of the last character to be considered.
  1423. */
  1424. size_type find_last_of(string_view other, size_type until = npos) const noexcept {
  1425. return find_last_of(other.as_set(), until);
  1426. }
  1427. /**
  1428. * @brief Find the last occurrence of a character outside a ::set.
  1429. * @param until The offset of the last character to be considered.
  1430. */
  1431. size_type find_last_not_of(string_view other, size_type until = npos) const noexcept {
  1432. return find_last_not_of(other.as_set(), until);
  1433. }
  1434. #pragma endregion
  1435. #pragma region C-Style Arguments
  1436. /**
  1437. * @brief Find the first occurrence of a character from a set.
  1438. * @param skip The number of first characters to be skipped.
  1439. * @warning The behavior is @b undefined if `skip > size()`.
  1440. */
  1441. size_type find_first_of(const_pointer other, size_type skip, size_type count) const noexcept {
  1442. return find_first_of(string_view(other, count), skip);
  1443. }
  1444. /**
  1445. * @brief Find the first occurrence of a character outside a set.
  1446. * @param skip The number of first characters to be skipped.
  1447. * @warning The behavior is @b undefined if `skip > size()`.
  1448. */
  1449. size_type find_first_not_of(const_pointer other, size_type skip, size_type count) const noexcept {
  1450. return find_first_not_of(string_view(other, count), skip);
  1451. }
  1452. /**
  1453. * @brief Find the last occurrence of a character from a set.
  1454. * @param until The number of first characters to be considered.
  1455. */
  1456. size_type find_last_of(const_pointer other, size_type until, size_type count) const noexcept {
  1457. return find_last_of(string_view(other, count), until);
  1458. }
  1459. /**
  1460. * @brief Find the last occurrence of a character outside a set.
  1461. * @param until The number of first characters to be considered.
  1462. */
  1463. size_type find_last_not_of(const_pointer other, size_type until, size_type count) const noexcept {
  1464. return find_last_not_of(string_view(other, count), until);
  1465. }
  1466. #pragma endregion
  1467. #pragma region Slicing
  1468. /**
  1469. * @brief Python-like convenience function, dropping prefix formed of given characters.
  1470. * Similar to `boost::algorithm::trim_left_if(str, is_any_of(set))`.
  1471. */
  1472. string_slice lstrip(char_set set) const noexcept {
  1473. set = set.inverted();
  1474. auto new_start = sz_find_charset(start_, length_, &set.raw());
  1475. return new_start ? string_slice {new_start, length_ - static_cast<size_type>(new_start - start_)}
  1476. : string_slice();
  1477. }
  1478. /**
  1479. * @brief Python-like convenience function, dropping suffix formed of given characters.
  1480. * Similar to `boost::algorithm::trim_right_if(str, is_any_of(set))`.
  1481. */
  1482. string_slice rstrip(char_set set) const noexcept {
  1483. set = set.inverted();
  1484. auto new_end = sz_rfind_charset(start_, length_, &set.raw());
  1485. return new_end ? string_slice {start_, static_cast<size_type>(new_end - start_ + 1)} : string_slice();
  1486. }
  1487. /**
  1488. * @brief Python-like convenience function, dropping both the prefix & the suffix formed of given characters.
  1489. * Similar to `boost::algorithm::trim_if(str, is_any_of(set))`.
  1490. */
  1491. string_slice strip(char_set set) const noexcept {
  1492. set = set.inverted();
  1493. auto new_start = sz_find_charset(start_, length_, &set.raw());
  1494. return new_start ? string_slice {new_start,
  1495. static_cast<size_type>(
  1496. sz_rfind_charset(new_start, length_ - (new_start - start_), &set.raw()) -
  1497. new_start + 1)}
  1498. : string_slice();
  1499. }
  1500. #pragma endregion
  1501. #pragma endregion
  1502. #pragma region Search Ranges
// Range types returned by the `find_all` and `rfind_all` overloads below: substring matches
// including overlaps, substring matches excluding overlaps, and matches of characters from a set.
using find_all_type = range_matches<string_slice, matcher_find<string_view, include_overlaps_type>>;
using rfind_all_type = range_rmatches<string_slice, matcher_rfind<string_view, include_overlaps_type>>;
using find_disjoint_type = range_matches<string_slice, matcher_find<string_view, exclude_overlaps_type>>;
using rfind_disjoint_type = range_rmatches<string_slice, matcher_rfind<string_view, exclude_overlaps_type>>;
using find_all_chars_type = range_matches<string_slice, matcher_find_first_of<string_view, char_set>>;
using rfind_all_chars_type = range_rmatches<string_slice, matcher_find_last_of<string_view, char_set>>;
/** @brief Find all potentially @b overlapping occurrences of a given string. */
find_all_type find_all(string_view needle, include_overlaps_type = {}) const noexcept { return {*this, needle}; }
/** @brief Find all potentially @b overlapping occurrences of a given string in @b reverse order. */
rfind_all_type rfind_all(string_view needle, include_overlaps_type = {}) const noexcept { return {*this, needle}; }
/** @brief Find all @b non-overlapping occurrences of a given string. Selected by the `exclude_overlaps_type` tag. */
find_disjoint_type find_all(string_view needle, exclude_overlaps_type) const noexcept { return {*this, needle}; }
/** @brief Find all @b non-overlapping occurrences of a given string in @b reverse order. */
rfind_disjoint_type rfind_all(string_view needle, exclude_overlaps_type) const noexcept { return {*this, needle}; }
/** @brief Find all occurrences of given characters. */
find_all_chars_type find_all(char_set set) const noexcept { return {*this, {set}}; }
/** @brief Find all occurrences of given characters in @b reverse order. */
rfind_all_chars_type rfind_all(char_set set) const noexcept { return {*this, {set}}; }
// Range types returned by the `split` and `rsplit` overloads below, for substring
// delimiters and for delimiters picked from a set of characters.
using split_type = range_splits<string_slice, matcher_find<string_view, exclude_overlaps_type>>;
using rsplit_type = range_rsplits<string_slice, matcher_rfind<string_view, exclude_overlaps_type>>;
using split_chars_type = range_splits<string_slice, matcher_find_first_of<string_view, char_set>>;
using rsplit_chars_type = range_rsplits<string_slice, matcher_find_last_of<string_view, char_set>>;
/** @brief Split around occurrences of a given string. */
split_type split(string_view delimiter) const noexcept { return {*this, delimiter}; }
/** @brief Split around occurrences of a given string in @b reverse order. */
rsplit_type rsplit(string_view delimiter) const noexcept { return {*this, delimiter}; }
/** @brief Split around occurrences of given characters, `whitespaces_set()` by default. */
split_chars_type split(char_set set = whitespaces_set()) const noexcept { return {*this, {set}}; }
/** @brief Split around occurrences of given characters in @b reverse order, `whitespaces_set()` by default. */
rsplit_chars_type rsplit(char_set set = whitespaces_set()) const noexcept { return {*this, {set}}; }
  1533. /** @brief Split around the occurrences of all newline characters. */
  1534. split_chars_type splitlines() const noexcept { return split(newlines_set); }
  1535. #pragma endregion
  1536. /** @brief Hashes the string, equivalent to `std::hash<string_view>{}(str)`. */
  1537. size_type hash() const noexcept { return static_cast<size_type>(sz_hash(start_, length_)); }
  1538. /** @brief Populate a character set with characters present in this string. */
  1539. char_set as_set() const noexcept {
  1540. char_set set;
  1541. for (auto c : *this) set.add(c);
  1542. return set;
  1543. }
  1544. private:
  1545. sz_constexpr_if_cpp20 string_view &assign(string_view const &other) noexcept {
  1546. start_ = other.start_;
  1547. length_ = other.length_;
  1548. return *this;
  1549. }
  1550. sz_constexpr_if_cpp20 static size_type null_terminated_length(const_pointer s) noexcept {
  1551. const_pointer p = s;
  1552. while (*p) ++p;
  1553. return p - s;
  1554. }
  1555. template <typename pattern_>
  1556. partition_type partition_(pattern_ &&pattern, std::size_t pattern_length) const noexcept {
  1557. size_type pos = find(pattern);
  1558. if (pos == npos) return {*this, string_view(), string_view()};
  1559. return {string_view(start_, pos), string_view(start_ + pos, pattern_length),
  1560. string_view(start_ + pos + pattern_length, length_ - pos - pattern_length)};
  1561. }
  1562. template <typename pattern_>
  1563. partition_type rpartition_(pattern_ &&pattern, std::size_t pattern_length) const noexcept {
  1564. size_type pos = rfind(pattern);
  1565. if (pos == npos) return {*this, string_view(), string_view()};
  1566. return {string_view(start_, pos), string_view(start_ + pos, pattern_length),
  1567. string_view(start_ + pos + pattern_length, length_ - pos - pattern_length)};
  1568. }
  1569. };
  1570. #pragma endregion
  1571. /**
  1572. * @brief Memory-owning string class with a Small String Optimization.
  1573. *
  1574. * @section API
  1575. *
  1576. * Some APIs are different from `basic_string_slice`:
  1577. * * `lstrip`, `rstrip`, `strip` modify the string in-place, instead of returning a new view.
  1578. * * `sat`, `sub`, and element access has non-const overloads returning references to mutable objects.
  1579. *
  1580. * Functions defined for `basic_string`, but not present in `basic_string_slice`:
  1581. * * `replace`, `insert`, `erase`, `append`, `push_back`, `pop_back`, `resize`, `shrink_to_fit`... from STL,
   * * `try_` exception-free "try" operations that return non-zero values on success,
  1583. * * `replace_all` and `erase_all` similar to Boost,
  1584. * * `edit_distance` - Levenshtein distance computation reusing the allocator,
  1585. * * `randomize`, `random` - for fast random string generation.
  1586. *
  1587. * Functions defined for `basic_string_slice`, but not present in `basic_string`:
  1588. * * `[r]partition`, `[r]split`, `[r]find_all` missing to enforce lifetime on long operations.
   * * Note: `remove_prefix` and `remove_suffix` are defined for `basic_string` too, where they erase in-place.
  1590. *
  1591. * @section Exceptions
  1592. *
  1593. * Default constructor is `constexpr`. Move constructor and move assignment operator are `noexcept`.
  1594. * Copy constructor and copy assignment operator are not! They may throw `std::bad_alloc` if the memory
   * allocation fails. Similar to STL, `std::out_of_range` is thrown if the position argument to some of the
   * functions is out of bounds. Same as with STL, the bound checks are often asymmetric, so pay attention to docs.
  1597. * If exceptions are disabled, on failure, `std::terminate` is called.
  1598. */
  1599. template <typename char_type_, typename allocator_type_ = std::allocator<char_type_>>
  1600. class basic_string {
  1601. static_assert(sizeof(char_type_) == 1, "Characters must be a single byte long");
  1602. static_assert(std::is_reference<char_type_>::value == false, "Characters can't be references");
  1603. static_assert(std::is_const<char_type_>::value == false, "Characters must be mutable");
  1604. using char_type = char_type_;
  1605. using sz_alloc_type = sz_memory_allocator_t;
  1606. sz_string_t string_;
  1607. /**
  1608. * Stateful allocators and their support in C++ strings is extremely error-prone by design.
  1609. * Depending on traits like `propagate_on_container_copy_assignment` and `propagate_on_container_move_assignment`,
  1610. * its state will be copied from one string to another. It goes against the design of most string constructors,
  1611. * as they also receive allocator as the last argument!
  1612. */
  1613. static_assert(std::is_empty<allocator_type_>::value, "We currently only support stateless allocators");
  1614. template <typename allocator_callback>
  1615. static bool _with_alloc(allocator_callback &&callback) noexcept {
  1616. return ashvardanian::stringzilla::_with_alloc<allocator_type_>(callback);
  1617. }
  1618. bool is_internal() const noexcept { return sz_string_is_on_stack(&string_); }
  1619. void init(std::size_t length, char_type value) noexcept(false) {
  1620. sz_ptr_t start;
  1621. if (!_with_alloc(
  1622. [&](sz_alloc_type &alloc) { return (start = sz_string_init_length(&string_, length, &alloc)); }))
  1623. throw std::bad_alloc();
  1624. sz_fill(start, length, *(sz_u8_t *)&value);
  1625. }
  1626. void init(string_view other) noexcept(false) {
  1627. sz_ptr_t start;
  1628. if (!_with_alloc(
  1629. [&](sz_alloc_type &alloc) { return (start = sz_string_init_length(&string_, other.size(), &alloc)); }))
  1630. throw std::bad_alloc();
  1631. sz_copy(start, (sz_cptr_t)other.data(), other.size());
  1632. }
  1633. void move(basic_string &other) noexcept {
  1634. // We can't just assign the other string state, as its start address may be somewhere else on the stack.
  1635. sz_ptr_t string_start;
  1636. sz_size_t string_length;
  1637. sz_size_t string_space;
  1638. sz_bool_t string_is_external;
  1639. sz_string_unpack(&other.string_, &string_start, &string_length, &string_space, &string_is_external);
  1640. // Acquire the old string's value bitwise
  1641. *(&string_) = *(&other.string_);
  1642. // Reposition the string start pointer to the stack if it fits.
  1643. // Ternary condition may be optimized to a branchless version.
  1644. string_.internal.start = string_is_external ? string_.internal.start : &string_.internal.chars[0];
  1645. sz_string_init(&other.string_); // Discard the other string.
  1646. }
  1647. public:
  1648. // STL compatibility
  1649. using traits_type = std::char_traits<char_type>;
  1650. using value_type = char_type;
  1651. using pointer = char_type *;
  1652. using const_pointer = char_type const *;
  1653. using reference = char_type &;
  1654. using const_reference = char_type const &;
  1655. using const_iterator = const_pointer;
  1656. using iterator = pointer;
  1657. using const_reverse_iterator = reversed_iterator_for<char_type const>;
  1658. using reverse_iterator = reversed_iterator_for<char_type>;
  1659. using size_type = std::size_t;
  1660. using difference_type = std::ptrdiff_t;
  1661. // Non-STL type definitions
  1662. using allocator_type = allocator_type_;
  1663. using string_span = basic_string_slice<char_type>;
  1664. using string_view = basic_string_slice<typename std::add_const<char_type>::type>;
  1665. using partition_type = string_partition_result<string_view>;
  1666. /** @brief Special value for missing matches.
  1667. *
  1668. * We take the largest 63-bit unsigned integer on 64-bit machines.
  1669. * We take the largest 31-bit unsigned integer on 32-bit machines.
  1670. */
  1671. static constexpr size_type npos = SZ_SSIZE_MAX;
  1672. #pragma region Constructors and STL Utilities
  /**
   * @brief Default constructor: aims the start pointer at the internal character array
   *        and zeroes the words at indices 1, 2 and 3.
   */
  sz_constexpr_if_cpp20 basic_string() noexcept {
      // ! Instead of relying on the `sz_string_init`, we have to reimplement it to support `constexpr`.
      string_.internal.start = &string_.internal.chars[0];
      string_.words[1] = 0;
      string_.words[2] = 0;
      string_.words[3] = 0;
  }
  1680. ~basic_string() noexcept {
  1681. _with_alloc([&](sz_alloc_type &alloc) {
  1682. sz_string_free(&string_, &alloc);
  1683. return true;
  1684. });
  1685. }
  1686. basic_string(basic_string &&other) noexcept { move(other); }
  1687. basic_string &operator=(basic_string &&other) noexcept {
  1688. if (!is_internal()) {
  1689. _with_alloc([&](sz_alloc_type &alloc) {
  1690. sz_string_free(&string_, &alloc);
  1691. return true;
  1692. });
  1693. }
  1694. move(other);
  1695. return *this;
  1696. }
  /** @brief Copy constructor. Throws `std::bad_alloc` if `init` can't obtain a buffer. */
  basic_string(basic_string const &other) noexcept(false) { init(other); }
  /** @brief Copy assignment, delegating to `assign`. */
  basic_string &operator=(basic_string const &other) noexcept(false) { return assign(other); }
  /** @brief Constructs a string holding a copy of the characters referenced by ::view. */
  basic_string(string_view view) noexcept(false) { init(view); }
  /** @brief Assigns the characters referenced by ::view, delegating to `assign`. */
  basic_string &operator=(string_view view) noexcept(false) { return assign(view); }
  /** @brief Constructs a string from a C-style string, by first wrapping it into a `string_view`. */
  basic_string(const_pointer c_string) noexcept(false) : basic_string(string_view(c_string)) {}
  /** @brief Constructs a string from the first ::length characters starting at ::c_string. */
  basic_string(const_pointer c_string, size_type length) noexcept(false)
      : basic_string(string_view(c_string, length)) {}
  /** @brief Assigns a C-style string, by first wrapping it into a `string_view`. */
  basic_string &operator=(const_pointer other) noexcept(false) { return assign(string_view(other)); }
  /** @brief Construction from `nullptr` is rejected at compile time. */
  basic_string(std::nullptr_t) = delete;
  /** @brief Construct a string by repeating a certain ::character ::count times. */
  basic_string(size_type count, value_type character) noexcept(false) { init(count, character); }
  /** @brief Constructs a string from the part of ::other starting at ::pos, as selected by `substr(pos)`. */
  basic_string(basic_string const &other, size_type pos) noexcept(false) { init(string_view(other).substr(pos)); }
  /** @brief Constructs a string from the part of ::other selected by `substr(pos, count)`. */
  basic_string(basic_string const &other, size_type pos, size_type count) noexcept(false) {
      init(string_view(other).substr(pos, count));
  }
  /** @brief Constructs a string from the characters of an initializer list. */
  basic_string(std::initializer_list<value_type> list) noexcept(false) {
      init(string_view(list.begin(), list.size()));
  }
  1715. operator string_view() const noexcept { return view(); }
  1716. string_view view() const noexcept {
  1717. sz_ptr_t string_start;
  1718. sz_size_t string_length;
  1719. sz_string_range(&string_, &string_start, &string_length);
  1720. return {string_start, string_length};
  1721. }
  1722. operator string_span() noexcept { return span(); }
  1723. string_span span() noexcept {
  1724. sz_ptr_t string_start;
  1725. sz_size_t string_length;
  1726. sz_string_range(&string_, &string_start, &string_length);
  1727. return {string_start, string_length};
  1728. }
  1729. /** @brief Exchanges the string contents witt the `other` string. */
  1730. void swap(basic_string &other) noexcept {
  1731. // If at least one of the strings is on the stack, a basic `std::swap(string_, other.string_)` won't work,
  1732. // as the pointer to the stack-allocated memory will be swapped, instead of the contents.
  1733. sz_ptr_t first_start, second_start;
  1734. sz_size_t first_length, second_length;
  1735. sz_size_t first_space, second_space;
  1736. sz_bool_t first_is_external, second_is_external;
  1737. sz_string_unpack(&string_, &first_start, &first_length, &first_space, &first_is_external);
  1738. sz_string_unpack(&other.string_, &second_start, &second_length, &second_space, &second_is_external);
  1739. std::swap(string_, other.string_);
  1740. if (!first_is_external) other.string_.internal.start = &other.string_.internal.chars[0];
  1741. if (!second_is_external) string_.internal.start = &string_.internal.chars[0];
  1742. }
  1743. #if !SZ_AVOID_STL
  /** @brief Constructs a string holding a copy of the contents of an `std::string`. */
  basic_string(std::string const &other) noexcept(false) : basic_string(other.data(), other.size()) {}
  /** @brief Assigns the contents of an `std::string`, delegating to `assign`. */
  basic_string &operator=(std::string const &other) noexcept(false) { return assign({other.data(), other.size()}); }
  // As we need both `data()` and `size()`, going through `operator string_view()`
  // and `sz_string_unpack` is faster than separate invocations.
  operator std::string() const { return view(); }
  1749. /**
  1750. * @brief Formatted output function for compatibility with STL's `std::basic_ostream`.
  1751. * @throw `std::ios_base::failure` if an exception occurred during output.
  1752. */
  1753. template <typename stream_traits>
  1754. friend std::basic_ostream<value_type, stream_traits> &operator<<(std::basic_ostream<value_type, stream_traits> &os,
  1755. basic_string const &str) noexcept(false) {
  1756. return os.write(str.data(), str.size());
  1757. }
  1758. #if SZ_DETECT_CPP_17 && __cpp_lib_string_view
  /** @brief Constructs a string holding a copy of the characters referenced by an `std::string_view`. */
  basic_string(std::string_view other) noexcept(false) : basic_string(other.data(), other.size()) {}
  /** @brief Assigns the characters referenced by an `std::string_view`, delegating to `assign`. */
  basic_string &operator=(std::string_view other) noexcept(false) { return assign({other.data(), other.size()}); }
  /** @brief Implicit conversion to `std::string_view`, going through `view()`. */
  operator std::string_view() const noexcept { return view(); }
  1762. #endif
  1763. #endif
  1764. template <typename first_type, typename second_type>
  1765. explicit basic_string(concatenation<first_type, second_type> const &expression) noexcept(false) {
  1766. _with_alloc([&](sz_alloc_type &alloc) {
  1767. sz_ptr_t ptr = sz_string_init_length(&string_, expression.length(), &alloc);
  1768. if (!ptr) return false;
  1769. expression.copy(ptr);
  1770. return true;
  1771. });
  1772. }
  1773. template <typename first_type, typename second_type>
  1774. basic_string &operator=(concatenation<first_type, second_type> const &expression) noexcept(false) {
  1775. if (!try_assign(expression)) throw std::bad_alloc();
  1776. return *this;
  1777. }
  1778. #pragma endregion
  1779. #pragma region Iterators and Accessors
  1780. iterator begin() noexcept { return iterator(data()); }
  1781. const_iterator begin() const noexcept { return const_iterator(data()); }
  1782. const_iterator cbegin() const noexcept { return const_iterator(data()); }
  1783. // As we are need both `data()` and `size()`, going through `operator string_view()`
  1784. // and `sz_string_unpack` is faster than separate invocations.
  1785. iterator end() noexcept { return span().end(); }
  1786. const_iterator end() const noexcept { return view().end(); }
  1787. const_iterator cend() const noexcept { return view().end(); }
  1788. reverse_iterator rbegin() noexcept { return span().rbegin(); }
  1789. const_reverse_iterator rbegin() const noexcept { return view().rbegin(); }
  1790. const_reverse_iterator crbegin() const noexcept { return view().crbegin(); }
  1791. reverse_iterator rend() noexcept { return span().rend(); }
  1792. const_reverse_iterator rend() const noexcept { return view().rend(); }
  1793. const_reverse_iterator crend() const noexcept { return view().crend(); }
  1794. reference operator[](size_type pos) noexcept { return string_.internal.start[pos]; }
  1795. const_reference operator[](size_type pos) const noexcept { return string_.internal.start[pos]; }
  1796. reference front() noexcept { return string_.internal.start[0]; }
  1797. const_reference front() const noexcept { return string_.internal.start[0]; }
  1798. reference back() noexcept { return string_.internal.start[size() - 1]; }
  1799. const_reference back() const noexcept { return string_.internal.start[size() - 1]; }
  1800. pointer data() noexcept { return string_.internal.start; }
  1801. const_pointer data() const noexcept { return string_.internal.start; }
  1802. pointer c_str() noexcept { return string_.internal.start; }
  1803. const_pointer c_str() const noexcept { return string_.internal.start; }
  1804. reference at(size_type pos) noexcept(false) {
  1805. if (pos >= size()) throw std::out_of_range("sz::basic_string::at");
  1806. return string_.internal.start[pos];
  1807. }
  1808. const_reference at(size_type pos) const noexcept(false) {
  1809. if (pos >= size()) throw std::out_of_range("sz::basic_string::at");
  1810. return string_.internal.start[pos];
  1811. }
  1812. difference_type ssize() const noexcept { return static_cast<difference_type>(size()); }
  1813. size_type size() const noexcept { return view().size(); }
  1814. size_type length() const noexcept { return size(); }
  1815. size_type max_size() const noexcept { return npos - 1; }
  1816. bool empty() const noexcept { return string_.external.length == 0; }
  1817. size_type capacity() const noexcept {
  1818. sz_ptr_t string_start;
  1819. sz_size_t string_length;
  1820. sz_size_t string_space;
  1821. sz_bool_t string_is_external;
  1822. sz_string_unpack(&string_, &string_start, &string_length, &string_space, &string_is_external);
  1823. return string_space - 1;
  1824. }
  1825. allocator_type get_allocator() const noexcept { return {}; }
  1826. #pragma endregion
  1827. #pragma region Slicing
  1828. #pragma region Safe and Signed Extensions
  1829. /**
  1830. * @brief Equivalent to Python's `"abc"[-3:-1]`. Exception-safe, unlike STL's `substr`.
  1831. * Supports signed and unsigned intervals.
  1832. */
  1833. string_view operator[](std::initializer_list<difference_type> offsets) const noexcept { return view()[offsets]; }
  1834. string_span operator[](std::initializer_list<difference_type> offsets) noexcept { return span()[offsets]; }
  1835. /**
  1836. * @brief Signed alternative to `at()`. Handy if you often write `str[str.size() - 2]`.
  1837. * @warning The behavior is @b undefined if the position is beyond bounds.
  1838. */
  1839. value_type sat(difference_type offset) const noexcept { return view().sat(offset); }
  1840. reference sat(difference_type offset) noexcept { return span().sat(offset); }
  1841. /**
  1842. * @brief The opposite operation to `remove_prefix`, that does no bounds checking.
  1843. * @warning The behavior is @b undefined if `n > size()`.
  1844. */
  1845. string_view front(size_type n) const noexcept { return view().front(n); }
  1846. string_span front(size_type n) noexcept { return span().front(n); }
  1847. /**
  1848. * @brief The opposite operation to `remove_prefix`, that does no bounds checking.
  1849. * @warning The behavior is @b undefined if `n > size()`.
  1850. */
  1851. string_view back(size_type n) const noexcept { return view().back(n); }
  1852. string_span back(size_type n) noexcept { return span().back(n); }
  1853. /**
  1854. * @brief Equivalent to Python's `"abc"[-3:-1]`. Exception-safe, unlike STL's `substr`.
  1855. * Supports signed and unsigned intervals. @b Doesn't copy or allocate memory!
  1856. */
  1857. string_view sub(difference_type start, difference_type end = npos) const noexcept { return view().sub(start, end); }
  1858. string_span sub(difference_type start, difference_type end = npos) noexcept { return span().sub(start, end); }
  1859. /**
  1860. * @brief Exports this entire view. Not an STL function, but useful for concatenations.
  1861. * The STL variant expects at least two arguments.
  1862. */
  1863. size_type copy(value_type *destination) const noexcept { return view().copy(destination); }
  1864. #pragma endregion
  1865. #pragma region STL Style
  1866. /**
  1867. * @brief Removes the first `n` characters from the view.
  1868. * @warning The behavior is @b undefined if `n > size()`.
  1869. */
  1870. void remove_prefix(size_type n) noexcept {
  1871. assert(n <= size());
  1872. sz_string_erase(&string_, 0, n);
  1873. }
  1874. /**
  1875. * @brief Removes the last `n` characters from the view.
  1876. * @warning The behavior is @b undefined if `n > size()`.
  1877. */
  1878. void remove_suffix(size_type n) noexcept {
  1879. assert(n <= size());
  1880. sz_string_erase(&string_, size() - n, n);
  1881. }
  1882. /** @brief Added for STL compatibility. */
  1883. basic_string substr() const noexcept { return *this; }
  1884. /**
  1885. * @brief Return a slice of this view after first `skip` bytes.
  1886. * @throws `std::out_of_range` if `skip > size()`.
  1887. * @see `sub` for a cleaner exception-less alternative.
  1888. */
  1889. basic_string substr(size_type skip) const noexcept(false) { return view().substr(skip); }
  1890. /**
  1891. * @brief Return a slice of this view after first `skip` bytes, taking at most `count` bytes.
  1892. * @throws `std::out_of_range` if `skip > size()`.
  1893. * @see `sub` for a cleaner exception-less alternative.
  1894. */
  1895. basic_string substr(size_type skip, size_type count) const noexcept(false) { return view().substr(skip, count); }
  1896. /**
  1897. * @brief Exports a slice of this view after first `skip` bytes, taking at most `count` bytes.
  1898. * @throws `std::out_of_range` if `skip > size()`.
  1899. * @see `sub` for a cleaner exception-less alternative.
  1900. */
  1901. size_type copy(value_type *destination, size_type count, size_type skip = 0) const noexcept(false) {
  1902. return view().copy(destination, count, skip);
  1903. }
  1904. #pragma endregion
  1905. #pragma endregion
  1906. #pragma region Comparisons
  1907. #pragma region Whole String Comparisons
  1908. /**
  1909. * @brief Compares two strings lexicographically. If prefix matches, lengths are compared.
  1910. * @return 0 if equal, negative if `*this` is less than `other`, positive if `*this` is greater than `other`.
  1911. */
  1912. int compare(string_view other) const noexcept { return view().compare(other); }
  1913. /**
  1914. * @brief Compares two strings lexicographically. If prefix matches, lengths are compared.
  1915. * Equivalent to `substr(pos1, count1).compare(other)`.
  1916. * @return 0 if equal, negative if `*this` is less than `other`, positive if `*this` is greater than `other`.
  1917. * @throw `std::out_of_range` if `pos1 > size()`.
  1918. */
  1919. int compare(size_type pos1, size_type count1, string_view other) const noexcept(false) {
  1920. return view().compare(pos1, count1, other);
  1921. }
  1922. /**
  1923. * @brief Compares two strings lexicographically. If prefix matches, lengths are compared.
  1924. * Equivalent to `substr(pos1, count1).compare(other.substr(pos2, count2))`.
  1925. * @return 0 if equal, negative if `*this` is less than `other`, positive if `*this` is greater than `other`.
  1926. * @throw `std::out_of_range` if `pos1 > size()` or if `pos2 > other.size()`.
  1927. */
  1928. int compare(size_type pos1, size_type count1, string_view other, size_type pos2, size_type count2) const
  1929. noexcept(false) {
  1930. return view().compare(pos1, count1, other, pos2, count2);
  1931. }
  1932. /**
  1933. * @brief Compares two strings lexicographically. If prefix matches, lengths are compared.
  1934. * @return 0 if equal, negative if `*this` is less than `other`, positive if `*this` is greater than `other`.
  1935. */
  1936. int compare(const_pointer other) const noexcept { return view().compare(other); }
  1937. /**
  1938. * @brief Compares two strings lexicographically. If prefix matches, lengths are compared.
  1939. * Equivalent to substr(pos1, count1).compare(other).
  1940. * @return 0 if equal, negative if `*this` is less than `other`, positive if `*this` is greater than `other`.
  1941. * @throw `std::out_of_range` if `pos1 > size()`.
  1942. */
  1943. int compare(size_type pos1, size_type count1, const_pointer other) const noexcept(false) {
  1944. return view().compare(pos1, count1, other);
  1945. }
  1946. /**
  1947. * @brief Compares two strings lexicographically. If prefix matches, lengths are compared.
  1948. * Equivalent to `substr(pos1, count1).compare({s, count2})`.
  1949. * @return 0 if equal, negative if `*this` is less than `other`, positive if `*this` is greater than `other`.
  1950. * @throw `std::out_of_range` if `pos1 > size()`.
  1951. */
  1952. int compare(size_type pos1, size_type count1, const_pointer other, size_type count2) const noexcept(false) {
  1953. return view().compare(pos1, count1, other, count2);
  1954. }
  1955. /** @brief Checks if the string is equal to the other string. */
  1956. bool operator==(basic_string const &other) const noexcept { return view() == other.view(); }
  1957. bool operator==(string_view other) const noexcept { return view() == other; }
  1958. bool operator==(const_pointer other) const noexcept { return view() == string_view(other); }
  1959. #if SZ_DETECT_CPP20
  1960. /** @brief Computes the lexicographic ordering between this and the ::other string. */
  1961. std::strong_ordering operator<=>(basic_string const &other) const noexcept { return view() <=> other.view(); }
  1962. std::strong_ordering operator<=>(string_view other) const noexcept { return view() <=> other; }
  1963. std::strong_ordering operator<=>(const_pointer other) const noexcept { return view() <=> string_view(other); }
  1964. #else
  1965. /** @brief Checks if the string is not equal to the other string. */
  1966. bool operator!=(string_view other) const noexcept { return !operator==(other); }
  1967. /** @brief Checks if the string is lexicographically smaller than the other string. */
  1968. bool operator<(string_view other) const noexcept { return compare(other) == sz_less_k; }
  1969. /** @brief Checks if the string is lexicographically equal or smaller than the other string. */
  1970. bool operator<=(string_view other) const noexcept { return compare(other) != sz_greater_k; }
  1971. /** @brief Checks if the string is lexicographically greater than the other string. */
  1972. bool operator>(string_view other) const noexcept { return compare(other) == sz_greater_k; }
  1973. /** @brief Checks if the string is lexicographically equal or greater than the other string. */
  1974. bool operator>=(string_view other) const noexcept { return compare(other) != sz_less_k; }
  1975. #endif
  1976. #pragma endregion
  1977. #pragma region Prefix and Suffix Comparisons
  1978. /** @brief Checks if the string starts with the other string. */
  1979. bool starts_with(string_view other) const noexcept { return view().starts_with(other); }
  1980. /** @brief Checks if the string starts with the other string. */
  1981. bool starts_with(const_pointer other) const noexcept { return view().starts_with(other); }
  1982. /** @brief Checks if the string starts with the other character. */
  1983. bool starts_with(value_type other) const noexcept { return view().starts_with(other); }
  1984. /** @brief Checks if the string ends with the other string. */
  1985. bool ends_with(string_view other) const noexcept { return view().ends_with(other); }
  1986. /** @brief Checks if the string ends with the other string. */
  1987. bool ends_with(const_pointer other) const noexcept { return view().ends_with(other); }
  1988. /** @brief Checks if the string ends with the other character. */
  1989. bool ends_with(value_type other) const noexcept { return view().ends_with(other); }
  1990. #pragma endregion
  1991. #pragma endregion
  1992. #pragma region Matching Substrings
  1993. bool contains(string_view other) const noexcept { return view().contains(other); }
  1994. bool contains(value_type character) const noexcept { return view().contains(character); }
  1995. bool contains(const_pointer other) const noexcept { return view().contains(other); }
  1996. #pragma region Returning offsets
  1997. /**
  1998. * @brief Find the first occurrence of a substring, skipping the first `skip` characters.
  1999. * The behavior is @b undefined if `skip > size()`.
  2000. * @return The offset of the first character of the match, or `npos` if not found.
  2001. */
  2002. size_type find(string_view other, size_type skip = 0) const noexcept { return view().find(other, skip); }
  2003. /**
  2004. * @brief Find the first occurrence of a character, skipping the first `skip` characters.
  2005. * The behavior is @b undefined if `skip > size()`.
  2006. * @return The offset of the match, or `npos` if not found.
  2007. */
  2008. size_type find(value_type character, size_type skip = 0) const noexcept { return view().find(character, skip); }
  2009. /**
  2010. * @brief Find the first occurrence of a substring, skipping the first `skip` characters.
  2011. * The behavior is @b undefined if `skip > size()`.
  2012. * @return The offset of the first character of the match, or `npos` if not found.
  2013. */
  2014. size_type find(const_pointer other, size_type pos, size_type count) const noexcept {
  2015. return view().find(other, pos, count);
  2016. }
  2017. /**
  2018. * @brief Find the last occurrence of a substring.
  2019. * @return The offset of the first character of the match, or `npos` if not found.
  2020. */
  2021. size_type rfind(string_view other) const noexcept { return view().rfind(other); }
  2022. /**
  2023. * @brief Find the last occurrence of a substring, within first `until` characters.
  2024. * @return The offset of the first character of the match, or `npos` if not found.
  2025. */
  2026. size_type rfind(string_view other, size_type until) const noexcept { return view().rfind(other, until); }
  2027. /**
  2028. * @brief Find the last occurrence of a character.
  2029. * @return The offset of the match, or `npos` if not found.
  2030. */
  2031. size_type rfind(value_type character) const noexcept { return view().rfind(character); }
  2032. /**
  2033. * @brief Find the last occurrence of a character, within first `until` characters.
  2034. * @return The offset of the match, or `npos` if not found.
  2035. */
  2036. size_type rfind(value_type character, size_type until) const noexcept { return view().rfind(character, until); }
  2037. /**
  2038. * @brief Find the last occurrence of a substring, within first `until` characters.
  2039. * @return The offset of the first character of the match, or `npos` if not found.
  2040. */
  2041. size_type rfind(const_pointer other, size_type until, size_type count) const noexcept {
  2042. return view().rfind(other, until, count);
  2043. }
  2044. /** @brief Find the first occurrence of a character from a set. */
  2045. size_type find(char_set set) const noexcept { return view().find(set); }
  2046. /** @brief Find the last occurrence of a character from a set. */
  2047. size_type rfind(char_set set) const noexcept { return view().rfind(set); }
  2048. #pragma endregion
  2049. #pragma endregion
  2050. #pragma region Matching Character Sets
  2051. bool contains_only(char_set set) const noexcept { return find_first_not_of(set) == npos; }
  2052. bool is_alpha() const noexcept { return !empty() && contains_only(ascii_letters_set()); }
  2053. bool is_alnum() const noexcept { return !empty() && contains_only(ascii_letters_set() | digits_set()); }
  2054. bool is_ascii() const noexcept { return empty() || contains_only(ascii_controls_set() | ascii_printables_set()); }
  2055. bool is_digit() const noexcept { return !empty() && contains_only(digits_set()); }
  2056. bool is_lower() const noexcept { return !empty() && contains_only(ascii_lowercase_set()); }
  2057. bool is_space() const noexcept { return !empty() && contains_only(whitespaces_set()); }
  2058. bool is_upper() const noexcept { return !empty() && contains_only(ascii_uppercase_set()); }
  2059. bool is_printable() const noexcept { return empty() || contains_only(ascii_printables_set()); }
  2060. #pragma region Character Set Arguments
  2061. /**
  2062. * @brief Find the first occurrence of a character from a set.
  2063. * @param skip Number of characters to skip before the search.
  2064. * @warning The behavior is @b undefined if `skip > size()`.
  2065. */
  2066. size_type find_first_of(char_set set, size_type skip = 0) const noexcept { return view().find_first_of(set, skip); }
  2067. /**
  2068. * @brief Find the first occurrence of a character outside a set.
  2069. * @param skip The number of first characters to be skipped.
  2070. * @warning The behavior is @b undefined if `skip > size()`.
  2071. */
  2072. size_type find_first_not_of(char_set set, size_type skip = 0) const noexcept {
  2073. return view().find_first_not_of(set, skip);
  2074. }
  2075. /**
  2076. * @brief Find the last occurrence of a character from a set.
  2077. */
  2078. size_type find_last_of(char_set set) const noexcept { return view().find_last_of(set); }
  2079. /**
  2080. * @brief Find the last occurrence of a character outside a set.
  2081. */
  2082. size_type find_last_not_of(char_set set) const noexcept { return view().find_last_not_of(set); }
  2083. /**
  2084. * @brief Find the last occurrence of a character from a set.
  2085. * @param until The offset of the last character to be considered.
  2086. */
  2087. size_type find_last_of(char_set set, size_type until) const noexcept { return view().find_last_of(set, until); }
  2088. /**
  2089. * @brief Find the last occurrence of a character outside a set.
  2090. * @param until The offset of the last character to be considered.
  2091. */
  2092. size_type find_last_not_of(char_set set, size_type until) const noexcept {
  2093. return view().find_last_not_of(set, until);
  2094. }
  2095. #pragma endregion
  2096. #pragma region String Arguments
  2097. /**
  2098. * @brief Find the first occurrence of a character from a ::set.
  2099. * @param skip The number of first characters to be skipped.
  2100. */
  2101. size_type find_first_of(string_view other, size_type skip = 0) const noexcept {
  2102. return view().find_first_of(other, skip);
  2103. }
  2104. /**
  2105. * @brief Find the first occurrence of a character outside a ::set.
  2106. * @param skip The number of first characters to be skipped.
  2107. */
  2108. size_type find_first_not_of(string_view other, size_type skip = 0) const noexcept {
  2109. return view().find_first_not_of(other, skip);
  2110. }
  2111. /**
  2112. * @brief Find the last occurrence of a character from a ::set.
  2113. * @param until The offset of the last character to be considered.
  2114. */
  2115. size_type find_last_of(string_view other, size_type until = npos) const noexcept {
  2116. return view().find_last_of(other, until);
  2117. }
  2118. /**
  2119. * @brief Find the last occurrence of a character outside a ::set.
  2120. * @param until The offset of the last character to be considered.
  2121. */
  2122. size_type find_last_not_of(string_view other, size_type until = npos) const noexcept {
  2123. return view().find_last_not_of(other, until);
  2124. }
  2125. #pragma endregion
  2126. #pragma region C-Style Arguments
  2127. /**
  2128. * @brief Find the first occurrence of a character from a set.
  2129. * @param skip The number of first characters to be skipped.
  2130. * @warning The behavior is @b undefined if `skip > size()`.
  2131. */
  2132. size_type find_first_of(const_pointer other, size_type skip, size_type count) const noexcept {
  2133. return view().find_first_of(other, skip, count);
  2134. }
  2135. /**
  2136. * @brief Find the first occurrence of a character outside a set.
  2137. * @param skip The number of first characters to be skipped.
  2138. * @warning The behavior is @b undefined if `skip > size()`.
  2139. */
  2140. size_type find_first_not_of(const_pointer other, size_type skip, size_type count) const noexcept {
  2141. return view().find_first_not_of(other, skip, count);
  2142. }
  2143. /**
  2144. * @brief Find the last occurrence of a character from a set.
  2145. * @param until The number of first characters to be considered.
  2146. */
  2147. size_type find_last_of(const_pointer other, size_type until, size_type count) const noexcept {
  2148. return view().find_last_of(other, until, count);
  2149. }
  2150. /**
  2151. * @brief Find the last occurrence of a character outside a set.
  2152. * @param until The number of first characters to be considered.
  2153. */
  2154. size_type find_last_not_of(const_pointer other, size_type until, size_type count) const noexcept {
  2155. return view().find_last_not_of(other, until, count);
  2156. }
  2157. #pragma endregion
  2158. #pragma region Slicing
  2159. /**
  2160. * @brief Python-like convenience function, dropping prefix formed of given characters.
  2161. * Similar to `boost::algorithm::trim_left_if(str, is_any_of(set))`.
  2162. */
  2163. basic_string &lstrip(char_set set) noexcept {
  2164. auto remaining = view().lstrip(set);
  2165. remove_prefix(size() - remaining.size());
  2166. return *this;
  2167. }
  2168. /**
  2169. * @brief Python-like convenience function, dropping suffix formed of given characters.
  2170. * Similar to `boost::algorithm::trim_right_if(str, is_any_of(set))`.
  2171. */
  2172. basic_string &rstrip(char_set set) noexcept {
  2173. auto remaining = view().rstrip(set);
  2174. remove_suffix(size() - remaining.size());
  2175. return *this;
  2176. }
  2177. /**
  2178. * @brief Python-like convenience function, dropping both the prefix & the suffix formed of given characters.
  2179. * Similar to `boost::algorithm::trim_if(str, is_any_of(set))`.
  2180. */
  2181. basic_string &strip(char_set set) noexcept { return lstrip(set).rstrip(set); }
  2182. #pragma endregion
  2183. #pragma endregion
  2184. #pragma region Modifiers
  2185. #pragma region Non-STL API
  2186. bool try_resize(size_type count, value_type character = '\0') noexcept;
  2187. bool try_reserve(size_type capacity) noexcept {
  2188. return _with_alloc([&](sz_alloc_type &alloc) { return sz_string_reserve(&string_, capacity, &alloc); });
  2189. }
  2190. bool try_assign(string_view other) noexcept;
  2191. template <typename first_type, typename second_type>
  2192. bool try_assign(concatenation<first_type, second_type> const &other) noexcept;
  2193. bool try_push_back(char_type c) noexcept;
  2194. bool try_append(const_pointer str, size_type length) noexcept;
  2195. bool try_append(string_view str) noexcept { return try_append(str.data(), str.size()); }
  2196. /**
  2197. * @brief Erases ( @b in-place ) a range of characters defined with signed offsets.
  2198. * @return Number of characters removed.
  2199. */
  2200. size_type try_erase(difference_type signed_start_offset = 0, difference_type signed_end_offset = npos) noexcept {
  2201. sz_size_t normalized_offset, normalized_length;
  2202. sz_ssize_clamp_interval(size(), signed_start_offset, signed_end_offset, &normalized_offset, &normalized_length);
  2203. if (!normalized_length) return false;
  2204. sz_string_erase(&string_, normalized_offset, normalized_length);
  2205. return normalized_length;
  2206. }
  2207. /**
  2208. * @brief Inserts ( @b in-place ) a range of characters at a given signed offset.
  2209. * @return `true` if the insertion was successful, `false` otherwise.
  2210. */
  2211. bool try_insert(difference_type signed_offset, string_view string) noexcept {
  2212. sz_size_t normalized_offset, normalized_length;
  2213. sz_ssize_clamp_interval(size(), signed_offset, 0, &normalized_offset, &normalized_length);
  2214. if (!_with_alloc([&](sz_alloc_type &alloc) {
  2215. return sz_string_expand(&string_, normalized_offset, string.size(), &alloc);
  2216. }))
  2217. return false;
  2218. sz_copy(data() + normalized_offset, string.data(), string.size());
  2219. return true;
  2220. }
  2221. /**
  2222. * @brief Replaces ( @b in-place ) a range of characters with a given string.
  2223. * @return `true` if the replacement was successful, `false` otherwise.
  2224. */
  2225. bool try_replace(difference_type signed_start_offset, difference_type signed_end_offset,
  2226. string_view replacement) noexcept {
  2227. sz_size_t normalized_offset, normalized_length;
  2228. sz_ssize_clamp_interval(size(), signed_start_offset, signed_end_offset, &normalized_offset, &normalized_length);
  2229. if (!try_preparing_replacement(normalized_offset, normalized_length, replacement)) return false;
  2230. sz_copy(data() + normalized_offset, replacement.data(), replacement.size());
  2231. return true;
  2232. }
  2233. #pragma endregion
  2234. #pragma region STL Interfaces
  2235. /**
  2236. * @brief Clears the string contents, but @b no deallocations happen.
  2237. */
  2238. void clear() noexcept { sz_string_erase(&string_, 0, SZ_SIZE_MAX); }
  2239. /**
  2240. * @brief Resizes the string to the given size, filling the new space with the given character,
  2241. * or NULL-character if nothing is provided.
  2242. * @throw `std::length_error` if the string is too long.
  2243. * @throw `std::bad_alloc` if the allocation fails.
  2244. */
  2245. void resize(size_type count, value_type character = '\0') noexcept(false) {
  2246. if (count > max_size()) throw std::length_error("sz::basic_string::resize");
  2247. if (!try_resize(count, character)) throw std::bad_alloc();
  2248. }
  2249. /**
  2250. * @brief Informs the string object of a planned change in size, so that it pre-allocate once.
  2251. * @throw `std::length_error` if the string is too long.
  2252. */
  2253. void reserve(size_type capacity) noexcept(false) {
  2254. if (capacity > max_size()) throw std::length_error("sz::basic_string::reserve");
  2255. if (!try_reserve(capacity)) throw std::bad_alloc();
  2256. }
  2257. /**
  2258. * @brief Inserts ( @b in-place ) a ::character multiple times at the given offset.
  2259. * @throw `std::out_of_range` if `offset > size()`.
  2260. * @throw `std::length_error` if the string is too long.
  2261. * @throw `std::bad_alloc` if the allocation fails.
  2262. */
  2263. basic_string &insert(size_type offset, size_type repeats, char_type character) noexcept(false) {
  2264. if (offset > size()) throw std::out_of_range("sz::basic_string::insert");
  2265. if (size() + repeats > max_size()) throw std::length_error("sz::basic_string::insert");
  2266. if (!_with_alloc([&](sz_alloc_type &alloc) { return sz_string_expand(&string_, offset, repeats, &alloc); }))
  2267. throw std::bad_alloc();
  2268. sz_fill(data() + offset, repeats, character);
  2269. return *this;
  2270. }
  2271. /**
  2272. * @brief Inserts ( @b in-place ) a range of characters at the given offset.
  2273. * @throw `std::out_of_range` if `offset > size()`.
  2274. * @throw `std::length_error` if the string is too long.
  2275. * @throw `std::bad_alloc` if the allocation fails.
  2276. */
  2277. basic_string &insert(size_type offset, string_view other) noexcept(false) {
  2278. if (offset > size()) throw std::out_of_range("sz::basic_string::insert");
  2279. if (size() + other.size() > max_size()) throw std::length_error("sz::basic_string::insert");
  2280. if (!_with_alloc(
  2281. [&](sz_alloc_type &alloc) { return sz_string_expand(&string_, offset, other.size(), &alloc); }))
  2282. throw std::bad_alloc();
  2283. sz_copy(data() + offset, other.data(), other.size());
  2284. return *this;
  2285. }
  2286. /**
  2287. * @brief Inserts ( @b in-place ) a range of characters at the given offset.
  2288. * @throw `std::out_of_range` if `offset > size()`.
  2289. * @throw `std::length_error` if the string is too long.
  2290. * @throw `std::bad_alloc` if the allocation fails.
  2291. */
  2292. basic_string &insert(size_type offset, const_pointer start, size_type length) noexcept(false) {
  2293. return insert(offset, string_view(start, length));
  2294. }
  2295. /**
  2296. * @brief Inserts ( @b in-place ) a slice of another string at the given offset.
  2297. * @throw `std::out_of_range` if `offset > size()` or `other_index > other.size()`.
  2298. * @throw `std::length_error` if the string is too long.
  2299. * @throw `std::bad_alloc` if the allocation fails.
  2300. */
  2301. basic_string &insert(size_type offset, string_view other, size_type other_index,
  2302. size_type count = npos) noexcept(false) {
  2303. return insert(offset, other.substr(other_index, count));
  2304. }
  2305. /**
  2306. * @brief Inserts ( @b in-place ) one ::character at the given iterator position.
  2307. * @throw `std::out_of_range` if `pos > size()` or `other_index > other.size()`.
  2308. * @throw `std::length_error` if the string is too long.
  2309. * @throw `std::bad_alloc` if the allocation fails.
  2310. */
  2311. iterator insert(const_iterator it, char_type character) noexcept(false) {
  2312. auto pos = range_length(cbegin(), it);
  2313. insert(pos, string_view(&character, 1));
  2314. return begin() + pos;
  2315. }
  2316. /**
  2317. * @brief Inserts ( @b in-place ) a ::character multiple times at the given iterator position.
  2318. * @throw `std::out_of_range` if `pos > size()` or `other_index > other.size()`.
  2319. * @throw `std::length_error` if the string is too long.
  2320. * @throw `std::bad_alloc` if the allocation fails.
  2321. */
  2322. iterator insert(const_iterator it, size_type repeats, char_type character) noexcept(false) {
  2323. auto pos = range_length(cbegin(), it);
  2324. insert(pos, repeats, character);
  2325. return begin() + pos;
  2326. }
  2327. /**
  2328. * @brief Inserts ( @b in-place ) a range at the given iterator position.
  2329. * @throw `std::out_of_range` if `pos > size()` or `other_index > other.size()`.
  2330. * @throw `std::length_error` if the string is too long.
  2331. * @throw `std::bad_alloc` if the allocation fails.
  2332. */
  2333. template <typename input_iterator>
  2334. iterator insert(const_iterator it, input_iterator first, input_iterator last) noexcept(false) {
  2335. auto pos = range_length(cbegin(), it);
  2336. if (pos > size()) throw std::out_of_range("sz::basic_string::insert");
  2337. auto added_length = range_length(first, last);
  2338. if (size() + added_length > max_size()) throw std::length_error("sz::basic_string::insert");
  2339. if (!_with_alloc([&](sz_alloc_type &alloc) { return sz_string_expand(&string_, pos, added_length, &alloc); }))
  2340. throw std::bad_alloc();
  2341. iterator result = begin() + pos;
  2342. for (iterator output = result; first != last; ++first, ++output) *output = *first;
  2343. return result;
  2344. }
  2345. /**
  2346. * @brief Inserts ( @b in-place ) an initializer list of characters.
  2347. * @throw `std::out_of_range` if `pos > size()` or `other_index > other.size()`.
  2348. * @throw `std::length_error` if the string is too long.
  2349. * @throw `std::bad_alloc` if the allocation fails.
  2350. */
  2351. iterator insert(const_iterator it, std::initializer_list<char_type> list) noexcept(false) {
  2352. return insert(it, list.begin(), list.end());
  2353. }
  2354. /**
  2355. * @brief Erases ( @b in-place ) the given range of characters.
  2356. * @throws `std::out_of_range` if `pos > size()`.
  2357. * @see `try_erase_slice` for a cleaner exception-less alternative.
  2358. */
  2359. basic_string &erase(size_type pos = 0, size_type count = npos) noexcept(false) {
  2360. if (!count || empty()) return *this;
  2361. if (pos >= size()) throw std::out_of_range("sz::basic_string::erase");
  2362. sz_string_erase(&string_, pos, count);
  2363. return *this;
  2364. }
  2365. /**
  2366. * @brief Erases ( @b in-place ) the given range of characters.
  2367. * @return Iterator pointing following the erased character, or end() if no such character exists.
  2368. */
  2369. iterator erase(const_iterator first, const_iterator last) noexcept {
  2370. auto start = begin();
  2371. auto offset = first - start;
  2372. sz_string_erase(&string_, offset, last - first);
  2373. return start + offset;
  2374. }
  2375. /**
  2376. * @brief Erases ( @b in-place ) the one character at a given postion.
  2377. * @return Iterator pointing following the erased character, or end() if no such character exists.
  2378. */
  2379. iterator erase(const_iterator pos) noexcept { return erase(pos, pos + 1); }
  2380. /**
  2381. * @brief Replaces ( @b in-place ) a range of characters with a given string.
  2382. * @throws `std::out_of_range` if `pos > size()`.
  2383. * @throws `std::length_error` if the string is too long.
  2384. * @see `try_replace` for a cleaner exception-less alternative.
  2385. */
  2386. basic_string &replace(size_type pos, size_type count, string_view const &str) noexcept(false) {
  2387. if (pos > size()) throw std::out_of_range("sz::basic_string::replace");
  2388. if (size() - count + str.size() > max_size()) throw std::length_error("sz::basic_string::replace");
  2389. if (!try_preparing_replacement(pos, count, str.size())) throw std::bad_alloc();
  2390. sz_copy(data() + pos, str.data(), str.size());
  2391. return *this;
  2392. }
  2393. /**
  2394. * @brief Replaces ( @b in-place ) a range of characters with a given string.
  2395. * @throws `std::out_of_range` if `pos > size()`.
  2396. * @throws `std::length_error` if the string is too long.
  2397. * @see `try_replace` for a cleaner exception-less alternative.
  2398. */
  2399. basic_string &replace(const_iterator first, const_iterator last, string_view const &str) noexcept(false) {
  2400. return replace(range_length(cbegin(), first), last - first, str);
  2401. }
  2402. /**
  2403. * @brief Replaces ( @b in-place ) a range of characters with a given string.
  2404. * @throws `std::out_of_range` if `pos > size()` or `pos2 > str.size()`.
  2405. * @throws `std::length_error` if the string is too long.
  2406. * @see `try_replace` for a cleaner exception-less alternative.
  2407. */
  2408. basic_string &replace(size_type pos, size_type count, string_view const &str, size_type pos2,
  2409. size_type count2 = npos) noexcept(false) {
  2410. return replace(pos, count, str.substr(pos2, count2));
  2411. }
  2412. /**
  2413. * @brief Replaces ( @b in-place ) a range of characters with a given string.
  2414. * @throws `std::out_of_range` if `pos > size()`.
  2415. * @throws `std::length_error` if the string is too long.
  2416. * @see `try_replace` for a cleaner exception-less alternative.
  2417. */
  2418. basic_string &replace(size_type pos, size_type count, const_pointer cstr, size_type count2) noexcept(false) {
  2419. return replace(pos, count, string_view(cstr, count2));
  2420. }
  2421. /**
  2422. * @brief Replaces ( @b in-place ) a range of characters with a given string.
  2423. * @throws `std::out_of_range` if `pos > size()`.
  2424. * @throws `std::length_error` if the string is too long.
  2425. * @see `try_replace` for a cleaner exception-less alternative.
  2426. */
  2427. basic_string &replace(const_iterator first, const_iterator last, const_pointer cstr,
  2428. size_type count2) noexcept(false) {
  2429. return replace(range_length(cbegin(), first), last - first, string_view(cstr, count2));
  2430. }
  2431. /**
  2432. * @brief Replaces ( @b in-place ) a range of characters with a given string.
  2433. * @throws `std::out_of_range` if `pos > size()`.
  2434. * @throws `std::length_error` if the string is too long.
  2435. * @see `try_replace` for a cleaner exception-less alternative.
  2436. */
  2437. basic_string &replace(size_type pos, size_type count, const_pointer cstr) noexcept(false) {
  2438. return replace(pos, count, string_view(cstr));
  2439. }
  2440. /**
  2441. * @brief Replaces ( @b in-place ) a range of characters with a given string.
  2442. * @throws `std::out_of_range` if `pos > size()`.
  2443. * @throws `std::length_error` if the string is too long.
  2444. * @see `try_replace` for a cleaner exception-less alternative.
  2445. */
  2446. basic_string &replace(const_iterator first, const_iterator last, const_pointer cstr) noexcept(false) {
  2447. return replace(range_length(cbegin(), first), last - first, string_view(cstr));
  2448. }
  2449. /**
  2450. * @brief Replaces ( @b in-place ) a range of characters with a repetition of given characters.
  2451. * @throws `std::out_of_range` if `pos > size()`.
  2452. * @throws `std::length_error` if the string is too long.
  2453. * @see `try_replace` for a cleaner exception-less alternative.
  2454. */
  2455. basic_string &replace(size_type pos, size_type count, size_type count2, char_type character) noexcept(false) {
  2456. if (pos > size()) throw std::out_of_range("sz::basic_string::replace");
  2457. if (size() - count + count2 > max_size()) throw std::length_error("sz::basic_string::replace");
  2458. if (!try_preparing_replacement(pos, count, count2)) throw std::bad_alloc();
  2459. sz_fill(data() + pos, count2, character);
  2460. return *this;
  2461. }
  2462. /**
  2463. * @brief Replaces ( @b in-place ) a range of characters with a repetition of given characters.
  2464. * @throws `std::out_of_range` if `pos > size()`.
  2465. * @throws `std::length_error` if the string is too long.
  2466. * @see `try_replace` for a cleaner exception-less alternative.
  2467. */
  2468. basic_string &replace(const_iterator first, const_iterator last, size_type count2,
  2469. char_type character) noexcept(false) {
  2470. return replace(range_length(cbegin(), first), last - first, count2, character);
  2471. }
  2472. /**
  2473. * @brief Replaces ( @b in-place ) a range of characters with a given string.
  2474. * @throws `std::out_of_range` if `pos > size()`.
  2475. * @throws `std::length_error` if the string is too long.
  2476. * @see `try_replace` for a cleaner exception-less alternative.
  2477. */
  2478. template <typename input_iterator>
  2479. basic_string &replace(const_iterator first, const_iterator last, input_iterator first2,
  2480. input_iterator last2) noexcept(false) {
  2481. auto pos = range_length(cbegin(), first);
  2482. auto count = range_length(first, last);
  2483. auto count2 = range_length(first2, last2);
  2484. if (pos > size()) throw std::out_of_range("sz::basic_string::replace");
  2485. if (size() - count + count2 > max_size()) throw std::length_error("sz::basic_string::replace");
  2486. if (!try_preparing_replacement(pos, count, count2)) throw std::bad_alloc();
  2487. for (iterator output = begin() + pos; first2 != last2; ++first2, ++output) *output = *first2;
  2488. return *this;
  2489. }
  2490. /**
  2491. * @brief Replaces ( @b in-place ) a range of characters with a given initializer list.
  2492. * @throws `std::out_of_range` if `pos > size()`.
  2493. * @throws `std::length_error` if the string is too long.
  2494. * @see `try_replace` for a cleaner exception-less alternative.
  2495. */
  2496. basic_string &replace(const_iterator first, const_iterator last,
  2497. std::initializer_list<char_type> list) noexcept(false) {
  2498. return replace(first, last, list.begin(), list.end());
  2499. }
  2500. /**
  2501. * @brief Appends the given character at the end.
  2502. * @throw `std::length_error` if the string is too long.
  2503. * @throw `std::bad_alloc` if the allocation fails.
  2504. */
  2505. void push_back(char_type ch) noexcept(false) {
  2506. if (size() == max_size()) throw std::length_error("string::push_back");
  2507. if (!try_push_back(ch)) throw std::bad_alloc();
  2508. }
  2509. /**
  2510. * @brief Removes the last character from the string.
  2511. * @warning The behavior is @b undefined if the string is empty.
  2512. */
  2513. void pop_back() noexcept { sz_string_erase(&string_, size() - 1, 1); }
  2514. /**
  2515. * @brief Overwrites the string with the given string.
  2516. * @throw `std::length_error` if the string is too long.
  2517. * @throw `std::bad_alloc` if the allocation fails.
  2518. * @see `try_assign` for a cleaner exception-less alternative.
  2519. */
  2520. basic_string &assign(string_view other) noexcept(false) {
  2521. if (!try_assign(other)) throw std::bad_alloc();
  2522. return *this;
  2523. }
  2524. /**
  2525. * @brief Overwrites the string with the given repeated character.
  2526. * @throw `std::length_error` if the string is too long.
  2527. * @throw `std::bad_alloc` if the allocation fails.
  2528. * @see `try_assign` for a cleaner exception-less alternative.
  2529. */
  2530. basic_string &assign(size_type repeats, char_type character) noexcept(false) {
  2531. resize(repeats, character);
  2532. sz_fill(data(), repeats, *(sz_u8_t *)&character);
  2533. return *this;
  2534. }
  2535. /**
  2536. * @brief Overwrites the string with the given string.
  2537. * @throw `std::length_error` if the string is too long.
  2538. * @throw `std::bad_alloc` if the allocation fails.
  2539. * @see `try_assign` for a cleaner exception-less alternative.
  2540. */
  2541. basic_string &assign(const_pointer other, size_type length) noexcept(false) { return assign({other, length}); }
  2542. /**
  2543. * @brief Overwrites the string with the given string.
  2544. * @throw `std::length_error` if the string is too long or `pos > str.size()`.
  2545. * @throw `std::bad_alloc` if the allocation fails.
  2546. * @see `try_assign` for a cleaner exception-less alternative.
  2547. */
  2548. basic_string &assign(string_view str, size_type pos, size_type count = npos) noexcept(false) {
  2549. return assign(str.substr(pos, count));
  2550. }
  2551. /**
  2552. * @brief Overwrites the string with the given iterator range.
  2553. * @throw `std::length_error` if the string is too long.
  2554. * @throw `std::bad_alloc` if the allocation fails.
  2555. * @see `try_assign` for a cleaner exception-less alternative.
  2556. */
  2557. template <typename input_iterator>
  2558. basic_string &assign(input_iterator first, input_iterator last) noexcept(false) {
  2559. resize(range_length(first, last));
  2560. for (iterator output = begin(); first != last; ++first, ++output) *output = *first;
  2561. return *this;
  2562. }
  2563. /**
  2564. * @brief Overwrites the string with the given initializer list.
  2565. * @throw `std::length_error` if the string is too long.
  2566. * @throw `std::bad_alloc` if the allocation fails.
  2567. * @see `try_assign` for a cleaner exception-less alternative.
  2568. */
  2569. basic_string &assign(std::initializer_list<char_type> list) noexcept(false) {
  2570. return assign(list.begin(), list.end());
  2571. }
  2572. /**
  2573. * @brief Appends to the end of the current string.
  2574. * @throw `std::length_error` if the string is too long.
  2575. * @throw `std::bad_alloc` if the allocation fails.
  2576. * @see `try_append` for a cleaner exception-less alternative.
  2577. */
  2578. basic_string &append(string_view str) noexcept(false) {
  2579. if (!try_append(str)) throw std::bad_alloc();
  2580. return *this;
  2581. }
  2582. /**
  2583. * @brief Appends to the end of the current string.
  2584. * @throw `std::length_error` if the string is too long or `pos > str.size()`.
  2585. * @throw `std::bad_alloc` if the allocation fails.
  2586. * @see `try_append` for a cleaner exception-less alternative.
  2587. */
  2588. basic_string &append(string_view str, size_type pos, size_type length = npos) noexcept(false) {
  2589. return append(str.substr(pos, length));
  2590. }
  2591. /**
  2592. * @brief Appends to the end of the current string.
  2593. * @throw `std::length_error` if the string is too long.
  2594. * @throw `std::bad_alloc` if the allocation fails.
  2595. * @see `try_append` for a cleaner exception-less alternative.
  2596. */
  2597. basic_string &append(const_pointer str, size_type length) noexcept(false) { return append({str, length}); }
  2598. /**
  2599. * @brief Appends to the end of the current string.
  2600. * @throw `std::length_error` if the string is too long.
  2601. * @throw `std::bad_alloc` if the allocation fails.
  2602. * @see `try_append` for a cleaner exception-less alternative.
  2603. */
  2604. basic_string &append(const_pointer str) noexcept(false) { return append(string_view(str)); }
  2605. /**
  2606. * @brief Appends a repeated character to the end of the current string.
  2607. * @throw `std::length_error` if the string is too long.
  2608. * @throw `std::bad_alloc` if the allocation fails.
  2609. * @see `try_append` for a cleaner exception-less alternative.
  2610. */
  2611. basic_string &append(size_type repeats, char_type ch) noexcept(false) {
  2612. resize(size() + repeats, ch);
  2613. return *this;
  2614. }
  2615. /**
  2616. * @brief Appends to the end of the current string.
  2617. * @throw `std::length_error` if the string is too long.
  2618. * @throw `std::bad_alloc` if the allocation fails.
  2619. * @see `try_append` for a cleaner exception-less alternative.
  2620. */
  2621. basic_string &append(std::initializer_list<char_type> other) noexcept(false) {
  2622. return append(other.begin(), other.end());
  2623. }
  2624. /**
  2625. * @brief Appends to the end of the current string.
  2626. * @throw `std::length_error` if the string is too long.
  2627. * @throw `std::bad_alloc` if the allocation fails.
  2628. * @see `try_append` for a cleaner exception-less alternative.
  2629. */
  2630. template <typename input_iterator>
  2631. basic_string &append(input_iterator first, input_iterator last) noexcept(false) {
  2632. insert<input_iterator>(cend(), first, last);
  2633. return *this;
  2634. }
  2635. basic_string &operator+=(string_view other) noexcept(false) { return append(other); }
  2636. basic_string &operator+=(std::initializer_list<char_type> other) noexcept(false) { return append(other); }
  2637. basic_string &operator+=(char_type character) noexcept(false) { return operator+=(string_view(&character, 1)); }
  2638. basic_string &operator+=(const_pointer other) noexcept(false) { return operator+=(string_view(other)); }
  2639. basic_string operator+(char_type character) const noexcept(false) { return operator+(string_view(&character, 1)); }
  2640. basic_string operator+(const_pointer other) const noexcept(false) { return operator+(string_view(other)); }
  2641. basic_string operator+(string_view other) const noexcept(false) {
  2642. return basic_string {concatenation<string_view, string_view> {view(), other}};
  2643. }
  2644. basic_string operator+(std::initializer_list<char_type> other) const noexcept(false) {
  2645. return basic_string {concatenation<string_view, string_view> {view(), other}};
  2646. }
  2647. #pragma endregion
  2648. #pragma endregion
  2649. concatenation<string_view, string_view> operator|(string_view other) const noexcept { return {view(), other}; }
  2650. size_type edit_distance(string_view other, size_type bound = 0) const noexcept {
  2651. size_type distance;
  2652. _with_alloc([&](sz_alloc_type &alloc) {
  2653. distance = sz_edit_distance(data(), size(), other.data(), other.size(), bound, &alloc);
  2654. return true;
  2655. });
  2656. return distance;
  2657. }
  2658. /** @brief Hashes the string, equivalent to `std::hash<string_view>{}(str)`. */
  2659. size_type hash() const noexcept { return view().hash(); }
  2660. /**
  2661. * @brief Overwrites the string with random characters from the given alphabet using the random generator.
  2662. *
  2663. * @param generator A random generator function object that returns a random number in the range [0, 2^64).
  2664. * @param alphabet A string of characters to choose from.
  2665. */
  2666. template <typename generator_type>
  2667. basic_string &randomize(generator_type &generator, string_view alphabet = "abcdefghijklmnopqrstuvwxyz") noexcept {
  2668. sz_ptr_t start;
  2669. sz_size_t length;
  2670. sz_string_range(&string_, &start, &length);
  2671. sz_random_generator_t generator_callback = &_call_random_generator<generator_type>;
  2672. sz_generate(alphabet.data(), alphabet.size(), start, length, generator_callback, &generator);
  2673. return *this;
  2674. }
  2675. /**
  2676. * @brief Overwrites the string with random characters from the given alphabet
  2677. * using `std::rand` as the random generator.
  2678. *
  2679. * @param alphabet A string of characters to choose from.
  2680. */
  2681. basic_string &randomize(string_view alphabet = "abcdefghijklmnopqrstuvwxyz") noexcept {
  2682. auto generator = []() { return static_cast<sz_u64_t>(std::rand()); };
  2683. return randomize(generator, alphabet);
  2684. }
  2685. /**
  2686. * @brief Generate a new random string of given length using `std::rand` as the random generator.
  2687. * May throw exceptions if the memory allocation fails.
  2688. *
  2689. * @param length The length of the generated string.
  2690. * @param alphabet A string of characters to choose from.
  2691. */
  2692. static basic_string random(size_type length, string_view alphabet = "abcdefghijklmnopqrstuvwxyz") noexcept(false) {
  2693. return basic_string(length, '\0').randomize(alphabet);
  2694. }
  2695. /**
  2696. * @brief Generate a new random string of given length using the provided random number generator.
  2697. * May throw exceptions if the memory allocation fails.
  2698. *
  2699. * @param generator A random generator function object that returns a random number in the range [0, 2^64).
  2700. * @param length The length of the generated string.
  2701. * @param alphabet A string of characters to choose from.
  2702. */
  2703. template <typename generator_type>
  2704. static basic_string random(generator_type &generator, size_type length,
  2705. string_view alphabet = "abcdefghijklmnopqrstuvwxyz") noexcept(false) {
  2706. return basic_string(length, '\0').randomize(generator, alphabet);
  2707. }
  2708. /**
  2709. * @brief Replaces ( @b in-place ) all occurrences of a given string with the ::replacement string.
  2710. * Similar to `boost::algorithm::replace_all` and Python's `str.replace`.
  2711. *
  2712. * The implementation is not as composable, as using search ranges combined with a replacing mapping for matches,
  2713. * and might be suboptimal, if you are exporting the cleaned-up string to another buffer.
  2714. * The algorithm is suboptimal when this string is made exclusively of the pattern.
  2715. */
  2716. basic_string &replace_all(string_view pattern, string_view replacement) noexcept(false) {
  2717. if (!try_replace_all(pattern, replacement)) throw std::bad_alloc();
  2718. return *this;
  2719. }
  2720. /**
  2721. * @brief Replaces ( @b in-place ) all occurrences of a given character set with the ::replacement string.
  2722. * Similar to `boost::algorithm::replace_all` and Python's `str.replace`.
  2723. *
  2724. * The implementation is not as composable, as using search ranges combined with a replacing mapping for matches,
  2725. * and might be suboptimal, if you are exporting the cleaned-up string to another buffer.
  2726. * The algorithm is suboptimal when this string is made exclusively of the pattern.
  2727. */
  2728. basic_string &replace_all(char_set pattern, string_view replacement) noexcept(false) {
  2729. if (!try_replace_all(pattern, replacement)) throw std::bad_alloc();
  2730. return *this;
  2731. }
  2732. /**
  2733. * @brief Replaces ( @b in-place ) all occurrences of a given string with the ::replacement string.
  2734. * Similar to `boost::algorithm::replace_all` and Python's `str.replace`.
  2735. *
  2736. * The implementation is not as composable, as using search ranges combined with a replacing mapping for matches,
  2737. * and might be suboptimal, if you are exporting the cleaned-up string to another buffer.
  2738. * The algorithm is suboptimal when this string is made exclusively of the pattern.
  2739. */
  2740. bool try_replace_all(string_view pattern, string_view replacement) noexcept {
  2741. return try_replace_all_<string_view>(pattern, replacement);
  2742. }
  2743. /**
  2744. * @brief Replaces ( @b in-place ) all occurrences of a given character set with the ::replacement string.
  2745. * Similar to `boost::algorithm::replace_all` and Python's `str.replace`.
  2746. *
  2747. * The implementation is not as composable, as using search ranges combined with a replacing mapping for matches,
  2748. * and might be suboptimal, if you are exporting the cleaned-up string to another buffer.
  2749. * The algorithm is suboptimal when this string is made exclusively of the pattern.
  2750. */
  2751. bool try_replace_all(char_set pattern, string_view replacement) noexcept {
  2752. return try_replace_all_<char_set>(pattern, replacement);
  2753. }
  2754. private:
  2755. template <typename pattern_type>
  2756. bool try_replace_all_(pattern_type pattern, string_view replacement) noexcept;
  2757. /**
  2758. * @brief Tries to prepare the string for a replacement of a given range with a new string.
  2759. * The allocation may occur, if the replacement is longer than the replaced range.
  2760. */
  2761. bool try_preparing_replacement(size_type offset, size_type length, size_type new_length) noexcept;
  2762. };
  2763. using string = basic_string<char, std::allocator<char>>;
  2764. static_assert(sizeof(string) == 4 * sizeof(void *), "String size must be 4 pointers.");
  2765. namespace literals {
  2766. constexpr string_view operator""_sz(char const *str, std::size_t length) noexcept { return {str, length}; }
  2767. } // namespace literals
  2768. template <typename char_type_, typename allocator_>
  2769. bool basic_string<char_type_, allocator_>::try_resize(size_type count, value_type character) noexcept {
  2770. sz_ptr_t string_start;
  2771. sz_size_t string_length;
  2772. sz_size_t string_space;
  2773. sz_bool_t string_is_external;
  2774. sz_string_unpack(&string_, &string_start, &string_length, &string_space, &string_is_external);
  2775. // Allocate more space if needed.
  2776. if (count >= string_space) {
  2777. if (!_with_alloc(
  2778. [&](sz_alloc_type &alloc) { return sz_string_expand(&string_, SZ_SIZE_MAX, count, &alloc) != NULL; }))
  2779. return false;
  2780. sz_string_unpack(&string_, &string_start, &string_length, &string_space, &string_is_external);
  2781. }
  2782. // Fill the trailing characters.
  2783. if (count > string_length) {
  2784. sz_fill(string_start + string_length, count - string_length, character);
  2785. string_start[count] = '\0';
  2786. // Knowing the layout of the string, we can perform this operation safely,
  2787. // even if its located on stack.
  2788. string_.external.length += count - string_length;
  2789. }
  2790. else { sz_string_erase(&string_, count, SZ_SIZE_MAX); }
  2791. return true;
  2792. }
  2793. template <typename char_type_, typename allocator_>
  2794. bool basic_string<char_type_, allocator_>::try_assign(string_view other) noexcept {
  2795. // We can't just assign the other string state, as its start address may be somewhere else on the stack.
  2796. sz_ptr_t string_start;
  2797. sz_size_t string_length;
  2798. sz_string_range(&string_, &string_start, &string_length);
  2799. if (string_length >= other.length()) {
  2800. other.copy(string_start, other.length());
  2801. sz_string_erase(&string_, other.length(), SZ_SIZE_MAX);
  2802. }
  2803. else {
  2804. if (!_with_alloc([&](sz_alloc_type &alloc) {
  2805. string_start = sz_string_expand(&string_, SZ_SIZE_MAX, other.length(), &alloc);
  2806. if (!string_start) return false;
  2807. other.copy(string_start, other.length());
  2808. return true;
  2809. }))
  2810. return false;
  2811. }
  2812. return true;
  2813. }
  2814. template <typename char_type_, typename allocator_>
  2815. bool basic_string<char_type_, allocator_>::try_push_back(char_type c) noexcept {
  2816. return _with_alloc([&](sz_alloc_type &alloc) {
  2817. auto old_size = size();
  2818. sz_ptr_t start = sz_string_expand(&string_, SZ_SIZE_MAX, 1, &alloc);
  2819. if (!start) return false;
  2820. start[old_size] = c;
  2821. return true;
  2822. });
  2823. }
  2824. template <typename char_type_, typename allocator_>
  2825. bool basic_string<char_type_, allocator_>::try_append(const_pointer str, size_type length) noexcept {
  2826. return _with_alloc([&](sz_alloc_type &alloc) {
  2827. auto old_size = size();
  2828. sz_ptr_t start = sz_string_expand(&string_, SZ_SIZE_MAX, length, &alloc);
  2829. if (!start) return false;
  2830. sz_copy(start + old_size, str, length);
  2831. return true;
  2832. });
  2833. }
  2834. template <typename char_type_, typename allocator_>
  2835. template <typename pattern_type>
  2836. bool basic_string<char_type_, allocator_>::try_replace_all_(pattern_type pattern, string_view replacement) noexcept {
  2837. // Depending on the size of the pattern and the replacement, we may need to allocate more space.
  2838. // There are 3 cases to consider:
  2839. // 1. The pattern and the replacement are of the same length. Piece of cake!
  2840. // 2. The pattern is longer than the replacement. We need to compact the strings.
  2841. // 3. The pattern is shorter than the replacement. We may have to allocate more memory.
  2842. using matcher_type = typename std::conditional<std::is_same<pattern_type, char_set>::value,
  2843. matcher_find_first_of<string_view, pattern_type>,
  2844. matcher_find<string_view, exclude_overlaps_type>>::type;
  2845. matcher_type matcher({pattern});
  2846. string_view this_view = view();
  2847. // 1. The pattern and the replacement are of the same length.
  2848. if (matcher.needle_length() == replacement.length()) {
  2849. using matches_type = range_matches<string_view, matcher_type>;
  2850. // Instead of iterating with `begin()` and `end()`, we could use the cheaper sentinel-based approach.
  2851. // for (string_view match : matches) { ... }
  2852. matches_type matches = matches_type(this_view, {pattern});
  2853. for (auto matches_iterator = matches.begin(); matches_iterator != end_sentinel_type {}; ++matches_iterator) {
  2854. replacement.copy(const_cast<pointer>((*matches_iterator).data()));
  2855. }
  2856. return true;
  2857. }
  2858. // 2. The pattern is longer than the replacement. We need to compact the strings.
  2859. else if (matcher.needle_length() > replacement.length()) {
  2860. // Dealing with shorter replacements, we will avoid memory allocations, but we can also minimize the number
  2861. // of `memmove`-s, by keeping one more iterator, pointing to the end of the last compacted area.
  2862. // Having the split-ranges, however, we reuse their logic.
  2863. using splits_type = range_splits<string_view, matcher_type>;
  2864. splits_type splits = splits_type(this_view, {pattern});
  2865. auto matches_iterator = splits.begin();
  2866. auto compacted_end = (*matches_iterator).end();
  2867. if (compacted_end == end()) return true; // No matches.
  2868. ++matches_iterator; // Skip the first match.
  2869. do {
  2870. string_view match_view = *matches_iterator;
  2871. replacement.copy(const_cast<pointer>(compacted_end));
  2872. compacted_end += replacement.length();
  2873. sz_move((sz_ptr_t)compacted_end, match_view.begin(), match_view.length());
  2874. compacted_end += match_view.length();
  2875. ++matches_iterator;
  2876. } while (matches_iterator != end_sentinel_type {});
  2877. // Can't fail, so let's just return true :)
  2878. try_resize(compacted_end - begin());
  2879. return true;
  2880. }
  2881. // 3. The pattern is shorter than the replacement. We may have to allocate more memory.
  2882. else {
  2883. using rmatcher_type = typename std::conditional<std::is_same<pattern_type, char_set>::value,
  2884. matcher_find_last_of<string_view, pattern_type>,
  2885. matcher_rfind<string_view, exclude_overlaps_type>>::type;
  2886. using rmatches_type = range_rmatches<string_view, rmatcher_type>;
  2887. rmatches_type rmatches = rmatches_type(this_view, {pattern});
  2888. // It's cheaper to iterate through the whole string once, counting the number of matches,
  2889. // reserving memory once, than re-allocating and copying the string multiple times.
  2890. auto matches_count = rmatches.size();
  2891. if (matches_count == 0) return true; // No matches.
  2892. // TODO: Resize without initializing the memory.
  2893. auto replacement_delta_length = replacement.length() - matcher.needle_length();
  2894. auto added_length = matches_count * replacement_delta_length;
  2895. auto old_length = size();
  2896. auto new_length = old_length + added_length;
  2897. if (!try_resize(new_length)) return false;
  2898. this_view = view().front(old_length);
  2899. // Now iterate through splits similarly to the 2nd case, but in reverse order.
  2900. using rsplits_type = range_rsplits<string_view, rmatcher_type>;
  2901. rsplits_type splits = rsplits_type(this_view, {pattern});
  2902. auto splits_iterator = splits.begin();
  2903. // Put the compacted pointer to the end of the new string, and walk left.
  2904. auto compacted_begin = this_view.data() + new_length;
  2905. // By now we know that at least one match exists, which means the splits .
  2906. do {
  2907. string_view slice_view = *splits_iterator;
  2908. compacted_begin -= slice_view.length();
  2909. sz_move((sz_ptr_t)compacted_begin, slice_view.begin(), slice_view.length());
  2910. compacted_begin -= replacement.length();
  2911. replacement.copy(const_cast<pointer>(compacted_begin));
  2912. ++splits_iterator;
  2913. } while (!splits_iterator.is_last());
  2914. return true;
  2915. }
  2916. }
  2917. template <typename char_type_, typename allocator_>
  2918. template <typename first_type, typename second_type>
  2919. bool basic_string<char_type_, allocator_>::try_assign(concatenation<first_type, second_type> const &other) noexcept {
  2920. // We can't just assign the other string state, as its start address may be somewhere else on the stack.
  2921. sz_ptr_t string_start;
  2922. sz_size_t string_length;
  2923. sz_string_range(&string_, &string_start, &string_length);
  2924. if (string_length >= other.length()) {
  2925. sz_string_erase(&string_, other.length(), SZ_SIZE_MAX);
  2926. other.copy(string_start, other.length());
  2927. }
  2928. else {
  2929. if (!_with_alloc([&](sz_alloc_type &alloc) {
  2930. string_start = sz_string_expand(&string_, SZ_SIZE_MAX, other.length(), &alloc);
  2931. if (!string_start) return false;
  2932. other.copy(string_start, other.length());
  2933. return true;
  2934. }))
  2935. return false;
  2936. }
  2937. return true;
  2938. }
  2939. template <typename char_type_, typename allocator_>
  2940. bool basic_string<char_type_, allocator_>::try_preparing_replacement(size_type offset, size_type length,
  2941. size_type replacement_length) noexcept {
  2942. // There are three cases:
  2943. // 1. The replacement is the same length as the replaced range.
  2944. // 2. The replacement is shorter than the replaced range.
  2945. // 3. The replacement is longer than the replaced range. An allocation may occur.
  2946. assert(offset + length <= size());
  2947. // 1. The replacement is the same length as the replaced range.
  2948. if (replacement_length == length) { return true; }
  2949. // 2. The replacement is shorter than the replaced range.
  2950. else if (replacement_length < length) {
  2951. sz_string_erase(&string_, offset + replacement_length, length - replacement_length);
  2952. return true;
  2953. }
  2954. // 3. The replacement is longer than the replaced range. An allocation may occur.
  2955. else {
  2956. return _with_alloc([&](sz_alloc_type &alloc) {
  2957. return sz_string_expand(&string_, offset + length, replacement_length - length, &alloc);
  2958. });
  2959. }
  2960. }
  2961. /** @brief SFINAE-type used to infer the resulting type of concatenating multiple string together. */
  2962. template <typename... args_types>
  2963. struct concatenation_result {};
  2964. template <typename first_type, typename second_type>
  2965. struct concatenation_result<first_type, second_type> {
  2966. using type = concatenation<first_type, second_type>;
  2967. };
  2968. template <typename first_type, typename... following_types>
  2969. struct concatenation_result<first_type, following_types...> {
  2970. using type = concatenation<first_type, typename concatenation_result<following_types...>::type>;
  2971. };
  2972. /**
  2973. * @brief Concatenates two strings into a template expression.
  2974. * @see `concatenation` class for more details.
  2975. */
  2976. template <typename first_type, typename second_type>
  2977. concatenation<first_type, second_type> concatenate(first_type &&first, second_type &&second) noexcept(false) {
  2978. return {first, second};
  2979. }
  2980. /**
  2981. * @brief Concatenates two or more strings into a template expression.
  2982. * @see `concatenation` class for more details.
  2983. */
  2984. template <typename first_type, typename second_type, typename... following_types>
  2985. typename concatenation_result<first_type, second_type, following_types...>::type concatenate(
  2986. first_type &&first, second_type &&second, following_types &&...following) noexcept(false) {
  2987. // Fold expression like the one below would result in faster compile times,
  2988. // but would incur the penalty of additional `if`-statements in every `append` call.
  2989. // Moreover, those are only supported in C++17 and later.
  2990. // std::size_t total_size = (strings.size() + ... + 0);
  2991. // std::string result;
  2992. // result.reserve(total_size);
  2993. // (result.append(strings), ...);
  2994. return ashvardanian::stringzilla::concatenate(
  2995. std::forward<first_type>(first),
  2996. ashvardanian::stringzilla::concatenate(std::forward<second_type>(second),
  2997. std::forward<following_types>(following)...));
  2998. }
  2999. /**
  3000. * @brief Calculates the Hamming edit distance in @b bytes between two strings.
  3001. * @see sz_edit_distance
  3002. */
  3003. template <typename char_type_>
  3004. std::size_t hamming_distance(basic_string_slice<char_type_> const &a, basic_string_slice<char_type_> const &b,
  3005. std::size_t bound = 0) noexcept {
  3006. return sz_hamming_distance(a.data(), a.size(), b.data(), b.size(), bound);
  3007. }
  3008. /**
  3009. * @brief Calculates the Hamming edit distance in @b bytes between two strings.
  3010. * @see sz_edit_distance
  3011. */
  3012. template <typename char_type_, typename allocator_type_ = std::allocator<typename std::remove_const<char_type_>::type>>
  3013. std::size_t hamming_distance(basic_string<char_type_, allocator_type_> const &a,
  3014. basic_string<char_type_, allocator_type_> const &b, std::size_t bound = 0) noexcept {
  3015. return ashvardanian::stringzilla::hamming_distance(a.view(), b.view(), bound);
  3016. }
  3017. /**
  3018. * @brief Calculates the Hamming edit distance in @b unicode codepoints between two strings.
  3019. * @see sz_hamming_distance_utf8
  3020. */
  3021. template <typename char_type_>
  3022. std::size_t hamming_distance_utf8(basic_string_slice<char_type_> const &a, basic_string_slice<char_type_> const &b,
  3023. std::size_t bound = 0) noexcept {
  3024. return sz_hamming_distance_utf8(a.data(), a.size(), b.data(), b.size(), bound);
  3025. }
  3026. /**
  3027. * @brief Calculates the Hamming edit distance in @b unicode codepoints between two strings.
  3028. * @see sz_edit_distance
  3029. */
  3030. template <typename char_type_, typename allocator_type_ = std::allocator<typename std::remove_const<char_type_>::type>>
  3031. std::size_t hamming_distance_utf8(basic_string<char_type_, allocator_type_> const &a,
  3032. basic_string<char_type_, allocator_type_> const &b, std::size_t bound = 0) noexcept {
  3033. return ashvardanian::stringzilla::hamming_distance_utf8(a.view(), b.view(), bound);
  3034. }
  3035. /**
  3036. * @brief Calculates the Levenshtein edit distance in @b bytes between two strings.
  3037. * @see sz_edit_distance
  3038. */
  3039. template <typename char_type_, typename allocator_type_ = std::allocator<typename std::remove_const<char_type_>::type>>
  3040. std::size_t edit_distance(basic_string_slice<char_type_> const &a, basic_string_slice<char_type_> const &b,
  3041. std::size_t bound = 0, allocator_type_ &&allocator = allocator_type_ {}) noexcept(false) {
  3042. std::size_t result;
  3043. if (!_with_alloc(allocator, [&](sz_memory_allocator_t &alloc) {
  3044. result = sz_edit_distance(a.data(), a.size(), b.data(), b.size(), bound, &alloc);
  3045. return result != SZ_SIZE_MAX;
  3046. }))
  3047. throw std::bad_alloc();
  3048. return result;
  3049. }
  3050. /**
  3051. * @brief Calculates the Levenshtein edit distance in @b bytes between two strings.
  3052. * @see sz_edit_distance
  3053. */
  3054. template <typename char_type_, typename allocator_type_ = std::allocator<char_type_>>
  3055. std::size_t edit_distance(basic_string<char_type_, allocator_type_> const &a,
  3056. basic_string<char_type_, allocator_type_> const &b, std::size_t bound = 0) noexcept(false) {
  3057. return ashvardanian::stringzilla::edit_distance(a.view(), b.view(), bound, a.get_allocator());
  3058. }
  3059. /**
  3060. * @brief Calculates the Levenshtein edit distance in @b unicode codepoints between two strings.
  3061. * @see sz_edit_distance_utf8
  3062. */
  3063. template <typename char_type_, typename allocator_type_ = std::allocator<typename std::remove_const<char_type_>::type>>
  3064. std::size_t edit_distance_utf8(basic_string_slice<char_type_> const &a, basic_string_slice<char_type_> const &b,
  3065. std::size_t bound = 0,
  3066. allocator_type_ &&allocator = allocator_type_ {}) noexcept(false) {
  3067. std::size_t result;
  3068. if (!_with_alloc(allocator, [&](sz_memory_allocator_t &alloc) {
  3069. result = sz_edit_distance_utf8(a.data(), a.size(), b.data(), b.size(), bound, &alloc);
  3070. return result != SZ_SIZE_MAX;
  3071. }))
  3072. throw std::bad_alloc();
  3073. return result;
  3074. }
  3075. /**
  3076. * @brief Calculates the Levenshtein edit distance in @b unicode codepoints between two strings.
  3077. * @see sz_edit_distance_utf8
  3078. */
  3079. template <typename char_type_, typename allocator_type_ = std::allocator<char_type_>>
  3080. std::size_t edit_distance_utf8(basic_string<char_type_, allocator_type_> const &a,
  3081. basic_string<char_type_, allocator_type_> const &b,
  3082. std::size_t bound = 0) noexcept(false) {
  3083. return ashvardanian::stringzilla::edit_distance_utf8(a.view(), b.view(), bound, a.get_allocator());
  3084. }
  3085. /**
  3086. * @brief Calculates the Needleman-Wunsch alignment score between two strings.
  3087. * @see sz_alignment_score
  3088. */
  3089. template <typename char_type_, typename allocator_type_ = std::allocator<typename std::remove_const<char_type_>::type>>
  3090. std::ptrdiff_t alignment_score(basic_string_slice<char_type_> const &a, basic_string_slice<char_type_> const &b,
  3091. std::int8_t const (&subs)[256][256], std::int8_t gap = -1,
  3092. allocator_type_ &&allocator = allocator_type_ {}) noexcept(false) {
  3093. static_assert(sizeof(sz_error_cost_t) == sizeof(std::int8_t), "sz_error_cost_t must be 8-bit.");
  3094. static_assert(std::is_signed<sz_error_cost_t>() == std::is_signed<std::int8_t>(),
  3095. "sz_error_cost_t must be signed.");
  3096. std::ptrdiff_t result;
  3097. if (!_with_alloc(allocator, [&](sz_memory_allocator_t &alloc) {
  3098. result = sz_alignment_score(a.data(), a.size(), b.data(), b.size(), &subs[0][0], gap, &alloc);
  3099. return result != SZ_SSIZE_MAX;
  3100. }))
  3101. throw std::bad_alloc();
  3102. return result;
  3103. }
  3104. /**
  3105. * @brief Calculates the Needleman-Wunsch alignment score between two strings.
  3106. * @see sz_alignment_score
  3107. */
  3108. template <typename char_type_, typename allocator_type_ = std::allocator<char_type_>>
  3109. std::ptrdiff_t alignment_score(basic_string<char_type_, allocator_type_> const &a,
  3110. basic_string<char_type_, allocator_type_> const &b, //
  3111. std::int8_t const (&subs)[256][256], std::int8_t gap = -1) noexcept(false) {
  3112. return ashvardanian::stringzilla::alignment_score(a.view(), b.view(), subs, gap, a.get_allocator());
  3113. }
  3114. /**
  3115. * @brief Overwrites the string slice with random characters from the given alphabet using the random generator.
  3116. *
  3117. * @param string The string to overwrite.
  3118. * @param generator A random generator function object that returns a random number in the range [0, 2^64).
  3119. * @param alphabet A string of characters to choose from.
  3120. */
  3121. template <typename char_type_, typename generator_type_>
  3122. void randomize(basic_string_slice<char_type_> string, generator_type_ &generator,
  3123. string_view alphabet = "abcdefghijklmnopqrstuvwxyz") noexcept {
  3124. static_assert(!std::is_const<char_type_>::value, "The string must be mutable.");
  3125. sz_random_generator_t generator_callback = &_call_random_generator<generator_type_>;
  3126. sz_generate(alphabet.data(), alphabet.size(), string.data(), string.size(), generator_callback, &generator);
  3127. }
  3128. /**
  3129. * @brief Overwrites the string slice with random characters from the given alphabet
  3130. * using `std::rand` as the random generator.
  3131. *
  3132. * @param string The string to overwrite.
  3133. * @param alphabet A string of characters to choose from.
  3134. */
  3135. template <typename char_type_>
  3136. void randomize(basic_string_slice<char_type_> string, string_view alphabet = "abcdefghijklmnopqrstuvwxyz") noexcept {
  3137. randomize(string, &std::rand, alphabet);
  3138. }
  3139. using sorted_idx_t = sz_sorted_idx_t;
  3140. /**
  3141. * @brief Internal data-structure used to forward the arguments to the `sz_sort` function.
  3142. * @see sorted_order
  3143. */
  3144. template <typename objects_type_, typename string_extractor_>
  3145. struct _sequence_args {
  3146. objects_type_ const *begin;
  3147. std::size_t count;
  3148. sorted_idx_t *order;
  3149. string_extractor_ extractor;
  3150. };
  3151. template <typename objects_type_, typename string_extractor_>
  3152. sz_cptr_t _call_sequence_member_start(struct sz_sequence_t const *sequence, sz_size_t i) {
  3153. using handle_type = _sequence_args<objects_type_, string_extractor_>;
  3154. handle_type const *args = reinterpret_cast<handle_type const *>(sequence->handle);
  3155. string_view member = args->extractor(args->begin[i]);
  3156. return member.data();
  3157. }
  3158. template <typename objects_type_, typename string_extractor_>
  3159. sz_size_t _call_sequence_member_length(struct sz_sequence_t const *sequence, sz_size_t i) {
  3160. using handle_type = _sequence_args<objects_type_, string_extractor_>;
  3161. handle_type const *args = reinterpret_cast<handle_type const *>(sequence->handle);
  3162. string_view member = args->extractor(args->begin[i]);
  3163. return static_cast<sz_size_t>(member.size());
  3164. }
  3165. /**
  3166. * @brief Computes the permutation of an array, that would lead to sorted order.
  3167. * The elements of the array must be convertible to a `string_view` with the given extractor.
  3168. * Unlike the `sz_sort` C interface, overwrites the output array.
  3169. *
  3170. * @param[in] begin The pointer to the first element of the array.
  3171. * @param[in] end The pointer to the element after the last element of the array.
  3172. * @param[out] order The pointer to the output array of indices, that will be populated with the permutation.
  3173. * @param[in] extractor The function object that extracts the string from the object.
  3174. *
  3175. * @see sz_sort
  3176. */
  3177. template <typename objects_type_, typename string_extractor_>
  3178. void sorted_order(objects_type_ const *begin, objects_type_ const *end, sorted_idx_t *order,
  3179. string_extractor_ &&extractor) noexcept {
  3180. // Pack the arguments into a single structure to reference it from the callback.
  3181. _sequence_args<objects_type_, string_extractor_> args = {begin, static_cast<std::size_t>(end - begin), order,
  3182. std::forward<string_extractor_>(extractor)};
  3183. // Populate the array with `iota`-style order.
  3184. for (std::size_t i = 0; i != args.count; ++i) order[i] = static_cast<sorted_idx_t>(i);
  3185. sz_sequence_t array;
  3186. array.order = reinterpret_cast<sorted_idx_t *>(order);
  3187. array.count = args.count;
  3188. array.handle = &args;
  3189. array.get_start = _call_sequence_member_start<objects_type_, string_extractor_>;
  3190. array.get_length = _call_sequence_member_length<objects_type_, string_extractor_>;
  3191. sz_sort(&array);
  3192. }
  3193. #if !SZ_AVOID_STL
  3194. /**
  3195. * @brief Computes the Rabin-Karp-like rolling binary fingerprint of a string.
  3196. * @see sz_hashes
  3197. */
  3198. template <std::size_t bitset_bits_, typename char_type_>
  3199. void hashes_fingerprint(basic_string_slice<char_type_> const &str, std::size_t window_length,
  3200. std::bitset<bitset_bits_> &fingerprint) noexcept {
  3201. constexpr std::size_t fingerprint_bytes = sizeof(std::bitset<bitset_bits_>);
  3202. return sz_hashes_fingerprint(str.data(), str.size(), window_length, (sz_ptr_t)&fingerprint, fingerprint_bytes);
  3203. }
  3204. /**
  3205. * @brief Computes the Rabin-Karp-like rolling binary fingerprint of a string.
  3206. * @see sz_hashes
  3207. */
  3208. template <std::size_t bitset_bits_, typename char_type_>
  3209. std::bitset<bitset_bits_> hashes_fingerprint(basic_string_slice<char_type_> const &str,
  3210. std::size_t window_length) noexcept {
  3211. std::bitset<bitset_bits_> fingerprint;
  3212. ashvardanian::stringzilla::hashes_fingerprint(str, window_length, fingerprint);
  3213. return fingerprint;
  3214. }
  3215. /**
  3216. * @brief Computes the Rabin-Karp-like rolling binary fingerprint of a string.
  3217. * @see sz_hashes
  3218. */
  3219. template <std::size_t bitset_bits_, typename char_type_>
  3220. std::bitset<bitset_bits_> hashes_fingerprint(basic_string<char_type_> const &str, std::size_t window_length) noexcept {
  3221. return ashvardanian::stringzilla::hashes_fingerprint<bitset_bits_>(str.view(), window_length);
  3222. }
  3223. /**
  3224. * @brief Computes the permutation of an array, that would lead to sorted order.
  3225. * @return The array of indices, that will be populated with the permutation.
  3226. * @throw `std::bad_alloc` if the allocation fails.
  3227. */
  3228. template <typename objects_type_, typename string_extractor_>
  3229. std::vector<sorted_idx_t> sorted_order(objects_type_ const *begin, objects_type_ const *end,
  3230. string_extractor_ &&extractor) noexcept(false) {
  3231. std::vector<sorted_idx_t> order(end - begin);
  3232. sorted_order(begin, end, order.data(), std::forward<string_extractor_>(extractor));
  3233. return order;
  3234. }
  3235. /**
  3236. * @brief Computes the permutation of an array, that would lead to sorted order.
  3237. * @return The array of indices, that will be populated with the permutation.
  3238. * @throw `std::bad_alloc` if the allocation fails.
  3239. */
  3240. template <typename string_like_type_>
  3241. std::vector<sorted_idx_t> sorted_order(string_like_type_ const *begin, string_like_type_ const *end) noexcept(false) {
  3242. static_assert(std::is_convertible<string_like_type_, string_view>::value,
  3243. "The type must be convertible to string_view.");
  3244. return sorted_order(begin, end, [](string_like_type_ const &s) -> string_view { return s; });
  3245. }
  3246. /**
  3247. * @brief Computes the permutation of an array, that would lead to sorted order.
  3248. * @return The array of indices, that will be populated with the permutation.
  3249. * @throw `std::bad_alloc` if the allocation fails.
  3250. */
  3251. template <typename string_like_type_>
  3252. std::vector<sorted_idx_t> sorted_order(std::vector<string_like_type_> const &array) noexcept(false) {
  3253. static_assert(std::is_convertible<string_like_type_, string_view>::value,
  3254. "The type must be convertible to string_view.");
  3255. return sorted_order(array.data(), array.data() + array.size(),
  3256. [](string_like_type_ const &s) -> string_view { return s; });
  3257. }
  3258. #endif
  3259. } // namespace stringzilla
  3260. } // namespace ashvardanian
  3261. #pragma region STL Specializations
  3262. namespace std {
  3263. template <>
  3264. struct hash<ashvardanian::stringzilla::string_view> {
  3265. size_t operator()(ashvardanian::stringzilla::string_view str) const noexcept { return str.hash(); }
  3266. };
  3267. template <>
  3268. struct hash<ashvardanian::stringzilla::string> {
  3269. size_t operator()(ashvardanian::stringzilla::string const &str) const noexcept { return str.hash(); }
  3270. };
  3271. } // namespace std
  3272. #pragma endregion
  3273. #endif // STRINGZILLA_HPP_