You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

lpcode.c 31KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014
  1. /*
  2. ** $Id: lpcode.c $
  3. ** Copyright 2007, Lua.org & PUC-Rio (see 'lpeg.html' for license)
  4. */
  5. #include <limits.h>
  6. #include "lua.h"
  7. #include "lauxlib.h"
  8. #include "lptypes.h"
  9. #include "lpcode.h"
  10. /* signals a "no-instruction */
  11. #define NOINST -1
  12. static const Charset fullset_ =
  13. {{0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
  14. 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
  15. 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
  16. 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF}};
  17. static const Charset *fullset = &fullset_;
  18. /*
  19. ** {======================================================
  20. ** Analysis and some optimizations
  21. ** =======================================================
  22. */
  23. /*
  24. ** Check whether a charset is empty (returns IFail), singleton (IChar),
  25. ** full (IAny), or none of those (ISet). When singleton, '*c' returns
  26. ** which character it is. (When generic set, the set was the input,
  27. ** so there is no need to return it.)
  28. */
  29. static Opcode charsettype (const byte *cs, int *c) {
  30. int count = 0; /* number of characters in the set */
  31. int i;
  32. int candidate = -1; /* candidate position for the singleton char */
  33. for (i = 0; i < CHARSETSIZE; i++) { /* for each byte */
  34. int b = cs[i];
  35. if (b == 0) { /* is byte empty? */
  36. if (count > 1) /* was set neither empty nor singleton? */
  37. return ISet; /* neither full nor empty nor singleton */
  38. /* else set is still empty or singleton */
  39. }
  40. else if (b == 0xFF) { /* is byte full? */
  41. if (count < (i * BITSPERCHAR)) /* was set not full? */
  42. return ISet; /* neither full nor empty nor singleton */
  43. else count += BITSPERCHAR; /* set is still full */
  44. }
  45. else if ((b & (b - 1)) == 0) { /* has byte only one bit? */
  46. if (count > 0) /* was set not empty? */
  47. return ISet; /* neither full nor empty nor singleton */
  48. else { /* set has only one char till now; track it */
  49. count++;
  50. candidate = i;
  51. }
  52. }
  53. else return ISet; /* byte is neither empty, full, nor singleton */
  54. }
  55. switch (count) {
  56. case 0: return IFail; /* empty set */
  57. case 1: { /* singleton; find character bit inside byte */
  58. int b = cs[candidate];
  59. *c = candidate * BITSPERCHAR;
  60. if ((b & 0xF0) != 0) { *c += 4; b >>= 4; }
  61. if ((b & 0x0C) != 0) { *c += 2; b >>= 2; }
  62. if ((b & 0x02) != 0) { *c += 1; }
  63. return IChar;
  64. }
  65. default: {
  66. assert(count == CHARSETSIZE * BITSPERCHAR); /* full set */
  67. return IAny;
  68. }
  69. }
  70. }
  71. /*
  72. ** A few basic operations on Charsets
  73. */
  74. static void cs_complement (Charset *cs) {
  75. loopset(i, cs->cs[i] = ~cs->cs[i]);
  76. }
  77. static int cs_equal (const byte *cs1, const byte *cs2) {
  78. loopset(i, if (cs1[i] != cs2[i]) return 0);
  79. return 1;
  80. }
  81. static int cs_disjoint (const Charset *cs1, const Charset *cs2) {
  82. loopset(i, if ((cs1->cs[i] & cs2->cs[i]) != 0) return 0;)
  83. return 1;
  84. }
  85. /*
  86. ** If 'tree' is a 'char' pattern (TSet, TChar, TAny), convert it into a
  87. ** charset and return 1; else return 0.
  88. */
  89. int tocharset (TTree *tree, Charset *cs) {
  90. switch (tree->tag) {
  91. case TSet: { /* copy set */
  92. loopset(i, cs->cs[i] = treebuffer(tree)[i]);
  93. return 1;
  94. }
  95. case TChar: { /* only one char */
  96. assert(0 <= tree->u.n && tree->u.n <= UCHAR_MAX);
  97. loopset(i, cs->cs[i] = 0); /* erase all chars */
  98. setchar(cs->cs, tree->u.n); /* add that one */
  99. return 1;
  100. }
  101. case TAny: {
  102. loopset(i, cs->cs[i] = 0xFF); /* add all characters to the set */
  103. return 1;
  104. }
  105. default: return 0;
  106. }
  107. }
  108. /*
  109. ** Visit a TCall node taking care to stop recursion. If node not yet
  110. ** visited, return 'f(sib2(tree))', otherwise return 'def' (default
  111. ** value)
  112. */
  113. static int callrecursive (TTree *tree, int f (TTree *t), int def) {
  114. int key = tree->key;
  115. assert(tree->tag == TCall);
  116. assert(sib2(tree)->tag == TRule);
  117. if (key == 0) /* node already visited? */
  118. return def; /* return default value */
  119. else { /* first visit */
  120. int result;
  121. tree->key = 0; /* mark call as already visited */
  122. result = f(sib2(tree)); /* go to called rule */
  123. tree->key = key; /* restore tree */
  124. return result;
  125. }
  126. }
  127. /*
  128. ** Check whether a pattern tree has captures
  129. */
  130. int hascaptures (TTree *tree) {
  131. tailcall:
  132. switch (tree->tag) {
  133. case TCapture: case TRunTime:
  134. return 1;
  135. case TCall:
  136. return callrecursive(tree, hascaptures, 0);
  137. case TRule: /* do not follow siblings */
  138. tree = sib1(tree); goto tailcall;
  139. case TOpenCall: assert(0);
  140. default: {
  141. switch (numsiblings[tree->tag]) {
  142. case 1: /* return hascaptures(sib1(tree)); */
  143. tree = sib1(tree); goto tailcall;
  144. case 2:
  145. if (hascaptures(sib1(tree)))
  146. return 1;
  147. /* else return hascaptures(sib2(tree)); */
  148. tree = sib2(tree); goto tailcall;
  149. default: assert(numsiblings[tree->tag] == 0); return 0;
  150. }
  151. }
  152. }
  153. }
  154. /*
  155. ** Checks how a pattern behaves regarding the empty string,
  156. ** in one of two different ways:
  157. ** A pattern is *nullable* if it can match without consuming any character;
  158. ** A pattern is *nofail* if it never fails for any string
  159. ** (including the empty string).
  160. ** The difference is only for predicates and run-time captures;
  161. ** for other patterns, the two properties are equivalent.
  162. ** (With predicates, &'a' is nullable but not nofail. Of course,
  163. ** nofail => nullable.)
  164. ** These functions are all convervative in the following way:
  165. ** p is nullable => nullable(p)
  166. ** nofail(p) => p cannot fail
  167. ** The function assumes that TOpenCall is not nullable;
  168. ** this will be checked again when the grammar is fixed.
  169. ** Run-time captures can do whatever they want, so the result
  170. ** is conservative.
  171. */
  172. int checkaux (TTree *tree, int pred) {
  173. tailcall:
  174. switch (tree->tag) {
  175. case TChar: case TSet: case TAny:
  176. case TFalse: case TOpenCall:
  177. return 0; /* not nullable */
  178. case TRep: case TTrue:
  179. return 1; /* no fail */
  180. case TNot: case TBehind: /* can match empty, but can fail */
  181. if (pred == PEnofail) return 0;
  182. else return 1; /* PEnullable */
  183. case TAnd: /* can match empty; fail iff body does */
  184. if (pred == PEnullable) return 1;
  185. /* else return checkaux(sib1(tree), pred); */
  186. tree = sib1(tree); goto tailcall;
  187. case TRunTime: /* can fail; match empty iff body does */
  188. if (pred == PEnofail) return 0;
  189. /* else return checkaux(sib1(tree), pred); */
  190. tree = sib1(tree); goto tailcall;
  191. case TSeq:
  192. if (!checkaux(sib1(tree), pred)) return 0;
  193. /* else return checkaux(sib2(tree), pred); */
  194. tree = sib2(tree); goto tailcall;
  195. case TChoice:
  196. if (checkaux(sib2(tree), pred)) return 1;
  197. /* else return checkaux(sib1(tree), pred); */
  198. tree = sib1(tree); goto tailcall;
  199. case TCapture: case TGrammar: case TRule:
  200. /* return checkaux(sib1(tree), pred); */
  201. tree = sib1(tree); goto tailcall;
  202. case TCall: /* return checkaux(sib2(tree), pred); */
  203. tree = sib2(tree); goto tailcall;
  204. default: assert(0); return 0;
  205. }
  206. }
  207. /*
  208. ** number of characters to match a pattern (or -1 if variable)
  209. */
  210. int fixedlen (TTree *tree) {
  211. int len = 0; /* to accumulate in tail calls */
  212. tailcall:
  213. switch (tree->tag) {
  214. case TChar: case TSet: case TAny:
  215. return len + 1;
  216. case TFalse: case TTrue: case TNot: case TAnd: case TBehind:
  217. return len;
  218. case TRep: case TRunTime: case TOpenCall:
  219. return -1;
  220. case TCapture: case TRule: case TGrammar:
  221. /* return fixedlen(sib1(tree)); */
  222. tree = sib1(tree); goto tailcall;
  223. case TCall: {
  224. int n1 = callrecursive(tree, fixedlen, -1);
  225. if (n1 < 0)
  226. return -1;
  227. else
  228. return len + n1;
  229. }
  230. case TSeq: {
  231. int n1 = fixedlen(sib1(tree));
  232. if (n1 < 0)
  233. return -1;
  234. /* else return fixedlen(sib2(tree)) + len; */
  235. len += n1; tree = sib2(tree); goto tailcall;
  236. }
  237. case TChoice: {
  238. int n1 = fixedlen(sib1(tree));
  239. int n2 = fixedlen(sib2(tree));
  240. if (n1 != n2 || n1 < 0)
  241. return -1;
  242. else
  243. return len + n1;
  244. }
  245. default: assert(0); return 0;
  246. };
  247. }
  248. /*
  249. ** Computes the 'first set' of a pattern.
  250. ** The result is a conservative aproximation:
  251. ** match p ax -> x (for some x) ==> a belongs to first(p)
  252. ** or
  253. ** a not in first(p) ==> match p ax -> fail (for all x)
  254. **
  255. ** The set 'follow' is the first set of what follows the
  256. ** pattern (full set if nothing follows it).
  257. **
  258. ** The function returns 0 when this resulting set can be used for
  259. ** test instructions that avoid the pattern altogether.
  260. ** A non-zero return can happen for two reasons:
  261. ** 1) match p '' -> '' ==> return has bit 1 set
  262. ** (tests cannot be used because they would always fail for an empty input);
  263. ** 2) there is a match-time capture ==> return has bit 2 set
  264. ** (optimizations should not bypass match-time captures).
  265. */
  266. static int getfirst (TTree *tree, const Charset *follow, Charset *firstset) {
  267. tailcall:
  268. switch (tree->tag) {
  269. case TChar: case TSet: case TAny: {
  270. tocharset(tree, firstset);
  271. return 0;
  272. }
  273. case TTrue: {
  274. loopset(i, firstset->cs[i] = follow->cs[i]);
  275. return 1; /* accepts the empty string */
  276. }
  277. case TFalse: {
  278. loopset(i, firstset->cs[i] = 0);
  279. return 0;
  280. }
  281. case TChoice: {
  282. Charset csaux;
  283. int e1 = getfirst(sib1(tree), follow, firstset);
  284. int e2 = getfirst(sib2(tree), follow, &csaux);
  285. loopset(i, firstset->cs[i] |= csaux.cs[i]);
  286. return e1 | e2;
  287. }
  288. case TSeq: {
  289. if (!nullable(sib1(tree))) {
  290. /* when p1 is not nullable, p2 has nothing to contribute;
  291. return getfirst(sib1(tree), fullset, firstset); */
  292. tree = sib1(tree); follow = fullset; goto tailcall;
  293. }
  294. else { /* FIRST(p1 p2, fl) = FIRST(p1, FIRST(p2, fl)) */
  295. Charset csaux;
  296. int e2 = getfirst(sib2(tree), follow, &csaux);
  297. int e1 = getfirst(sib1(tree), &csaux, firstset);
  298. if (e1 == 0) return 0; /* 'e1' ensures that first can be used */
  299. else if ((e1 | e2) & 2) /* one of the children has a matchtime? */
  300. return 2; /* pattern has a matchtime capture */
  301. else return e2; /* else depends on 'e2' */
  302. }
  303. }
  304. case TRep: {
  305. getfirst(sib1(tree), follow, firstset);
  306. loopset(i, firstset->cs[i] |= follow->cs[i]);
  307. return 1; /* accept the empty string */
  308. }
  309. case TCapture: case TGrammar: case TRule: {
  310. /* return getfirst(sib1(tree), follow, firstset); */
  311. tree = sib1(tree); goto tailcall;
  312. }
  313. case TRunTime: { /* function invalidates any follow info. */
  314. int e = getfirst(sib1(tree), fullset, firstset);
  315. if (e) return 2; /* function is not "protected"? */
  316. else return 0; /* pattern inside capture ensures first can be used */
  317. }
  318. case TCall: {
  319. /* return getfirst(sib2(tree), follow, firstset); */
  320. tree = sib2(tree); goto tailcall;
  321. }
  322. case TAnd: {
  323. int e = getfirst(sib1(tree), follow, firstset);
  324. loopset(i, firstset->cs[i] &= follow->cs[i]);
  325. return e;
  326. }
  327. case TNot: {
  328. if (tocharset(sib1(tree), firstset)) {
  329. cs_complement(firstset);
  330. return 1;
  331. }
  332. /* else go through */
  333. }
  334. case TBehind: { /* instruction gives no new information */
  335. /* call 'getfirst' only to check for math-time captures */
  336. int e = getfirst(sib1(tree), follow, firstset);
  337. loopset(i, firstset->cs[i] = follow->cs[i]); /* uses follow */
  338. return e | 1; /* always can accept the empty string */
  339. }
  340. default: assert(0); return 0;
  341. }
  342. }
  343. /*
  344. ** If 'headfail(tree)' true, then 'tree' can fail only depending on the
  345. ** next character of the subject.
  346. */
  347. static int headfail (TTree *tree) {
  348. tailcall:
  349. switch (tree->tag) {
  350. case TChar: case TSet: case TAny: case TFalse:
  351. return 1;
  352. case TTrue: case TRep: case TRunTime: case TNot:
  353. case TBehind:
  354. return 0;
  355. case TCapture: case TGrammar: case TRule: case TAnd:
  356. tree = sib1(tree); goto tailcall; /* return headfail(sib1(tree)); */
  357. case TCall:
  358. tree = sib2(tree); goto tailcall; /* return headfail(sib2(tree)); */
  359. case TSeq:
  360. if (!nofail(sib2(tree))) return 0;
  361. /* else return headfail(sib1(tree)); */
  362. tree = sib1(tree); goto tailcall;
  363. case TChoice:
  364. if (!headfail(sib1(tree))) return 0;
  365. /* else return headfail(sib2(tree)); */
  366. tree = sib2(tree); goto tailcall;
  367. default: assert(0); return 0;
  368. }
  369. }
  370. /*
  371. ** Check whether the code generation for the given tree can benefit
  372. ** from a follow set (to avoid computing the follow set when it is
  373. ** not needed)
  374. */
  375. static int needfollow (TTree *tree) {
  376. tailcall:
  377. switch (tree->tag) {
  378. case TChar: case TSet: case TAny:
  379. case TFalse: case TTrue: case TAnd: case TNot:
  380. case TRunTime: case TGrammar: case TCall: case TBehind:
  381. return 0;
  382. case TChoice: case TRep:
  383. return 1;
  384. case TCapture:
  385. tree = sib1(tree); goto tailcall;
  386. case TSeq:
  387. tree = sib2(tree); goto tailcall;
  388. default: assert(0); return 0;
  389. }
  390. }
  391. /* }====================================================== */
  392. /*
  393. ** {======================================================
  394. ** Code generation
  395. ** =======================================================
  396. */
  397. /*
  398. ** size of an instruction
  399. */
  400. int sizei (const Instruction *i) {
  401. switch((Opcode)i->i.code) {
  402. case ISet: case ISpan: return CHARSETINSTSIZE;
  403. case ITestSet: return CHARSETINSTSIZE + 1;
  404. case ITestChar: case ITestAny: case IChoice: case IJmp: case ICall:
  405. case IOpenCall: case ICommit: case IPartialCommit: case IBackCommit:
  406. return 2;
  407. default: return 1;
  408. }
  409. }
  410. /*
  411. ** state for the compiler
  412. */
  413. typedef struct CompileState {
  414. Pattern *p; /* pattern being compiled */
  415. int ncode; /* next position in p->code to be filled */
  416. lua_State *L;
  417. } CompileState;
  418. /*
  419. ** code generation is recursive; 'opt' indicates that the code is being
  420. ** generated as the last thing inside an optional pattern (so, if that
  421. ** code is optional too, it can reuse the 'IChoice' already in place for
  422. ** the outer pattern). 'tt' points to a previous test protecting this
  423. ** code (or NOINST). 'fl' is the follow set of the pattern.
  424. */
  425. static void codegen (CompileState *compst, TTree *tree, int opt, int tt,
  426. const Charset *fl);
  427. void realloccode (lua_State *L, Pattern *p, int nsize) {
  428. void *ud;
  429. lua_Alloc f = lua_getallocf(L, &ud);
  430. void *newblock = f(ud, p->code, p->codesize * sizeof(Instruction),
  431. nsize * sizeof(Instruction));
  432. if (newblock == NULL && nsize > 0)
  433. luaL_error(L, "not enough memory");
  434. p->code = (Instruction *)newblock;
  435. p->codesize = nsize;
  436. }
  437. static int nextinstruction (CompileState *compst) {
  438. int size = compst->p->codesize;
  439. if (compst->ncode >= size)
  440. realloccode(compst->L, compst->p, size * 2);
  441. return compst->ncode++;
  442. }
  443. #define getinstr(cs,i) ((cs)->p->code[i])
  444. static int addinstruction (CompileState *compst, Opcode op, int aux) {
  445. int i = nextinstruction(compst);
  446. getinstr(compst, i).i.code = op;
  447. getinstr(compst, i).i.aux = aux;
  448. return i;
  449. }
  450. /*
  451. ** Add an instruction followed by space for an offset (to be set later)
  452. */
  453. static int addoffsetinst (CompileState *compst, Opcode op) {
  454. int i = addinstruction(compst, op, 0); /* instruction */
  455. addinstruction(compst, (Opcode)0, 0); /* open space for offset */
  456. assert(op == ITestSet || sizei(&getinstr(compst, i)) == 2);
  457. return i;
  458. }
  459. /*
  460. ** Set the offset of an instruction
  461. */
  462. static void setoffset (CompileState *compst, int instruction, int offset) {
  463. getinstr(compst, instruction + 1).offset = offset;
  464. }
  465. /*
  466. ** Add a capture instruction:
  467. ** 'op' is the capture instruction; 'cap' the capture kind;
  468. ** 'key' the key into ktable; 'aux' is the optional capture offset
  469. **
  470. */
  471. static int addinstcap (CompileState *compst, Opcode op, int cap, int key,
  472. int aux) {
  473. int i = addinstruction(compst, op, joinkindoff(cap, aux));
  474. getinstr(compst, i).i.key = key;
  475. return i;
  476. }
  477. #define gethere(compst) ((compst)->ncode)
  478. #define target(code,i) ((i) + code[i + 1].offset)
  479. /*
  480. ** Patch 'instruction' to jump to 'target'
  481. */
  482. static void jumptothere (CompileState *compst, int instruction, int target) {
  483. if (instruction >= 0)
  484. setoffset(compst, instruction, target - instruction);
  485. }
  486. /*
  487. ** Patch 'instruction' to jump to current position
  488. */
  489. static void jumptohere (CompileState *compst, int instruction) {
  490. jumptothere(compst, instruction, gethere(compst));
  491. }
  492. /*
  493. ** Code an IChar instruction, or IAny if there is an equivalent
  494. ** test dominating it
  495. */
  496. static void codechar (CompileState *compst, int c, int tt) {
  497. if (tt >= 0 && getinstr(compst, tt).i.code == ITestChar &&
  498. getinstr(compst, tt).i.aux == c)
  499. addinstruction(compst, IAny, 0);
  500. else
  501. addinstruction(compst, IChar, c);
  502. }
  503. /*
  504. ** Add a charset posfix to an instruction
  505. */
  506. static void addcharset (CompileState *compst, const byte *cs) {
  507. int p = gethere(compst);
  508. int i;
  509. for (i = 0; i < (int)CHARSETINSTSIZE - 1; i++)
  510. nextinstruction(compst); /* space for buffer */
  511. /* fill buffer with charset */
  512. loopset(j, getinstr(compst, p).buff[j] = cs[j]);
  513. }
  514. /*
  515. ** code a char set, optimizing unit sets for IChar, "complete"
  516. ** sets for IAny, and empty sets for IFail; also use an IAny
  517. ** when instruction is dominated by an equivalent test.
  518. */
  519. static void codecharset (CompileState *compst, const byte *cs, int tt) {
  520. int c = 0; /* (=) to avoid warnings */
  521. Opcode op = charsettype(cs, &c);
  522. switch (op) {
  523. case IChar: codechar(compst, c, tt); break;
  524. case ISet: { /* non-trivial set? */
  525. if (tt >= 0 && getinstr(compst, tt).i.code == ITestSet &&
  526. cs_equal(cs, getinstr(compst, tt + 2).buff))
  527. addinstruction(compst, IAny, 0);
  528. else {
  529. addinstruction(compst, ISet, 0);
  530. addcharset(compst, cs);
  531. }
  532. break;
  533. }
  534. default: addinstruction(compst, op, c); break;
  535. }
  536. }
  537. /*
  538. ** code a test set, optimizing unit sets for ITestChar, "complete"
  539. ** sets for ITestAny, and empty sets for IJmp (always fails).
  540. ** 'e' is true iff test should accept the empty string. (Test
  541. ** instructions in the current VM never accept the empty string.)
  542. */
  543. static int codetestset (CompileState *compst, Charset *cs, int e) {
  544. if (e) return NOINST; /* no test */
  545. else {
  546. int c = 0;
  547. Opcode op = charsettype(cs->cs, &c);
  548. switch (op) {
  549. case IFail: return addoffsetinst(compst, IJmp); /* always jump */
  550. case IAny: return addoffsetinst(compst, ITestAny);
  551. case IChar: {
  552. int i = addoffsetinst(compst, ITestChar);
  553. getinstr(compst, i).i.aux = c;
  554. return i;
  555. }
  556. case ISet: {
  557. int i = addoffsetinst(compst, ITestSet);
  558. addcharset(compst, cs->cs);
  559. return i;
  560. }
  561. default: assert(0); return 0;
  562. }
  563. }
  564. }
  565. /*
  566. ** Find the final destination of a sequence of jumps
  567. */
  568. static int finaltarget (Instruction *code, int i) {
  569. while (code[i].i.code == IJmp)
  570. i = target(code, i);
  571. return i;
  572. }
  573. /*
  574. ** final label (after traversing any jumps)
  575. */
  576. static int finallabel (Instruction *code, int i) {
  577. return finaltarget(code, target(code, i));
  578. }
  579. /*
  580. ** <behind(p)> == behind n; <p> (where n = fixedlen(p))
  581. */
  582. static void codebehind (CompileState *compst, TTree *tree) {
  583. if (tree->u.n > 0)
  584. addinstruction(compst, IBehind, tree->u.n);
  585. codegen(compst, sib1(tree), 0, NOINST, fullset);
  586. }
  587. /*
  588. ** Choice; optimizations:
  589. ** - when p1 is headfail or
  590. ** when first(p1) and first(p2) are disjoint, than
  591. ** a character not in first(p1) cannot go to p1, and a character
  592. ** in first(p1) cannot go to p2 (at it is not in first(p2)).
  593. ** (The optimization is not valid if p1 accepts the empty string,
  594. ** as then there is no character at all...)
  595. ** - when p2 is empty and opt is true; a IPartialCommit can reuse
  596. ** the Choice already active in the stack.
  597. */
  598. static void codechoice (CompileState *compst, TTree *p1, TTree *p2, int opt,
  599. const Charset *fl) {
  600. int emptyp2 = (p2->tag == TTrue);
  601. Charset cs1, cs2;
  602. int e1 = getfirst(p1, fullset, &cs1);
  603. if (headfail(p1) ||
  604. (!e1 && (getfirst(p2, fl, &cs2), cs_disjoint(&cs1, &cs2)))) {
  605. /* <p1 / p2> == test (fail(p1)) -> L1 ; p1 ; jmp L2; L1: p2; L2: */
  606. int test = codetestset(compst, &cs1, 0);
  607. int jmp = NOINST;
  608. codegen(compst, p1, 0, test, fl);
  609. if (!emptyp2)
  610. jmp = addoffsetinst(compst, IJmp);
  611. jumptohere(compst, test);
  612. codegen(compst, p2, opt, NOINST, fl);
  613. jumptohere(compst, jmp);
  614. }
  615. else if (opt && emptyp2) {
  616. /* p1? == IPartialCommit; p1 */
  617. jumptohere(compst, addoffsetinst(compst, IPartialCommit));
  618. codegen(compst, p1, 1, NOINST, fullset);
  619. }
  620. else {
  621. /* <p1 / p2> ==
  622. test(first(p1)) -> L1; choice L1; <p1>; commit L2; L1: <p2>; L2: */
  623. int pcommit;
  624. int test = codetestset(compst, &cs1, e1);
  625. int pchoice = addoffsetinst(compst, IChoice);
  626. codegen(compst, p1, emptyp2, test, fullset);
  627. pcommit = addoffsetinst(compst, ICommit);
  628. jumptohere(compst, pchoice);
  629. jumptohere(compst, test);
  630. codegen(compst, p2, opt, NOINST, fl);
  631. jumptohere(compst, pcommit);
  632. }
  633. }
  634. /*
  635. ** And predicate
  636. ** optimization: fixedlen(p) = n ==> <&p> == <p>; behind n
  637. ** (valid only when 'p' has no captures)
  638. */
  639. static void codeand (CompileState *compst, TTree *tree, int tt) {
  640. int n = fixedlen(tree);
  641. if (n >= 0 && n <= MAXBEHIND && !hascaptures(tree)) {
  642. codegen(compst, tree, 0, tt, fullset);
  643. if (n > 0)
  644. addinstruction(compst, IBehind, n);
  645. }
  646. else { /* default: Choice L1; p1; BackCommit L2; L1: Fail; L2: */
  647. int pcommit;
  648. int pchoice = addoffsetinst(compst, IChoice);
  649. codegen(compst, tree, 0, tt, fullset);
  650. pcommit = addoffsetinst(compst, IBackCommit);
  651. jumptohere(compst, pchoice);
  652. addinstruction(compst, IFail, 0);
  653. jumptohere(compst, pcommit);
  654. }
  655. }
  656. /*
  657. ** Captures: if pattern has fixed (and not too big) length, and it
  658. ** has no nested captures, use a single IFullCapture instruction
  659. ** after the match; otherwise, enclose the pattern with OpenCapture -
  660. ** CloseCapture.
  661. */
  662. static void codecapture (CompileState *compst, TTree *tree, int tt,
  663. const Charset *fl) {
  664. int len = fixedlen(sib1(tree));
  665. if (len >= 0 && len <= MAXOFF && !hascaptures(sib1(tree))) {
  666. codegen(compst, sib1(tree), 0, tt, fl);
  667. addinstcap(compst, IFullCapture, tree->cap, tree->key, len);
  668. }
  669. else {
  670. addinstcap(compst, IOpenCapture, tree->cap, tree->key, 0);
  671. codegen(compst, sib1(tree), 0, tt, fl);
  672. addinstcap(compst, ICloseCapture, Cclose, 0, 0);
  673. }
  674. }
  675. static void coderuntime (CompileState *compst, TTree *tree, int tt) {
  676. addinstcap(compst, IOpenCapture, Cgroup, tree->key, 0);
  677. codegen(compst, sib1(tree), 0, tt, fullset);
  678. addinstcap(compst, ICloseRunTime, Cclose, 0, 0);
  679. }
  680. /*
  681. ** Repetion; optimizations:
  682. ** When pattern is a charset, can use special instruction ISpan.
  683. ** When pattern is head fail, or if it starts with characters that
  684. ** are disjoint from what follows the repetions, a simple test
  685. ** is enough (a fail inside the repetition would backtrack to fail
  686. ** again in the following pattern, so there is no need for a choice).
  687. ** When 'opt' is true, the repetion can reuse the Choice already
  688. ** active in the stack.
  689. */
  690. static void coderep (CompileState *compst, TTree *tree, int opt,
  691. const Charset *fl) {
  692. Charset st;
  693. if (tocharset(tree, &st)) {
  694. addinstruction(compst, ISpan, 0);
  695. addcharset(compst, st.cs);
  696. }
  697. else {
  698. int e1 = getfirst(tree, fullset, &st);
  699. if (headfail(tree) || (!e1 && cs_disjoint(&st, fl))) {
  700. /* L1: test (fail(p1)) -> L2; <p>; jmp L1; L2: */
  701. int jmp;
  702. int test = codetestset(compst, &st, 0);
  703. codegen(compst, tree, 0, test, fullset);
  704. jmp = addoffsetinst(compst, IJmp);
  705. jumptohere(compst, test);
  706. jumptothere(compst, jmp, test);
  707. }
  708. else {
  709. /* test(fail(p1)) -> L2; choice L2; L1: <p>; partialcommit L1; L2: */
  710. /* or (if 'opt'): partialcommit L1; L1: <p>; partialcommit L1; */
  711. int commit, l2;
  712. int test = codetestset(compst, &st, e1);
  713. int pchoice = NOINST;
  714. if (opt)
  715. jumptohere(compst, addoffsetinst(compst, IPartialCommit));
  716. else
  717. pchoice = addoffsetinst(compst, IChoice);
  718. l2 = gethere(compst);
  719. codegen(compst, tree, 0, NOINST, fullset);
  720. commit = addoffsetinst(compst, IPartialCommit);
  721. jumptothere(compst, commit, l2);
  722. jumptohere(compst, pchoice);
  723. jumptohere(compst, test);
  724. }
  725. }
  726. }
  727. /*
  728. ** Not predicate; optimizations:
  729. ** In any case, if first test fails, 'not' succeeds, so it can jump to
  730. ** the end. If pattern is headfail, that is all (it cannot fail
  731. ** in other parts); this case includes 'not' of simple sets. Otherwise,
  732. ** use the default code (a choice plus a failtwice).
  733. */
  734. static void codenot (CompileState *compst, TTree *tree) {
  735. Charset st;
  736. int e = getfirst(tree, fullset, &st);
  737. int test = codetestset(compst, &st, e);
  738. if (headfail(tree)) /* test (fail(p1)) -> L1; fail; L1: */
  739. addinstruction(compst, IFail, 0);
  740. else {
  741. /* test(fail(p))-> L1; choice L1; <p>; failtwice; L1: */
  742. int pchoice = addoffsetinst(compst, IChoice);
  743. codegen(compst, tree, 0, NOINST, fullset);
  744. addinstruction(compst, IFailTwice, 0);
  745. jumptohere(compst, pchoice);
  746. }
  747. jumptohere(compst, test);
  748. }
  749. /*
  750. ** change open calls to calls, using list 'positions' to find
  751. ** correct offsets; also optimize tail calls
  752. */
  753. static void correctcalls (CompileState *compst, int *positions,
  754. int from, int to) {
  755. int i;
  756. Instruction *code = compst->p->code;
  757. for (i = from; i < to; i += sizei(&code[i])) {
  758. if (code[i].i.code == IOpenCall) {
  759. int n = code[i].i.key; /* rule number */
  760. int rule = positions[n]; /* rule position */
  761. assert(rule == from || code[rule - 1].i.code == IRet);
  762. if (code[finaltarget(code, i + 2)].i.code == IRet) /* call; ret ? */
  763. code[i].i.code = IJmp; /* tail call */
  764. else
  765. code[i].i.code = ICall;
  766. jumptothere(compst, i, rule); /* call jumps to respective rule */
  767. }
  768. }
  769. assert(i == to);
  770. }
  771. /*
  772. ** Code for a grammar:
  773. ** call L1; jmp L2; L1: rule 1; ret; rule 2; ret; ...; L2:
  774. */
  775. static void codegrammar (CompileState *compst, TTree *grammar) {
  776. int positions[MAXRULES];
  777. int rulenumber = 0;
  778. TTree *rule;
  779. int firstcall = addoffsetinst(compst, ICall); /* call initial rule */
  780. int jumptoend = addoffsetinst(compst, IJmp); /* jump to the end */
  781. int start = gethere(compst); /* here starts the initial rule */
  782. jumptohere(compst, firstcall);
  783. for (rule = sib1(grammar); rule->tag == TRule; rule = sib2(rule)) {
  784. positions[rulenumber++] = gethere(compst); /* save rule position */
  785. codegen(compst, sib1(rule), 0, NOINST, fullset); /* code rule */
  786. addinstruction(compst, IRet, 0);
  787. }
  788. assert(rule->tag == TTrue);
  789. jumptohere(compst, jumptoend);
  790. correctcalls(compst, positions, start, gethere(compst));
  791. }
  792. static void codecall (CompileState *compst, TTree *call) {
  793. int c = addoffsetinst(compst, IOpenCall); /* to be corrected later */
  794. getinstr(compst, c).i.key = sib2(call)->cap; /* rule number */
  795. assert(sib2(call)->tag == TRule);
  796. }
  797. /*
  798. ** Code first child of a sequence
  799. ** (second child is called in-place to allow tail call)
  800. ** Return 'tt' for second child
  801. */
  802. static int codeseq1 (CompileState *compst, TTree *p1, TTree *p2,
  803. int tt, const Charset *fl) {
  804. if (needfollow(p1)) {
  805. Charset fl1;
  806. getfirst(p2, fl, &fl1); /* p1 follow is p2 first */
  807. codegen(compst, p1, 0, tt, &fl1);
  808. }
  809. else /* use 'fullset' as follow */
  810. codegen(compst, p1, 0, tt, fullset);
  811. if (fixedlen(p1) != 0) /* can 'p1' consume anything? */
  812. return NOINST; /* invalidate test */
  813. else return tt; /* else 'tt' still protects sib2 */
  814. }
  815. /*
  816. ** Main code-generation function: dispatch to auxiliar functions
  817. ** according to kind of tree. ('needfollow' should return true
  818. ** only for consructions that use 'fl'.)
  819. */
  820. static void codegen (CompileState *compst, TTree *tree, int opt, int tt,
  821. const Charset *fl) {
  822. tailcall:
  823. switch (tree->tag) {
  824. case TChar: codechar(compst, tree->u.n, tt); break;
  825. case TAny: addinstruction(compst, IAny, 0); break;
  826. case TSet: codecharset(compst, treebuffer(tree), tt); break;
  827. case TTrue: break;
  828. case TFalse: addinstruction(compst, IFail, 0); break;
  829. case TChoice: codechoice(compst, sib1(tree), sib2(tree), opt, fl); break;
  830. case TRep: coderep(compst, sib1(tree), opt, fl); break;
  831. case TBehind: codebehind(compst, tree); break;
  832. case TNot: codenot(compst, sib1(tree)); break;
  833. case TAnd: codeand(compst, sib1(tree), tt); break;
  834. case TCapture: codecapture(compst, tree, tt, fl); break;
  835. case TRunTime: coderuntime(compst, tree, tt); break;
  836. case TGrammar: codegrammar(compst, tree); break;
  837. case TCall: codecall(compst, tree); break;
  838. case TSeq: {
  839. tt = codeseq1(compst, sib1(tree), sib2(tree), tt, fl); /* code 'p1' */
  840. /* codegen(compst, p2, opt, tt, fl); */
  841. tree = sib2(tree); goto tailcall;
  842. }
  843. default: assert(0);
  844. }
  845. }
  846. /*
  847. ** Optimize jumps and other jump-like instructions.
  848. ** * Update labels of instructions with labels to their final
  849. ** destinations (e.g., choice L1; ... L1: jmp L2: becomes
  850. ** choice L2)
  851. ** * Jumps to other instructions that do jumps become those
  852. ** instructions (e.g., jump to return becomes a return; jump
  853. ** to commit becomes a commit)
  854. */
  855. static void peephole (CompileState *compst) {
  856. Instruction *code = compst->p->code;
  857. int i;
  858. for (i = 0; i < compst->ncode; i += sizei(&code[i])) {
  859. redo:
  860. switch (code[i].i.code) {
  861. case IChoice: case ICall: case ICommit: case IPartialCommit:
  862. case IBackCommit: case ITestChar: case ITestSet:
  863. case ITestAny: { /* instructions with labels */
  864. jumptothere(compst, i, finallabel(code, i)); /* optimize label */
  865. break;
  866. }
  867. case IJmp: {
  868. int ft = finaltarget(code, i);
  869. switch (code[ft].i.code) { /* jumping to what? */
  870. case IRet: case IFail: case IFailTwice:
  871. case IEnd: { /* instructions with unconditional implicit jumps */
  872. code[i] = code[ft]; /* jump becomes that instruction */
  873. code[i + 1].i.code = IAny; /* 'no-op' for target position */
  874. break;
  875. }
  876. case ICommit: case IPartialCommit:
  877. case IBackCommit: { /* inst. with unconditional explicit jumps */
  878. int fft = finallabel(code, ft);
  879. code[i] = code[ft]; /* jump becomes that instruction... */
  880. jumptothere(compst, i, fft); /* but must correct its offset */
  881. goto redo; /* reoptimize its label */
  882. }
  883. default: {
  884. jumptothere(compst, i, ft); /* optimize label */
  885. break;
  886. }
  887. }
  888. break;
  889. }
  890. default: break;
  891. }
  892. }
  893. assert(code[i - 1].i.code == IEnd);
  894. }
  895. /*
  896. ** Compile a pattern
  897. */
  898. Instruction *compile (lua_State *L, Pattern *p) {
  899. CompileState compst;
  900. compst.p = p; compst.ncode = 0; compst.L = L;
  901. realloccode(L, p, 2); /* minimum initial size */
  902. codegen(&compst, p->tree, 0, NOINST, fullset);
  903. addinstruction(&compst, IEnd, 0);
  904. realloccode(L, p, compst.ncode); /* set final size */
  905. peephole(&compst);
  906. return p->code;
  907. }
  908. /* }====================================================== */