Nevar pievienot vairāk kā 25 tēmas Tēmai ir jāsākas ar burtu vai ciparu, tā var saturēt domu zīmes ('-') un var būt līdz 35 simboliem gara.

tokeniser.c 14KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470
  #include <stdio.h>   /* stderr etc */
  #include <stdlib.h>  /* malloc free */
  #include <string.h>  /* strlen */
  #include <ctype.h>   /* isalpha etc */
  #include <limits.h>  /* INT_MAX */
  #include "header.h"
  6. struct system_word {
  7. int s_size; /* size of system word */
  8. const byte * s; /* pointer to the system word */
  9. int code; /* its internal code */
  10. };
  11. /* ASCII collating assumed in syswords.c */
  12. #include "syswords.h"
  /* Return the lesser of a and b (b when they are equal). */
  static int smaller(int a, int b) {
      if (a < b) return a;
      return b;
  }
  14. extern symbol * get_input(symbol * p, char ** p_file) {
  15. char * s = b_to_s(p);
  16. {
  17. FILE * input = fopen(s, "r");
  18. if (input == 0) { free(s); return 0; }
  19. *p_file = s;
  20. {
  21. symbol * u = create_b(STARTSIZE);
  22. int size = 0;
  23. repeat
  24. { int ch = getc(input);
  25. if (ch == EOF) break;
  26. if (size >= CAPACITY(u)) u = increase_capacity(u, size/2);
  27. u[size++] = ch;
  28. }
  29. fclose(input);
  30. SIZE(u) = size; return u;
  31. }
  32. }
  33. }
  34. static void error(struct tokeniser * t, char * s1, int n, symbol * p, char * s2) {
  35. if (t->error_count == 20) { fprintf(stderr, "... etc\n"); exit(1); }
  36. fprintf(stderr, "%s:%d: ", t->file, t->line_number);
  37. unless (s1 == 0) fprintf(stderr, "%s", s1);
  38. unless (p == 0) {
  39. int i;
  40. for (i = 0; i < n; i++) fprintf(stderr, "%c", p[i]);
  41. }
  42. unless (s2 == 0) fprintf(stderr, "%s", s2);
  43. fprintf(stderr, "\n");
  44. t->error_count++;
  45. }
  46. static void error1(struct tokeniser * t, char * s) {
  47. error(t, s, 0,0, 0);
  48. }
  49. static void error2(struct tokeniser * t, char * s) {
  50. error(t, "unexpected end of text after ", 0,0, s);
  51. }
  52. static int compare_words(int m, symbol * p, int n, const byte * q) {
  53. unless (m == n) return m - n;
  54. {
  55. int i; for (i = 0; i < n; i++) {
  56. int diff = p[i] - q[i];
  57. unless (diff == 0) return diff;
  58. }
  59. }
  60. return 0;
  61. }
  62. static int find_word(int n, symbol * p) {
  63. int i = 0; int j = vocab->code;
  64. repeat {
  65. int k = i + (j - i)/2;
  66. const struct system_word * w = vocab + k;
  67. int diff = compare_words(n, p, w->s_size, w->s);
  68. if (diff == 0) return w->code;
  69. if (diff < 0) j = k; else i = k;
  70. if (j - i == 1) break;
  71. }
  72. return -1;
  73. }
  74. static int get_number(int n, symbol * p) {
  75. int x = 0;
  76. int i; for (i = 0; i < n; i++) x = 10*x + p[i] - '0';
  77. return x;
  78. }
  79. static int eq_s(struct tokeniser * t, char * s) {
  80. int l = strlen(s);
  81. if (SIZE(t->p) - t->c < l) return false;
  82. {
  83. int i;
  84. for (i = 0; i < l; i++) if (t->p[t->c + i] != s[i]) return false;
  85. }
  86. t->c += l; return true;
  87. }
  88. static int white_space(struct tokeniser * t, int ch) {
  89. switch (ch) {
  90. case '\n': t->line_number++;
  91. case '\r':
  92. case '\t':
  93. case ' ': return true;
  94. }
  95. return false;
  96. }
  97. static symbol * find_in_m(struct tokeniser * t, int n, symbol * p) {
  98. struct m_pair * q = t->m_pairs;
  99. repeat {
  100. if (q == 0) return 0;
  101. {
  102. symbol * name = q->name;
  103. if (n == SIZE(name) && memcmp(name, p, n * sizeof(symbol)) == 0) return q->value;
  104. }
  105. q = q->next;
  106. }
  107. }
  108. static int read_literal_string(struct tokeniser * t, int c) {
  109. symbol * p = t->p;
  110. int ch;
  111. SIZE(t->b) = 0;
  112. repeat {
  113. if (c >= SIZE(p)) { error2(t, "'"); return c; }
  114. ch = p[c];
  115. if (ch == '\n') { error1(t, "string not terminated"); return c; }
  116. c++;
  117. if (ch == t->m_start) {
  118. int c0 = c;
  119. int newlines = false; /* no newlines as yet */
  120. int black_found = false; /* no printing chars as yet */
  121. repeat {
  122. if (c >= SIZE(p)) { error2(t, "'"); return c; }
  123. ch = p[c]; c++;
  124. if (ch == t->m_end) break;
  125. unless (white_space(t, ch)) black_found = true;
  126. if (ch == '\n') newlines = true;
  127. if (newlines && black_found) {
  128. error1(t, "string not terminated");
  129. return c;
  130. }
  131. }
  132. unless (newlines) {
  133. int n = c - c0 - 1; /* macro size */
  134. int firstch = p[c0];
  135. symbol * q = find_in_m(t, n, p + c0);
  136. if (q == 0) {
  137. if (n == 1 && (firstch == '\'' || firstch == t->m_start))
  138. t->b = add_to_b(t->b, 1, p + c0);
  139. else
  140. error(t, "string macro '", n, p + c0, "' undeclared");
  141. } else
  142. t->b = add_to_b(t->b, SIZE(q), q);
  143. }
  144. } else {
  145. if (ch == '\'') return c;
  146. t->b = add_to_b(t->b, 1, p + c - 1);
  147. }
  148. }
  149. }
  150. static int next_token(struct tokeniser * t) {
  151. symbol * p = t->p;
  152. int c = t->c;
  153. int ch;
  154. int code = -1;
  155. repeat {
  156. if (c >= SIZE(p)) { t->c = c; return -1; }
  157. ch = p[c];
  158. if (white_space(t, ch)) { c++; continue; }
  159. if (isalpha(ch)) {
  160. int c0 = c;
  161. while (c < SIZE(p) && (isalnum(p[c]) || p[c] == '_')) c++;
  162. code = find_word(c - c0, p + c0);
  163. if (code < 0) {
  164. t->b = move_to_b(t->b, c - c0, p + c0);
  165. code = c_name;
  166. }
  167. } else
  168. if (isdigit(ch)) {
  169. int c0 = c;
  170. while (c < SIZE(p) && isdigit(p[c])) c++;
  171. t->number = get_number(c - c0, p + c0);
  172. code = c_number;
  173. } else
  174. if (ch == '\'') {
  175. c = read_literal_string(t, c + 1);
  176. code = c_literalstring;
  177. } else
  178. {
  179. int lim = smaller(2, SIZE(p) - c);
  180. int i;
  181. for (i = lim; i > 0; i--) {
  182. code = find_word(i, p + c);
  183. if (code >= 0) { c += i; break; }
  184. }
  185. }
  186. if (code >= 0) {
  187. t->c = c;
  188. return code;
  189. }
  190. error(t, "'", 1, p + c, "' unknown");
  191. c++;
  192. continue;
  193. }
  194. }
  195. static int next_char(struct tokeniser * t) {
  196. if (t->c >= SIZE(t->p)) return -1;
  197. return t->p[t->c++];
  198. }
  /* Return the next character that is not white space, or -1 at end of
   * text. */
  static int next_real_char(struct tokeniser * t) {
      int ch;
      do {
          ch = next_char(t);
      } while (white_space(t, ch));
      return ch;
  }
  206. static void read_chars(struct tokeniser * t) {
  207. int ch = next_real_char(t);
  208. if (ch < 0) { error2(t, "stringdef"); return; }
  209. {
  210. int c0 = t->c-1;
  211. repeat {
  212. ch = next_char(t);
  213. if (white_space(t, ch) || ch < 0) break;
  214. }
  215. t->b2 = move_to_b(t->b2, t->c - c0 - 1, t->p + c0);
  216. }
  217. }
  /* Value of the decimal digit ch, or -1 if ch is not '0'..'9'. */
  static int decimal_to_num(int ch) {
      return (ch >= '0' && ch <= '9') ? ch - '0' : -1;
  }
  /* Value of the hexadecimal digit ch, or -1 if ch is not one.
   * Only '0'..'9' and lower-case 'a'..'f' are accepted. */
  static int hex_to_num(int ch) {
      if (ch >= 'a' && ch <= 'f') return 10 + (ch - 'a');
      return (ch >= '0' && ch <= '9') ? ch - '0' : -1;
  }
  227. static void convert_numeric_string(struct tokeniser * t, symbol * p, int base) {
  228. int c = 0; int d = 0;
  229. repeat {
  230. while (c < SIZE(p) && p[c] == ' ') c++;
  231. if (c == SIZE(p)) break;
  232. {
  233. int number = 0;
  234. repeat {
  235. int ch = p[c];
  236. if (c == SIZE(p) || ch == ' ') break;
  237. if (base == 10) {
  238. ch = decimal_to_num(ch);
  239. if (ch < 0) {
  240. error1(t, "decimal string contains non-digits");
  241. return;
  242. }
  243. } else {
  244. ch = hex_to_num(tolower(ch));
  245. if (ch < 0) {
  246. error1(t, "hex string contains non-hex characters");
  247. return;
  248. }
  249. }
  250. number = base * number + ch;
  251. c++;
  252. }
  253. if (t->widechars || t->utf8) {
  254. unless (0 <= number && number <= 0xffff) {
  255. error1(t, "character values exceed 64K");
  256. return;
  257. }
  258. } else {
  259. unless (0 <= number && number <= 0xff) {
  260. error1(t, "character values exceed 256");
  261. return;
  262. }
  263. }
  264. if (t->utf8)
  265. d += put_utf8(number, p + d);
  266. else
  267. p[d++] = number;
  268. }
  269. }
  270. SIZE(p) = d;
  271. }
  272. extern int read_token(struct tokeniser * t) {
  273. symbol * p = t->p;
  274. int held = t->token_held;
  275. t->token_held = false;
  276. if (held) return t->token;
  277. repeat {
  278. int code = next_token(t);
  279. switch (code) {
  280. case c_comment1: /* slash-slash comment */
  281. while (t->c < SIZE(p) && p[t->c] != '\n') t->c++;
  282. continue;
  283. case c_comment2: /* slash-star comment */
  284. repeat {
  285. if (t->c >= SIZE(p)) {
  286. error1(t, "/* comment not terminated");
  287. t->token = -1;
  288. return -1;
  289. }
  290. if (p[t->c] == '\n') t->line_number++;
  291. if (eq_s(t, "*/")) break;
  292. t->c++;
  293. }
  294. continue;
  295. case c_stringescapes:
  296. {
  297. int ch1 = next_real_char(t);
  298. int ch2 = next_real_char(t);
  299. if (ch2 < 0)
  300. { error2(t, "stringescapes"); continue; }
  301. if (ch1 == '\'')
  302. { error1(t, "first stringescape cannot be '"); continue; }
  303. t->m_start = ch1;
  304. t->m_end = ch2;
  305. }
  306. continue;
  307. case c_stringdef:
  308. {
  309. int base = 0;
  310. read_chars(t);
  311. code = read_token(t);
  312. if (code == c_hex) { base = 16; code = read_token(t); } else
  313. if (code == c_decimal) { base = 10; code = read_token(t); }
  314. unless (code == c_literalstring)
  315. { error1(t, "string omitted after stringdef"); continue; }
  316. if (base > 0) convert_numeric_string(t, t->b, base);
  317. { NEW(m_pair, q);
  318. q->next = t->m_pairs;
  319. q->name = copy_b(t->b2);
  320. q->value = copy_b(t->b);
  321. t->m_pairs = q;
  322. }
  323. }
  324. continue;
  325. case c_get:
  326. code = read_token(t);
  327. unless (code == c_literalstring) {
  328. error1(t, "string omitted after get"); continue;
  329. }
  330. t->get_depth++;
  331. if (t->get_depth > 10) {
  332. fprintf(stderr, "get directives go 10 deep. Looping?\n");
  333. exit(1);
  334. }
  335. {
  336. char * file;
  337. NEW(input, q);
  338. symbol * u = get_input(t->b, &file);
  339. if (u == 0) {
  340. struct include * r = t->includes;
  341. until (r == 0) {
  342. symbol * b = copy_b(r->b);
  343. b = add_to_b(b, SIZE(t->b), t->b);
  344. u = get_input(b, &file);
  345. lose_b(b);
  346. unless (u == 0) break;
  347. r = r->next;
  348. }
  349. }
  350. if (u == 0) {
  351. error(t, "Can't get '", SIZE(t->b), t->b, "'");
  352. exit(1);
  353. }
  354. memmove(q, t, sizeof(struct input));
  355. t->next = q;
  356. t->p = u;
  357. t->c = 0;
  358. t->file = file;
  359. t->line_number = 1;
  360. }
  361. p = t->p;
  362. continue;
  363. case -1:
  364. unless (t->next == 0) {
  365. lose_b(p);
  366. {
  367. struct input * q = t->next;
  368. memmove(t, q, sizeof(struct input)); p = t->p;
  369. FREE(q);
  370. }
  371. t->get_depth--;
  372. continue;
  373. }
  374. /* drop through */
  375. default:
  376. t->previous_token = t->token;
  377. t->token = code;
  378. return code;
  379. }
  380. }
  381. }
  382. extern const char * name_of_token(int code) {
  383. int i;
  384. for (i = 1; i < vocab->code; i++)
  385. if ((vocab + i)->code == code) return (const char *)(vocab + i)->s;
  386. switch (code) {
  387. case c_mathassign: return "=";
  388. case c_name: return "name";
  389. case c_number: return "number";
  390. case c_literalstring:return "literal";
  391. case c_neg: return "neg";
  392. case c_grouping: return "grouping";
  393. case c_call: return "call";
  394. case c_booltest: return "Boolean test";
  395. case -2: return "start of text";
  396. case -1: return "end of text";
  397. default: return "?";
  398. }
  399. }
  400. extern struct tokeniser * create_tokeniser(symbol * p, char * file) {
  401. NEW(tokeniser, t);
  402. t->next = 0;
  403. t->p = p;
  404. t->c = 0;
  405. t->file = file;
  406. t->line_number = 1;
  407. t->b = create_b(0);
  408. t->b2 = create_b(0);
  409. t->m_start = -1;
  410. t->m_pairs = 0;
  411. t->get_depth = 0;
  412. t->error_count = 0;
  413. t->token_held = false;
  414. t->token = -2;
  415. t->previous_token = -2;
  416. return t;
  417. }
  418. extern void close_tokeniser(struct tokeniser * t) {
  419. lose_b(t->b);
  420. lose_b(t->b2);
  421. {
  422. struct m_pair * q = t->m_pairs;
  423. until (q == 0) {
  424. struct m_pair * q_next = q->next;
  425. lose_b(q->name);
  426. lose_b(q->value);
  427. FREE(q);
  428. q = q_next;
  429. }
  430. }
  431. {
  432. struct input * q = t->next;
  433. until (q == 0) {
  434. struct input * q_next = q->next;
  435. FREE(q);
  436. q = q_next;
  437. }
  438. }
  439. free(t->file);
  440. FREE(t);
  441. }