1 files changed, 470 insertions, 0 deletions
diff --git a/contrib/snowball/compiler/tokeniser.c b/contrib/snowball/compiler/tokeniser.c
new file mode 100644
index 000000000..3dae5f744
--- /dev/null
+++ b/contrib/snowball/compiler/tokeniser.c
@@ -0,0 +1,470 @@
+
+#include <stdio.h>   /* stderr etc */
+#include <stdlib.h>  /* malloc free */
+#include <string.h>  /* strlen */
+#include <ctype.h>   /* isalpha etc */
+#include <limits.h>  /* INT_MAX */
+#include "header.h"
+
+struct system_word {
+    int s_size;      /* number of bytes in s; compare_words tests this first */
+    const byte * s;  /* the bytes of the word; name_of_token returns it as a C string */
+    int code;        /* token code given by find_word; vocab[0].code is read as the table bound */
+};
+
+
+/* ASCII collating assumed in syswords.h (included below) */
+
+#include "syswords.h"
+
+/* Return the lesser of a and b. */
+static int smaller(int a, int b) {
+    if (b <= a) return b;
+    return a;
+}
+
+/* Read the whole of the file named by the string p into a new symbol buffer.
+ *
+ * On success the buffer is returned and *p_file receives the file name as
+ * the C string made by b_to_s(); both are handed to the caller.  If the file
+ * cannot be opened, that string is freed, *p_file is left untouched and 0 is
+ * returned.
+ */
+extern symbol * get_input(symbol * p, char ** p_file) {
+    char * name = b_to_s(p);
+    FILE * in = fopen(name, "r");
+    symbol * text;
+    int len = 0;
+    int ch;
+
+    if (in == 0) {
+        free(name);
+        return 0;
+    }
+    *p_file = name;
+
+    text = create_b(STARTSIZE);
+    while ((ch = getc(in)) != EOF) {
+        /* buffer full: ask for more room, passing half the current length */
+        if (len >= CAPACITY(text)) text = increase_capacity(text, len/2);
+        text[len++] = ch;
+    }
+    fclose(in);
+    SIZE(text) = len;
+    return text;
+}
+
+/* Report a problem at the current file and line on stderr.
+ *
+ * The message is s1, then the first n symbols of p, then s2; any of s1, p
+ * and s2 may be 0 to leave that part out.  Each call adds one to
+ * t->error_count.  A call made when the count already stands at 20 prints
+ * a closing line and exits with status 1 instead.
+ */
+static void error(struct tokeniser * t, char * s1, int n, symbol * p, char * s2) {
+    if (t->error_count == 20) {
+        fprintf(stderr, "... etc\n");
+        exit(1);
+    }
+    fprintf(stderr, "%s:%d: ", t->file, t->line_number);
+    if (s1 != 0) fprintf(stderr, "%s", s1);
+    if (p != 0) {
+        int k = 0;
+        while (k < n) {
+            fprintf(stderr, "%c", p[k]);
+            k++;
+        }
+    }
+    if (s2 != 0) fprintf(stderr, "%s", s2);
+    fprintf(stderr, "\n");
+    t->error_count++;
+}
+
+static void error1(struct tokeniser * t, char * s) {
+    error(t, s, 0,0, 0);
+}
+
+/* Report that the input ended too soon; s names what was being read. */
+static void error2(struct tokeniser * t, char * s) {
+    char * prefix = "unexpected end of text after ";
+    error(t, prefix, 0, NULL, s);
+}
+
+/* Order the m-symbol word p against the n-byte word q.
+ *
+ * Words of different length order by length (the result is m - n).  Words of
+ * equal length order by their first differing character.  The result is
+ * negative, zero or positive accordingly; zero means the words are equal.
+ */
+static int compare_words(int m, symbol * p, int n, const byte * q) {
+    int k;
+    if (m != n) return m - n;
+    for (k = 0; k < n; k++) {
+        if (p[k] != q[k]) return p[k] - q[k];
+    }
+    return 0;
+}
+
+/* Look up the n-symbol word at p in the system word table vocab.
+ *
+ * This is a binary search under the ordering of compare_words(), with
+ * vocab[0].code as the initial upper bound.  Returns the code of the
+ * matching entry, or -1 when the word is not in the table.
+ */
+static int find_word(int n, symbol * p) {
+    int lo = 0;
+    int hi = vocab->code;
+    do {
+        int mid = lo + (hi - lo)/2;
+        const struct system_word * entry = &vocab[mid];
+        int order = compare_words(n, p, entry->s_size, entry->s);
+        if (order == 0) return entry->code;
+        if (order < 0) hi = mid; else lo = mid;
+    } while (hi - lo != 1);
+    return -1;
+}
+
+/* Convert the n decimal digits at p to an int.
+ *
+ * The only caller, next_token, passes a run of isdigit() characters, so the
+ * digits are not validated here.  A value too large for an int saturates at
+ * INT_MAX; letting 10*x + digit overflow would be undefined behaviour for a
+ * signed type.
+ */
+static int get_number(int n, symbol * p) {
+    int x = 0;
+    int i;
+    for (i = 0; i < n; i++) {
+        int digit = p[i] - '0';
+        /* 10*x + digit <= INT_MAX  <=>  x <= (INT_MAX - digit) / 10 */
+        if (x > (INT_MAX - digit) / 10) return INT_MAX;
+        x = 10*x + digit;
+    }
+    return x;
+}
+
+/* If the text at the cursor t->c starts with the C string s, step the cursor
+ * past it and return true; otherwise return false and leave the cursor alone.
+ */
+static int eq_s(struct tokeniser * t, char * s) {
+    int len = strlen(s);
+    int k = 0;
+    if (SIZE(t->p) - t->c < len) return false;
+    while (k < len) {
+        if (t->p[t->c + k] != s[k]) return false;
+        k++;
+    }
+    t->c += len;
+    return true;
+}
+
+/* Tell whether ch is a newline, carriage return, tab or space.
+ *
+ * Side effect: a newline also adds one to t->line_number.
+ */
+static int white_space(struct tokeniser * t, int ch) {
+    if (ch == '\n') {
+        t->line_number++;
+        return true;
+    }
+    if (ch == '\r' || ch == '\t' || ch == ' ') return true;
+    return false;
+}
+
+/* Search the string macro list t->m_pairs for a name equal to the n symbols
+ * at p.  Returns the value stored with the first match, or 0 if none matches.
+ */
+static symbol * find_in_m(struct tokeniser * t, int n, symbol * p) {
+    struct m_pair * pair;
+    for (pair = t->m_pairs; pair != 0; pair = pair->next) {
+        if (n != SIZE(pair->name)) continue;
+        if (memcmp(pair->name, p, n * sizeof(symbol)) == 0) return pair->value;
+    }
+    return 0;
+}
+/* Read the body of a literal string into t->b.
+ *
+ * c is the index of the first symbol after the opening quote.  The return
+ * value is the index at which scanning stopped: just past the closing quote
+ * on success, or wherever an error was reported (input exhausted, or a
+ * newline met inside the string).
+ *
+ * Text between t->m_start and t->m_end is an escape.  A name found by
+ * find_in_m is replaced by its value; failing that, a single quote character
+ * or a single m_start stands for itself; anything else is reported as an
+ * undeclared macro.  An escape holding a newline and otherwise only white
+ * space adds nothing to the string.
+ */
+static int read_literal_string(struct tokeniser * t, int c) {
+    symbol * p = t->p;
+    int ch;
+    SIZE(t->b) = 0; /* the result starts empty */
+    repeat {
+        if (c >= SIZE(p)) { error2(t, "'"); return c; }
+        ch = p[c];
+        if (ch == '\n') { error1(t, "string not terminated"); return c; }
+        c++;
+        if (ch == t->m_start) {
+            int c0 = c; /* index of the first symbol after m_start */
+            int newlines = false; /* no newlines as yet */
+            int black_found = false; /* no printing chars as yet */
+            repeat {
+                if (c >= SIZE(p)) { error2(t, "'"); return c; }
+                ch = p[c]; c++;
+                if (ch == t->m_end) break;
+                unless (white_space(t, ch)) black_found = true; /* white_space also counts the line */
+                if (ch == '\n') newlines = true;
+                if (newlines && black_found) {
+                    error1(t, "string not terminated");
+                    return c;
+                }
+            }
+            unless (newlines) {
+                int n = c - c0 - 1;    /* macro size */
+                int firstch = p[c0];
+                symbol * q = find_in_m(t, n, p + c0);
+                if (q == 0) {
+                    if (n == 1 && (firstch == '\'' || firstch == t->m_start))
+                        t->b = add_to_b(t->b, 1, p + c0); /* escaped quote or m_start: taken literally */
+                    else
+                        error(t, "string macro '", n, p + c0, "' undeclared");
+                } else
+                    t->b = add_to_b(t->b, SIZE(q), q); /* splice in the macro value */
+            }
+        } else {
+            if (ch == '\'') return c; /* closing quote: c is already past it */
+            t->b = add_to_b(t->b, 1, p + c - 1); /* ordinary character */
+        }
+    }
+}
+/* Scan one raw token from t->p, starting at index t->c.
+ *
+ * White space is skipped (white_space keeps t->line_number up to date).
+ * Returns the token code with t->c left just past the token, or -1 with
+ * t->c at the end of the text when the input is exhausted.
+ *
+ *   - a letter starts a word of letters, digits and underscores; a word not
+ *     found by find_word is put in t->b with move_to_b and returned as c_name
+ *   - a digit starts a number; its value goes in t->number
+ *   - a quote starts a literal string, read by read_literal_string
+ *   - anything else is tried as a two-symbol, then a one-symbol, system word
+ *
+ * A symbol matching none of these is reported as unknown and skipped.
+ */
+static int next_token(struct tokeniser * t) {
+    symbol * p = t->p;
+    int c = t->c;
+    int ch;
+    int code = -1;
+    repeat {
+        if (c >= SIZE(p)) { t->c = c; return -1; }
+        ch = p[c];
+        if (white_space(t, ch)) { c++; continue; }
+        if (isalpha(ch)) {
+            int c0 = c; /* start of the word */
+            while (c < SIZE(p) && (isalnum(p[c]) || p[c] == '_')) c++;
+            code = find_word(c - c0, p + c0);
+            if (code < 0) {
+                t->b = move_to_b(t->b, c - c0, p + c0);
+                code = c_name;
+            }
+        } else
+        if (isdigit(ch)) {
+            int c0 = c; /* start of the digit run */
+            while (c < SIZE(p) && isdigit(p[c])) c++;
+            t->number = get_number(c - c0, p + c0);
+            code = c_number;
+        } else
+        if (ch == '\'') {
+            c = read_literal_string(t, c + 1);
+            code = c_literalstring;
+        } else
+        {
+            int lim = smaller(2, SIZE(p) - c); /* at most 2 symbols, fewer at the end of the text */
+            int i;
+            for (i = lim; i > 0; i--) { /* longest candidate first */
+                code = find_word(i, p + c);
+                if (code >= 0) { c += i; break; }
+            }
+        }
+        if (code >= 0) {
+            t->c = c;
+            return code;
+        }
+        error(t, "'", 1, p + c, "' unknown");
+        c++;
+        continue;
+    }
+}
+
+/* Return the symbol at the cursor and step past it, or return -1 at the end
+ * of the text (the cursor then stays where it is).
+ */
+static int next_char(struct tokeniser * t) {
+    int ch = -1;
+    if (t->c < SIZE(t->p)) {
+        ch = t->p[t->c];
+        t->c++;
+    }
+    return ch;
+}
+
+/* Return the next character that white_space() does not accept, consuming
+ * it and any white space before it.  Returns -1 at the end of the text.
+ */
+static int next_real_char(struct tokeniser * t) {
+    int ch;
+    do {
+        ch = next_char(t);
+    } while (white_space(t, ch));
+    return ch;
+}
+
+/* Read the name that follows a stringdef into t->b2.
+ *
+ * Leading white space is skipped; the name is then every symbol up to the
+ * next white space character or the end of the text.  A terminating white
+ * space character is consumed.  If the text ends before any name starts, an
+ * error is reported and t->b2 is left as it was.
+ */
+static void read_chars(struct tokeniser * t) {
+    int ch = next_real_char(t);
+    if (ch < 0) { error2(t, "stringdef"); return; }
+    {
+        int c0 = t->c - 1;  /* index of the first symbol of the name */
+        int c1;             /* index just past the last symbol of the name */
+        repeat {
+            /* Note the position before reading: next_char() does not move
+             * the cursor at the end of the text, so measuring the name from
+             * t->c afterwards would lose its last symbol in that case. */
+            c1 = t->c;
+            ch = next_char(t);
+            if (ch < 0 || white_space(t, ch)) break;
+        }
+        t->b2 = move_to_b(t->b2, c1 - c0, t->p + c0);
+    }
+}
+
+/* Value of the decimal digit ch, or -1 if ch is not a digit. */
+static int decimal_to_num(int ch) {
+    return (ch >= '0' && ch <= '9') ? ch - '0' : -1;
+}
+
+/* Value of the hex digit ch, or -1 if ch is not one.  Only the lower case
+ * letters a to f are accepted.
+ */
+static int hex_to_num(int ch) {
+    if (ch >= 'a' && ch <= 'f') return 10 + (ch - 'a');
+    return (ch >= '0' && ch <= '9') ? ch - '0' : -1;
+}
+
+/* Replace the text of p, a list of numbers separated by spaces, by the
+ * characters those numbers denote.  The conversion is done in place.
+ *
+ * base is 10 for decimal; any other value is treated as hex (upper or lower
+ * case digits).  Each number must fit in 0xffff when t->widechars or t->utf8
+ * is set, and in 0xff otherwise.  With t->utf8 set each value is written by
+ * put_utf8, otherwise as a single symbol.
+ *
+ * On a bad digit or an out-of-range value an error is reported and the
+ * function returns at once, without updating SIZE(p).
+ */
+static void convert_numeric_string(struct tokeniser * t, symbol * p, int base) {
+    int wide = t->widechars || t->utf8;
+    int limit = wide ? 0xffff : 0xff;
+    int c = 0;  /* read position */
+    int d = 0;  /* write position */
+    repeat {
+        while (c < SIZE(p) && p[c] == ' ') c++;
+        if (c == SIZE(p)) break;
+        {
+            int number = 0;
+            /* Check the bound before looking at p[c], so that nothing past
+             * the end of the string is read. */
+            while (c < SIZE(p) && p[c] != ' ') {
+                int digit;
+                if (base == 10) {
+                    digit = decimal_to_num(p[c]);
+                    if (digit < 0) {
+                        error1(t, "decimal string contains non-digits");
+                        return;
+                    }
+                } else {
+                    digit = hex_to_num(tolower(p[c]));
+                    if (digit < 0) {
+                        error1(t, "hex string contains non-hex characters");
+                        return;
+                    }
+                }
+                /* Once over the limit, stop accumulating: the value stays
+                 * out of range for the check below and cannot overflow an
+                 * int however many digits follow. */
+                if (number <= limit) number = base * number + digit;
+                c++;
+            }
+            if (number > limit) {
+                if (wide)
+                    error1(t, "character values exceed 64K");
+                else
+                    error1(t, "character values exceed 256");
+                return;
+            }
+            if (t->utf8)
+                d += put_utf8(number, p + d);
+            else
+                p[d++] = number;
+        }
+    }
+    SIZE(p) = d;
+}
+/* Return the next token the caller should see.
+ *
+ * If t->token_held is set, the flag is cleared and t->token is returned
+ * again.  Otherwise tokens are taken from next_token, and the following are
+ * dealt with here instead of being returned:
+ *
+ *   - c_comment1, c_comment2: the comment text is skipped
+ *   - c_stringescapes: the next two non-blank characters become t->m_start
+ *     and t->m_end
+ *   - c_stringdef: a name and a literal string (optionally preceded by
+ *     c_hex or c_decimal) are read and added to the front of t->m_pairs
+ *   - c_get: the named file is loaded, tried first as given and then with
+ *     each entry of t->includes put in front of it, and becomes the current
+ *     input; the interrupted input is saved on t->next
+ *   - the end of an input with a saved one behind it: the saved input is
+ *     restored and scanning carries on there
+ *
+ * For any other token, and for -1 when no saved input remains,
+ * t->previous_token and t->token are updated and the code is returned.
+ * An unterminated slash-star comment sets t->token to -1 and returns -1.
+ * A get that cannot be satisfied, or get nesting deeper than 10, exits the
+ * program with status 1.
+ */
+extern int read_token(struct tokeniser * t) {
+    symbol * p = t->p;
+    int held = t->token_held;
+    t->token_held = false;
+    if (held) return t->token;
+    repeat {
+        int code = next_token(t);
+        switch (code) {
+            case c_comment1: /*  slash-slash comment */
+               while (t->c < SIZE(p) && p[t->c] != '\n') t->c++; /* stop at the newline, do not consume it */
+               continue;
+            case c_comment2: /* slash-star comment */
+               repeat {
+                   if (t->c >= SIZE(p)) {
+                       error1(t, "/* comment not terminated");
+                       t->token = -1;
+                       return -1;
+                   }
+                   if (p[t->c] == '\n') t->line_number++; /* keep the line count right inside the comment */
+                   if (eq_s(t, "*/")) break;
+                   t->c++;
+               }
+               continue;
+            case c_stringescapes:
+               {
+                   int ch1 = next_real_char(t);
+                   int ch2 = next_real_char(t);
+                   if (ch2 < 0)
+                       { error2(t, "stringescapes"); continue; }
+                   if (ch1 == '\'')
+                       { error1(t, "first stringescape cannot be '"); continue; }
+                   t->m_start = ch1;
+                   t->m_end = ch2;
+               }
+               continue;
+            case c_stringdef:
+               {
+                   int base = 0; /* 0: plain string; 10 or 16: string of numbers */
+                   read_chars(t); /* macro name goes to t->b2 */
+                   code = read_token(t);
+                   if (code == c_hex) { base = 16; code = read_token(t); } else
+                   if (code == c_decimal) { base = 10; code = read_token(t); }
+                   unless (code == c_literalstring)
+                       { error1(t, "string omitted after stringdef"); continue; }
+                   if (base > 0) convert_numeric_string(t, t->b, base);
+                   {   NEW(m_pair, q);
+                       q->next = t->m_pairs;
+                       q->name = copy_b(t->b2);
+                       q->value = copy_b(t->b);
+                       t->m_pairs = q;
+                   }
+               }
+               continue;
+            case c_get:
+               code = read_token(t);
+               unless (code == c_literalstring) {
+                   error1(t, "string omitted after get"); continue;
+               }
+               t->get_depth++;
+               if (t->get_depth > 10) {
+                   fprintf(stderr, "get directives go 10 deep. Looping?\n");
+                   exit(1);
+               }
+               {
+                   char * file;
+                   NEW(input, q);
+                   symbol * u = get_input(t->b, &file); /* first try the name exactly as written */
+                   if (u == 0) {
+                       struct include * r = t->includes;
+                       until (r == 0) {
+                           symbol * b = copy_b(r->b);
+                           b = add_to_b(b, SIZE(t->b), t->b); /* entry r->b followed by the name */
+                           u = get_input(b, &file);
+                           lose_b(b);
+                           unless (u == 0) break;
+                           r = r->next;
+                       }
+                   }
+                   if (u == 0) {
+                       error(t, "Can't get '", SIZE(t->b), t->b, "'");
+                       exit(1);
+                   }
+                   memmove(q, t, sizeof(struct input)); /* NOTE(review): presumably struct tokeniser starts with the members of struct input; confirm in header.h */
+                   t->next = q;
+                   t->p = u;
+                   t->c = 0;
+                   t->file = file;
+                   t->line_number = 1;
+               }
+               p = t->p; /* the local copy must follow the switch of input */
+               continue;
+            case -1:
+               unless (t->next == 0) {
+                   lose_b(p); /* release the text of the input that has just ended */
+                   {
+                       struct input * q = t->next;
+                       memmove(t, q, sizeof(struct input)); p = t->p;
+                       FREE(q);
+                   }
+                   t->get_depth--;
+                   continue;
+               }
+               /* drop through */
+            default:
+                t->previous_token = t->token;
+                t->token = code;
+                return code;
+        }
+    }
+}
+
+/* Return a printable name for the token code.
+ *
+ * Entries 1 up to vocab[0].code - 1 of the system word table are searched
+ * first and the spelling of a matching entry is returned.  A few codes that
+ * are not found there have fixed names; anything else gives "?".
+ */
+extern const char * name_of_token(int code) {
+    static const struct { int code; const char * text; } others[] = {
+        { c_mathassign,    "=" },
+        { c_name,          "name" },
+        { c_number,        "number" },
+        { c_literalstring, "literal" },
+        { c_neg,           "neg" },
+        { c_grouping,      "grouping" },
+        { c_call,          "call" },
+        { c_booltest,      "Boolean test" },
+        { -2,              "start of text" },
+        { -1,              "end of text" }
+    };
+    int k = 1;
+    while (k < vocab->code) {
+        const struct system_word * w = &vocab[k];
+        if (w->code == code) return (const char *) w->s;
+        k++;
+    }
+    for (k = 0; k < (int) (sizeof(others) / sizeof(others[0])); k++) {
+        if (others[k].code == code) return others[k].text;
+    }
+    return "?";
+}
+
+/* Allocate a tokeniser that reads the text p, reporting errors against the
+ * file name file.
+ *
+ * Both pointers are stored, not copied.  The token fields start at -2, the
+ * code name_of_token() describes as "start of text", and m_start is -1
+ * until a stringescapes directive sets it.
+ */
+extern struct tokeniser * create_tokeniser(symbol * p, char * file) {
+    NEW(tokeniser, t);
+
+    /* current input */
+    t->p = p;
+    t->c = 0;
+    t->file = file;
+    t->line_number = 1;
+    t->next = 0;
+    t->get_depth = 0;
+
+    /* working buffers */
+    t->b = create_b(0);
+    t->b2 = create_b(0);
+
+    /* string escapes and macros */
+    t->m_start = -1;
+    t->m_pairs = 0;
+
+    /* token and error state */
+    t->token = -2;
+    t->previous_token = -2;
+    t->token_held = false;
+    t->error_count = 0;
+    return t;
+}
+
+/* Release a tokeniser: its two working buffers, every string macro with its
+ * name and value, any saved input records still on t->next, the file name
+ * t->file, and the tokeniser itself.  The text t->p is not freed here.
+ */
+extern void close_tokeniser(struct tokeniser * t) {
+    struct m_pair * pair = t->m_pairs;
+    struct input * saved = t->next;
+
+    lose_b(t->b);
+    lose_b(t->b2);
+    while (pair != 0) {
+        struct m_pair * following = pair->next;
+        lose_b(pair->name);
+        lose_b(pair->value);
+        FREE(pair);
+        pair = following;
+    }
+    while (saved != 0) {
+        struct input * following = saved->next;
+        FREE(saved);
+        saved = following;
+    }
+    free(t->file);
+    FREE(t);
+}