Implement strings parser.

author Vsevolod Stakhov <vsevolod@highsecure.ru>

Tue, 6 Aug 2013 16:31:21 +0000 (17:31 +0100)

committer Vsevolod Stakhov <vsevolod@highsecure.ru>

Tue, 6 Aug 2013 16:31:21 +0000 (17:31 +0100)
author Vsevolod Stakhov <vsevolod@highsecure.ru>
Tue, 6 Aug 2013 16:31:21 +0000 (17:31 +0100)
committer Vsevolod Stakhov <vsevolod@highsecure.ru>
Tue, 6 Aug 2013 16:31:21 +0000 (17:31 +0100)
diff --git a/src/rcl/rcl.h b/src/rcl/rcl.h

index 76f9523fb797defb7be7632ac269a23296ae2153..54c3c81e54971d80a1fc845658e28981361098a8 100644 (file)
--- a/src/rcl/rcl.h
+++ b/src/rcl/rcl.h
@@ -38,7 +38,8 @@ enum rspamd_cl_error {
         RSPAMD_CL_EOK = 0,
         RSPAMD_CL_ESYNTAX,
         RSPAMD_CL_EIO,
-       RSPAMD_CL_ESTATE
+       RSPAMD_CL_ESTATE,
+       RSPAMD_CL_ENESTED
  };
  
  enum rspamd_cl_type {
@@ -57,10 +58,12 @@ enum rspamd_cl_emitter {
  };
  
  typedef struct rspamd_cl_object_s {
+       gchar *key;                                                             /**< the key of an object */
         union {
                 gint64 iv;                                                      /**< int value of an object */
                 gchar *sv;                                                      /**< string value of an object */
                 gdouble dv;                                                     /**< double value of an object */
+               struct rspamd_cl_object_s *ov;          /**< array or hash                      */
         } value;
         enum rspamd_cl_type type;                               /**< real type                          */
         struct rspamd_cl_object_s *next;                /**< array handle                       */
diff --git a/src/rcl/rcl_internal.h b/src/rcl/rcl_internal.h

index 48047b19f43845ee83264cdec9375fcea39eb958..33256119ddf1e1444225aa4aee62f4544aa657b1 100644 (file)
--- a/src/rcl/rcl_internal.h
+++ b/src/rcl/rcl_internal.h
@@ -64,16 +64,24 @@ struct rspamd_cl_stack {
         struct rspamd_cl_stack *next;
  };
  
+struct rspamd_cl_chunk {
+       const guchar *begin;
+       const guchar *end;
+       const guchar *pos;
+       gsize remain;
+       guint line;
+       guint column;
+       struct rspamd_cl_chunk *next;
+};
+
  struct rspamd_cl_parser {
         enum rspamd_cl_parser_state state;
         enum rspamd_cl_parser_state prev_state;
-       gint comments_nested;
         rspamd_cl_object_t *top_obj;
         rspamd_cl_object_t *cur_obj;
         struct rspamd_cl_macro *macroes;
         struct rspamd_cl_stack *stack;
-       guint line;
-       guint column;
+       struct rspamd_cl_chunk *chunks;
  };
  
  #endif /* RCL_INTERNAL_H_ */
diff --git a/src/rcl/rcl_parser.c b/src/rcl/rcl_parser.c

index 8ace95cf03dda185022cc85db3c537d1dfffe2f7..f727991de1d13f71bb3a1bfc3fc2cec5c5a3d5ec 100644 (file)
--- a/src/rcl/rcl_parser.c
+++ b/src/rcl/rcl_parser.c
@@ -24,6 +24,7 @@
  #include "config.h"
  #include "rcl.h"
  #include "rcl_internal.h"
+#include "util.h"
  
  /**
   * @file rcl_parser.c
@@ -47,67 +48,93 @@ rspamd_cl_object_new (void)
   * @param len
   * @return new position in chunk
   */
-static inline const guchar *
-rspamd_cl_chunk_getc (struct rspamd_cl_parser *parser, const guchar *begin, gsize len)
+static inline void
+rspamd_cl_chunk_skipc (struct rspamd_cl_chunk *chunk, guchar c)
  {
-       while (len > 0) {
-               len --;
-               if (*begin == '\n') {
-                       parser->line ++;
-                       parser->column = 0;
-               }
-               else {
-                       parser->column ++;
-               }
-               begin ++;
+       if (c == '\n') {
+               chunk->line ++;
+               chunk->column = 0;
+       }
+       else {
+               chunk->column ++;
         }
-       return begin;
+
+       chunk->pos ++;
+       chunk->remain --;
+}
+
+static inline void
+rspamd_cl_set_err (struct rspamd_cl_chunk *chunk, gint code, const char *str, GError **err)
+{
+       g_set_error (err, RCL_ERROR, code, "Error detected on line %d at pos %d: '%s'",
+                       chunk->line, chunk->column, str);
  }
  
  static gboolean
-rspamd_cl_check_open_comment (struct rspamd_cl_parser *parser, const guchar **begin, gsize *len)
+rspamd_cl_skip_comments (struct rspamd_cl_parser *parser, GError **err)
  {
-       const guchar *p = *begin;
+       struct rspamd_cl_chunk *chunk = parser->chunks;
+       const guchar *p;
+       gint comments_nested = 0;
+
+       p = chunk->pos;
  
         if (*p == '#') {
                 if (parser->state != RSPAMD_RCL_STATE_SCOMMENT &&
                                 parser->state != RSPAMD_RCL_STATE_MCOMMENT) {
-                       parser->prev_state = parser->state;
-                       parser->state = RSPAMD_RCL_STATE_SCOMMENT;
-                       *begin = rspamd_cl_chunk_getc (parser, *begin, 1);
-                       (*len) --;
-                       return TRUE;
+                       while (p < chunk->end) {
+                               if (*p == '\n') {
+                                       rspamd_cl_chunk_skipc (chunk, *p);
+                                       break;
+                               }
+                               rspamd_cl_chunk_skipc (chunk, *p);
+                               p ++;
+                       }
                 }
         }
-       else if (*p == '/' && *len >= 2) {
+       else if (*p == '/' && chunk->remain >= 2) {
+               p ++;
                 if (*p == '/' && parser->state != RSPAMD_RCL_STATE_SCOMMENT &&
                                 parser->state != RSPAMD_RCL_STATE_MCOMMENT) {
-                       parser->prev_state = parser->state;
-                       parser->state = RSPAMD_RCL_STATE_SCOMMENT;
-                       *begin = rspamd_cl_chunk_getc (parser, *begin, 2);
-                       (*len) -= 2;
-                       return TRUE;
+                       chunk->pos = p;
+                       while (p < chunk->end) {
+                               if (*p == '\n') {
+                                       rspamd_cl_chunk_skipc (chunk, *p);
+                                       break;
+                               }
+                               rspamd_cl_chunk_skipc (chunk, *p);
+                               p ++;
+                       }
                 }
                 else if (*p == '*') {
-                       /* Multiline comment */
-                       if (parser->state == RSPAMD_RCL_STATE_SCOMMENT) {
-                               /* Immediately finish single line comment and start multiline one */
-                               parser->state = RSPAMD_RCL_STATE_MCOMMENT;
-                               parser->comments_nested ++;
-                       }
-                       else if (parser->state == RSPAMD_RCL_STATE_MCOMMENT) {
-                               parser->comments_nested ++;
+                       comments_nested ++;
+                       chunk->pos = p;
+
+                       while (p < chunk->end) {
+                               if (*p == '*') {
+                                       rspamd_cl_chunk_skipc (chunk, *p);
+                                       p ++;
+                                       rspamd_cl_chunk_skipc (chunk, *p);
+                                       if (*p == '/') {
+                                               comments_nested --;
+                                               if (comments_nested == 0) {
+                                                       break;
+                                               }
+                                       }
+                                       p ++;
+                                       rspamd_cl_chunk_skipc (chunk, *p);
+                               }
+                               rspamd_cl_chunk_skipc (chunk, *p);
+                               p ++;
                         }
-                       else {
-                               parser->prev_state = parser->state;
-                               parser->state = RSPAMD_RCL_STATE_SCOMMENT;
+                       if (comments_nested != 0) {
+                               rspamd_cl_set_err (chunk, RSPAMD_CL_ENESTED, "comments nesting is invalid", err);
+                               return FALSE;
                         }
-                       *begin = rspamd_cl_chunk_getc (parser, *begin, 2);
-                       (*len) -= 2;
                 }
         }
  
-       return FALSE;
+       return TRUE;
  }
  
  /**
@@ -138,91 +165,304 @@ rspamd_cl_includes_handler (const guchar *data, gsize len, gpointer ud, GError *
         return TRUE;
  }
  
-static const guchar *
-rspamd_cl_skip_spaces (struct rspamd_cl_parser *parser, const guchar *data, gsize *len)
+/**
+ * Parse quoted string with possible escapes
+ * @param parser
+ * @param chunk
+ * @param err
+ * @return TRUE if a string has been parsed
+ */
+static gboolean
+rspamd_cl_lex_json_string (struct rspamd_cl_parser *parser,
+               struct rspamd_cl_chunk *chunk, GError **err)
  {
-       const guchar *p, *end;
-
-       p = data;
-       end = data + *len;
+       const guchar *p = chunk->pos;
+       guchar c;
+       gint i;
  
-       if (parser->state == RSPAMD_RCL_STATE_KEY) {
-               /* Skip any space character */
-               while (p < end) {
-                       if (!g_ascii_isspace (*p)) {
-                               break;
+       while (p < chunk->end) {
+               c = *p;
+               if (c < 0x1F) {
+                       /* Unmasked control character */
+                       if (c == '\n') {
+                               rspamd_cl_set_err (chunk, RSPAMD_CL_ESYNTAX, "unexpected newline", err);
                         }
-                       p = rspamd_cl_chunk_getc (parser, p, 1);
-                       (*len) --;
+                       else {
+                               rspamd_cl_set_err (chunk, RSPAMD_CL_ESYNTAX, "unexpected control character", err);
+                       }
+                       return FALSE;
+               }
+               if (c == '\\') {
+                       rspamd_cl_chunk_skipc (chunk, *p);
+                       p ++;
+                       c = *p;
+                       if (p >= chunk->end) {
+                               rspamd_cl_set_err (chunk, RSPAMD_CL_ESYNTAX, "unfinished escape character", err);
+                               return FALSE;
+                       }
+                       if (*p == 'u') {
+                               rspamd_cl_chunk_skipc (chunk, *p);
+                               p ++;
+                               for (i = 0; i < 4 && p < chunk->end; i ++) {
+                                       if (!g_ascii_isxdigit (*p)) {
+                                               rspamd_cl_set_err (chunk, RSPAMD_CL_ESYNTAX, "invalid utf escape", err);
+                                               return FALSE;
+                                       }
+                                       rspamd_cl_chunk_skipc (chunk, *p);
+                                       p ++;
+                               }
+                               if (p >= chunk->end) {
+                                       rspamd_cl_set_err (chunk, RSPAMD_CL_ESYNTAX, "unfinished escape character", err);
+                                       return FALSE;
+                               }
+                       }
+                       else if (c == '"' || c == '\\' || c == '/' || c == 'b' ||
+                                       c == 'f' || c == 'n' || c == 'r' || c == 't') {
+                               rspamd_cl_chunk_skipc (chunk, *p);
+                               p ++;
+                       }
+                       else {
+                               rspamd_cl_set_err (chunk, RSPAMD_CL_ESYNTAX, "invalid escape character", err);
+                               return FALSE;
+                       }
+                       continue;
+               }
+               else if (c == '"') {
+                       return TRUE;
                 }
         }
-       else {
-               while (p < end) {
-                       if (!g_ascii_isspace (*p) || *p == '\n' || *p == '\r') {
+
+       return FALSE;
+}
+
+/**
+ * Unescape json string inplace
+ * @param str
+ */
+static void
+rspamd_cl_unescape_json_string (gchar *str)
+{
+       gchar *t = str, *h = str;
+       gint i, uval;
+
+       /* t is target (tortoise), h is source (hare) */
+
+       while (*h != '\0') {
+               if (*h == '\\') {
+                       h ++;
+                       switch (*h) {
+                       case 'n':
+                               *t++ = '\n';
+                               break;
+                       case 'r':
+                               *t++ = '\r';
+                               break;
+                       case 'b':
+                               *t++ = '\b';
+                               break;
+                       case 't':
+                               *t++ = '\t';
+                               break;
+                       case 'f':
+                               *t++ = '\f';
+                               break;
+                       case '\\':
+                               *t++ = '\\';
+                               break;
+                       case '"':
+                               *t++ = '"';
+                               break;
+                       case 'u':
+                               /* Unicode escape */
+                               uval = 0;
+                               for (i = 0; i < 4; i++) {
+                                       uval <<= 4;
+                                       if (g_ascii_isdigit (h[i])) {
+                                               uval += h[i] - '0';
+                                       }
+                                       else if (h[i] >= 'a' && h[i] <= 'f') {
+                                               uval += h[i] - 'a' + 10;
+                                       }
+                                       else if (h[i] >= 'A' && h[i] <= 'F') {
+                                               uval += h[i] - 'A' + 10;
+                                       }
+                               }
+                               /* Encode */
+                               if(uval < 0x80) {
+                                       t[0] = (char)uval;
+                                       t ++;
+                               }
+                               else if(uval < 0x800) {
+                                       t[0] = 0xC0 + ((uval & 0x7C0) >> 6);
+                                       t[1] = 0x80 + ((uval & 0x03F));
+                                       t += 2;
+                               }
+                               else if(uval < 0x10000) {
+                                       t[0] = 0xE0 + ((uval & 0xF000) >> 12);
+                                       t[1] = 0x80 + ((uval & 0x0FC0) >> 6);
+                                       t[2] = 0x80 + ((uval & 0x003F));
+                                       t += 3;
+                               }
+                               else if(uval <= 0x10FFFF) {
+                                       t[0] = 0xF0 + ((uval & 0x1C0000) >> 18);
+                                       t[1] = 0x80 + ((uval & 0x03F000) >> 12);
+                                       t[2] = 0x80 + ((uval & 0x000FC0) >> 6);
+                                       t[3] = 0x80 + ((uval & 0x00003F));
+                                       t += 4;
+                               }
+                               else {
+                                       *t++ = '?';
+                               }
+                               break;
+                       default:
+                               *t++ = '?';
                                 break;
                         }
-                       p = rspamd_cl_chunk_getc (parser, p, 1);
-                       (*len) --;
+               }
+               else {
+                       *t++ = *h++;
                 }
         }
-
-       return p;
  }
  
+/**
+ * Parse a key in an object
+ * @param parser
+ * @param chunk
+ * @param err
+ * @return TRUE if a key has been parsed
+ */
  static gboolean
-rspamd_cl_parse_key (struct rspamd_cl_parser *parser, const guchar **data,
-               gsize *len, GError **err)
+rspamd_cl_parse_key (struct rspamd_cl_parser *parser,
+               struct rspamd_cl_chunk *chunk, GError **err)
  {
         const guchar *p, *c = NULL, *end;
+       gboolean got_quote = FALSE, got_eq = FALSE, got_semicolon = FALSE;
+       rspamd_cl_object_t *nobj;
  
+       p = chunk->pos;
  
-       p = *data;
-       end = p + *len;
+       /* Skip any spaces */
+       while (p < chunk->end && g_ascii_isspace (*p)) {
+               rspamd_cl_chunk_skipc (chunk, *p);
+               p ++;
+       }
  
-       while (p < end) {
+       while (p < chunk->end) {
                 /*
                  * A key must start with alpha and end with space character
                  */
                 if (*p == '.') {
                         /* It is macro actually */
-                       p = rspamd_cl_chunk_getc (parser, p, 1);
-                       len --;
+                       rspamd_cl_chunk_skipc (chunk, *p);
                         parser->state = RSPAMD_RCL_STATE_MACRO_NAME;
-                       *data = p;
                         return TRUE;
                 }
                 else if (c == NULL) {
                         if (g_ascii_isalpha (*p)) {
                                 /* The first symbol */
                                 c = p;
-                               p = rspamd_cl_chunk_getc (parser, p, 1);
-                               (*len) --;
+                               rspamd_cl_chunk_skipc (chunk, *p);
+                               p ++;
                         }
                         else if (*p == '"') {
                                 /* JSON style key */
                                 c = p + 1;
-                               p = rspamd_cl_chunk_getc (parser, p, 2);
-                               (*len) -= 2;
+                               got_quote = TRUE;
+                               rspamd_cl_chunk_skipc (chunk, *p);
+                               p ++;
                         }
                         else {
                                 /* Invalid identifier */
                                 parser->state = RSPAMD_RCL_STATE_ERROR;
-                               g_set_error (err, RCL_ERROR, RSPAMD_CL_ESYNTAX, "key must start with a letter, "
-                                               "line %d, pos: %d", parser->line, parser->column);
+                               rspamd_cl_set_err (chunk, RSPAMD_CL_ESYNTAX, "key must begin with a letter", err);
                                 return FALSE;
                         }
                 }
                 else {
-                       if (g_ascii_isalnum (*p)) {
-                               p = rspamd_cl_chunk_getc (parser, p, 1);
-                               (*len) --;
+                       /* Parse the body of a key */
+                       if (!got_quote) {
+                               if (g_ascii_isalnum (*p)) {
+                                       rspamd_cl_chunk_skipc (chunk, *p);
+                                       p ++;
+                               }
+                               else if (*p == ' ' || *p == '\t' || *p == ':' || *p == '=') {
+                                       end = p;
+                                       break;
+                               }
+                               else {
+                                       rspamd_cl_set_err (chunk, RSPAMD_CL_ESYNTAX, "invalid character in a key", err);
+                                       return FALSE;
+                               }
                         }
-                       else if (*p == ' ' || *p == '\t') {
-                               p = rspamd_cl_skip_spaces (parser, p, len);
+                       else {
+                               /* We need to parse json like quoted string */
+                               if (!rspamd_cl_lex_json_string (parser, chunk, err)) {
+                                       return FALSE;
+                               }
+                               end = chunk->pos;
+                               p = end;
+                               rspamd_cl_chunk_skipc (chunk, *p);
+                               p ++;
+                               break;
                         }
                 }
         }
-       *data = p;
+
+       if (p >= chunk->end) {
+               rspamd_cl_set_err (chunk, RSPAMD_CL_ESYNTAX, "unfinished key", err);
+               return FALSE;
+       }
+
+       /* We are now at the end of the key, need to parse the rest */
+       while (p < chunk->end) {
+               if (*p == ' ' || *p == '\t') {
+                       rspamd_cl_chunk_skipc (chunk, *p);
+                       p ++;
+               }
+               else if (*p == '=') {
+                       if (!got_eq && !got_semicolon) {
+                               rspamd_cl_chunk_skipc (chunk, *p);
+                               p ++;
+                               got_eq = TRUE;
+                       }
+                       else {
+                               rspamd_cl_set_err (chunk, RSPAMD_CL_ESYNTAX, "unexpected '=' character", err);
+                               return FALSE;
+                       }
+               }
+               else if (*p == ':') {
+                       if (!got_eq && !got_semicolon) {
+                               rspamd_cl_chunk_skipc (chunk, *p);
+                               p ++;
+                               got_semicolon = TRUE;
+                       }
+                       else {
+                               rspamd_cl_set_err (chunk, RSPAMD_CL_ESYNTAX, "unexpected ':' character", err);
+                               return FALSE;
+                       }
+               }
+               else {
+                       /* Start value */
+                       break;
+               }
+       }
+
+       if (p >= chunk->end) {
+               rspamd_cl_set_err (chunk, RSPAMD_CL_ESYNTAX, "unfinished key", err);
+               return FALSE;
+       }
+
+       /* Create a new object */
+       nobj = rspamd_cl_object_new ();
+       nobj->key = g_malloc (end - c + 1);
+       rspamd_strlcpy (nobj->key, c, end - c + 1);
+
+       if (got_quote) {
+               rspamd_cl_unescape_json_string (nobj->key);
+       }
+
+       HASH_ADD_KEYPTR (hh, parser->cur_obj->value.ov, nobj->key, strlen (nobj->key), nobj);
  
         return TRUE;
  }
@@ -236,15 +476,14 @@ rspamd_cl_parse_key (struct rspamd_cl_parser *parser, const guchar **data,
   * @return TRUE if chunk has been parsed and FALSE in case of error
   */
  static gboolean
-rspamd_cl_state_machine (struct rspamd_cl_parser *parser, const guchar *data,
-               gsize len, GError **err)
+rspamd_cl_state_machine (struct rspamd_cl_parser *parser, GError **err)
  {
-       const guchar *p, *end;
         rspamd_cl_object_t *obj;
+       struct rspamd_cl_chunk *chunk = parser->chunks;
+       const guchar *p;
  
-       p = data;
-       end = p + len;
-       while (p < end) {
+       p = chunk->pos;
+       while (chunk->pos < chunk->end) {
                 switch (parser->state) {
                 case RSPAMD_RCL_STATE_INIT:
                         /*
@@ -252,29 +491,33 @@ rspamd_cl_state_machine (struct rspamd_cl_parser *parser, const guchar *data,
                          * if we got [ or { correspondingly or can just treat new data as
                          * a key of newly created object
                          */
-                       if (!rspamd_cl_check_open_comment (parser, &p, &len)) {
+                       if (!rspamd_cl_skip_comments (parser, err)) {
+                               parser->state = RSPAMD_RCL_STATE_ERROR;
+                               return FALSE;
+                       }
+                       else {
                                 obj = rspamd_cl_object_new ();
                                 if (*p == '[') {
                                         parser->state = RSPAMD_RCL_STATE_ARRAY;
                                         obj->type = RSPAMD_CL_ARRAY;
-                                       p = rspamd_cl_chunk_getc (parser, p, 1);
-                                       len --;
+                                       rspamd_cl_chunk_skipc (chunk, *p);
+                                       p ++;
                                 }
                                 else {
                                         parser->state = RSPAMD_RCL_STATE_KEY;
                                         obj->type = RSPAMD_CL_OBJECT;
                                         if (*p == '{') {
-                                               p = rspamd_cl_chunk_getc (parser, p, 1);
-                                               len --;
+                                               rspamd_cl_chunk_skipc (chunk, *p);
+                                               p ++;
                                         }
                                 }
                                 parser->cur_obj = obj;
                                 parser->top_obj = obj;
-                               p = rspamd_cl_skip_spaces (parser, p, &len);
                         }
                         break;
                 case RSPAMD_RCL_STATE_KEY:
-                       if (!rspamd_cl_parse_key (parser, &p, &len, err)) {
+                       if (!rspamd_cl_parse_key (parser, chunk, err)) {
+                               parser->state = RSPAMD_RCL_STATE_ERROR;
                                 return FALSE;
                         }
                         break;
@@ -294,7 +537,6 @@ rspamd_cl_parser_new (void)
  
         new = g_slice_alloc0 (sizeof (struct rspamd_cl_parser));
  
-       new->line = 1;
         rspamd_cl_parser_register_macro (new, "include", rspamd_cl_include_handler, new);
         rspamd_cl_parser_register_macro (new, "includes", rspamd_cl_includes_handler, new);
  
@@ -319,8 +561,18 @@ gboolean
  rspamd_cl_parser_add_chunk (struct rspamd_cl_parser *parser, const guchar *data,
                 gsize len, GError **err)
  {
+       struct rspamd_cl_chunk *chunk;
+
         if (parser->state != RSPAMD_RCL_STATE_ERROR) {
-               return rspamd_cl_state_machine (parser, data, len, err);
+               chunk = g_slice_alloc (sizeof (struct rspamd_cl_chunk));
+               chunk->begin = data;
+               chunk->remain = len;
+               chunk->pos = chunk->begin;
+               chunk->end = chunk->begin + len;
+               chunk->line = 1;
+               chunk->column = 0;
+               LL_PREPEND (parser->chunks, chunk);
+               return rspamd_cl_state_machine (parser, err);
         }
  
         g_set_error (err, RCL_ERROR, RSPAMD_CL_ESTATE, "a parser is in an invalid state");
author	Vsevolod Stakhov <vsevolod@highsecure.ru>
	Tue, 6 Aug 2013 16:31:21 +0000 (17:31 +0100)
committer	Vsevolod Stakhov <vsevolod@highsecure.ru>
	Tue, 6 Aug 2013 16:31:21 +0000 (17:31 +0100)
src/rcl/rcl.h		patch \| blob \| history
src/rcl/rcl_internal.h		patch \| blob \| history
src/rcl/rcl_parser.c		patch \| blob \| history