From 63a7fdf83df3f43343ba4fdb878bdfc8ebf52204 Mon Sep 17 00:00:00 2001
From: Haijian Wang <haijian@vaadin.com>
Date: Fri, 4 Jan 2013 14:13:02 +0200
Subject: [PATCH] Preserve UTF-8 in imported files and output @charset in
 generated css (#10505)

Change-Id: I53f46611ef39124d532b118bb8ccb34f31cf8a6a
---
 .../vaadin/sass/internal/ScssStylesheet.java  |  44 ++++-
 .../vaadin/sass/internal/parser/Parser.java   | 185 +++++++++---------
 .../com/vaadin/sass/internal/parser/Parser.jj | 135 ++++++-------
 .../internal/visitor/ImportNodeHandler.java   |   5 +-
 .../tests/resources/automatic/css/utf8.css    |   5 +
 ...o-be-imported-scss-file-contains-utf8.scss |   3 +
 .../tests/resources/automatic/scss/utf8.scss  |   4 +
 7 files changed, 210 insertions(+), 171 deletions(-)
 create mode 100644 theme-compiler/tests/resources/automatic/css/utf8.css
 create mode 100644 theme-compiler/tests/resources/automatic/scss/utf8-imported/to-be-imported-scss-file-contains-utf8.scss
 create mode 100644 theme-compiler/tests/resources/automatic/scss/utf8.scss

diff --git a/theme-compiler/src/com/vaadin/sass/internal/ScssStylesheet.java b/theme-compiler/src/com/vaadin/sass/internal/ScssStylesheet.java
index e915bdca7e..fd00dbff2c 100644
--- a/theme-compiler/src/com/vaadin/sass/internal/ScssStylesheet.java
+++ b/theme-compiler/src/com/vaadin/sass/internal/ScssStylesheet.java
@@ -58,6 +58,8 @@ public class ScssStylesheet extends Node {
 
     private String fileName;
 
+    private String charset;
+
     /**
      * Read in a file SCSS and parse it into a ScssStylesheet
      * 
@@ -69,7 +71,7 @@ public class ScssStylesheet extends Node {
     }
 
     /**
-     * Main entry point for the SASS compiler. Takes in a file and builds upp a
+     * Main entry point for the SASS compiler. Takes in a file and builds up a
      * ScssStylesheet tree out of it. Calling compile() on it will transform
      * SASS into CSS. Calling toString() will print out the SCSS/CSS.
      * 
@@ -80,6 +82,29 @@ public class ScssStylesheet extends Node {
      */
     public static ScssStylesheet get(String identifier) throws CSSException,
             IOException {
+        return get(identifier, null);
+    }
+
+    /**
+     * Main entry point for the SASS compiler. Takes in a file and encoding then
+     * builds up a ScssStylesheet tree out of it. Calling compile() on it will
+     * transform SASS into CSS. Calling toString() will print out the SCSS/CSS.
+     * 
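+     * @param identifier path of the SCSS file to parse
+     * @param encoding charset to read with, or null to detect it from @charset
+     * @return the parsed stylesheet, or null if no input source is available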
+     * @throws CSSException
+     * @throws IOException
+     */
+    public static ScssStylesheet get(String identifier, String encoding)
+            throws CSSException, IOException {
+        /*
+         * The encoding to use is passed in through the "encoding" parameter.
+         * Imported child SCSS nodes get the same encoding as their parent,
+         * and ultimately that of the root SCSS file. For the root SCSS node
+         * the "encoding" parameter is null; its encoding is then determined
+         * by its @charset declaration, falling back to ASCII by default.
+         */
         File file = new File(identifier);
         file = file.getCanonicalFile();
 
@@ -90,12 +115,14 @@ public class ScssStylesheet extends Node {
         if (source == null) {
             return null;
         }
+        source.setEncoding(encoding);
 
         Parser parser = new Parser();
         parser.setErrorHandler(new SCSSErrorHandler());
         parser.setDocumentHandler(handler);
         parser.parseStyleSheet(source);
 
+        stylesheet.setCharset(parser.getInputSource().getEncoding());
         return stylesheet;
     }
 
@@ -169,10 +196,15 @@ public class ScssStylesheet extends Node {
     @Override
     public String toString() {
         StringBuilder string = new StringBuilder("");
+        String delimeter = "\n\n";
+        // emit @charset only when one is set (not null) and it is not "ASCII".
+        if (getCharset() != null && !"ASCII".equals(getCharset())) {
+            string.append("@charset \"").append(getCharset()).append("\";")
+                    .append(delimeter);
+        }
         if (children.size() > 0) {
             string.append(children.get(0).toString());
         }
-        String delimeter = "\n\n";
         if (children.size() > 1) {
             for (int i = 1; i < children.size(); i++) {
                 String childString = children.get(i).toString();
@@ -318,4 +350,12 @@ public class ScssStylesheet extends Node {
     public static final void warning(String msg) {
         Logger.getLogger(ScssStylesheet.class.getName()).warning(msg);
     }
+
+    public String getCharset() {
+        return charset;
+    }
+
+    public void setCharset(String charset) {
+        this.charset = charset;
+    }
 }
diff --git a/theme-compiler/src/com/vaadin/sass/internal/parser/Parser.java b/theme-compiler/src/com/vaadin/sass/internal/parser/Parser.java
index 0188926636..70fab6413a 100644
--- a/theme-compiler/src/com/vaadin/sass/internal/parser/Parser.java
+++ b/theme-compiler/src/com/vaadin/sass/internal/parser/Parser.java
@@ -70,6 +70,10 @@ public class Parser implements org.w3c.css.sac.Parser, ParserConstants {
         throw new CSSException(CSSException.SAC_NOT_SUPPORTED_ERR);
     }
 
+    public InputSource getInputSource(){
+        return source;
+    }
+
     /**
      * Set the document handler for this parser
      */
@@ -278,65 +282,70 @@ public class Parser implements org.w3c.css.sac.Parser, ParserConstants {
                 }
             }
         }
-        String encoding = "ASCII";
+        // use the encoding preset on the source, if any (null = detect below).
+        String encoding = source.getEncoding();
         InputStream input = source.getByteStream();
-        char c = ' ';
-
         if (!input.markSupported()) {
             input = new BufferedInputStream(input);
             source.setByteStream(input);
+            input.mark(100);
         }
-        input.mark(100);
-        c = (char) input.read();
-
-        if (c == '@') {
-            // hum, is it a charset ?
-            int size   = 100;
-            byte[] buf = new byte[size];
-            input.read(buf, 0, 7);
-            String keyword = new String(buf, 0, 7);
-            if (keyword.equals("charset")) {
-                // Yes, this is the charset declaration !
-
-                // here I don't use the right declaration : white space are ' '.
-                while ((c = (char) input.read()) == ' ') {
-                    // find the first quote
-                }
-                char endChar = c;
-                int i = 0;
+        if(encoding == null){
+            encoding = "ASCII";
 
-                if ((endChar != '"') && (endChar != '\u005c'')) {
-                    // hum this is not a quote.
-                    throw new CSSException("invalid charset declaration");
-                }
+            char c = ' ';
+
+            c = (char) input.read();
 
-                while ((c = (char) input.read()) != endChar) {
-                    buf[i++] = (byte) c;
-                    if (i == size) {
-                        byte[] old = buf;
-                        buf = new byte[size + 100];
-                        System.arraycopy(old, 0, buf, 0, size);
-                        size += 100;
+            if (c == '@') {
+                // hum, is it a charset ?
+                int size   = 100;
+                byte[] buf = new byte[size];
+                input.read(buf, 0, 7);
+                String keyword = new String(buf, 0, 7);
+                if (keyword.equals("charset")) {
+                    // Yes, this is the charset declaration !
+
+                    // here I don't use the right declaration : white space are ' '.
+                    while ((c = (char) input.read()) == ' ') {
+                        // find the first quote
                     }
-                }
-                while ((c = (char) input.read()) == ' ') {
-                    // find the next relevant character
-                }
-                if (c != ';') {
-                    // no semi colon at the end ?
-                    throw new CSSException("invalid charset declaration: "
+                    char endChar = c;
+                    int i = 0;
+
+                    if ((endChar != '"') && (endChar != '\u005c'')) {
+                        // hum this is not a quote.
+                        throw new CSSException("invalid charset declaration");
+                    }
+
+                    while ((c = (char) input.read()) != endChar) {
+                        buf[i++] = (byte) c;
+                        if (i == size) {
+                            byte[] old = buf;
+                            buf = new byte[size + 100];
+                            System.arraycopy(old, 0, buf, 0, size);
+                            size += 100;
+                        }
+                    }
+                    while ((c = (char) input.read()) == ' ') {
+                        // find the next relevant character
+                    }
+                    if (c != ';') {
+                        // no semi colon at the end ?
+                        throw new CSSException("invalid charset declaration: "
                                            + "missing semi colon");
-                }
-                encoding = new String(buf, 0, i);
-                if (source.getEncoding() != null) {
-                    // compare the two encoding informations.
-                    // For example, I don't accept to have ASCII and after UTF-8.
-                    // Is it really good ? That is the question.
-                    if (!encoding.equals(source.getEncoding())) {
-                        throw new CSSException("invalid encoding information.");
                     }
-                }
-            } // else no charset declaration available
+                    encoding = new String(buf, 0, i);
+                    if (source.getEncoding() != null) {
+                        // compare the two encoding informations.
+                        // For example, I don't accept to have ASCII and after UTF-8.
+                        // Is it really good ? That is the question.
+                        if (!encoding.equals(source.getEncoding())) {
+                            throw new CSSException("invalid encoding information.");
+                        }
+                    }
+                } // else no charset declaration available
+            }
         }
         // ok set the real encoding of this source.
         source.setEncoding(encoding);
@@ -5571,26 +5580,10 @@ LexicalUnitImpl result = null;
                 case '5': case '6': case '7': case '8': case '9':
                 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
                 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
-                    int numValue = Character.digit(c, 16);
-                    int count = 0;
-                    int p = 16;
-
-                    while (index + 1 < len && count < 6) {
-                        c = s.charAt(index+1);
-
-                        if (Character.digit(c, 16) != -1) {
-                            numValue = (numValue * 16) + Character.digit(c, 16);
-                            p *= 16;
-                            index++;
-                        } else {
-                            if (c == ' ') {
-                                // skip the latest white space
-                                index++;
-                            }
-                            break;
-                        }
+                    buf.append('\u005c\u005c');
+                    while (index < len) {
+                        buf.append(s.charAt(index++));
                     }
-                    buf.append((char) numValue);
                     break;
                 case '\u005cn':
                 case '\u005cf':
@@ -5920,34 +5913,6 @@ LexicalUnitImpl result = null;
     finally { jj_save(12, xla); }
   }
 
-  private boolean jj_3R_196() {
-    Token xsp;
-    xsp = jj_scanpos;
-    if (jj_3_3()) {
-    jj_scanpos = xsp;
-    if (jj_3R_239()) {
-    jj_scanpos = xsp;
-    if (jj_3R_240()) return true;
-    }
-    }
-    return false;
-  }
-
-  private boolean jj_3_3() {
-    if (jj_3R_166()) return true;
-    return false;
-  }
-
-  private boolean jj_3_6() {
-    if (jj_3R_169()) return true;
-    return false;
-  }
-
-  private boolean jj_3_1() {
-    if (jj_3R_165()) return true;
-    return false;
-  }
-
   private boolean jj_3R_368() {
     Token xsp;
     xsp = jj_scanpos;
@@ -7939,6 +7904,34 @@ LexicalUnitImpl result = null;
     return false;
   }
 
+  private boolean jj_3R_196() {
+    Token xsp;
+    xsp = jj_scanpos;
+    if (jj_3_3()) {
+    jj_scanpos = xsp;
+    if (jj_3R_239()) {
+    jj_scanpos = xsp;
+    if (jj_3R_240()) return true;
+    }
+    }
+    return false;
+  }
+
+  private boolean jj_3_3() {
+    if (jj_3R_166()) return true;
+    return false;
+  }
+
+  private boolean jj_3_6() {
+    if (jj_3R_169()) return true;
+    return false;
+  }
+
+  private boolean jj_3_1() {
+    if (jj_3R_165()) return true;
+    return false;
+  }
+
   /** Generated Token Manager. */
   public ParserTokenManager token_source;
   /** Current token. */
diff --git a/theme-compiler/src/com/vaadin/sass/internal/parser/Parser.jj b/theme-compiler/src/com/vaadin/sass/internal/parser/Parser.jj
index c26407c196..4e76022ae1 100644
--- a/theme-compiler/src/com/vaadin/sass/internal/parser/Parser.jj
+++ b/theme-compiler/src/com/vaadin/sass/internal/parser/Parser.jj
@@ -87,6 +87,10 @@ public class Parser implements org.w3c.css.sac.Parser {
     public void setLocale(Locale locale) throws CSSException {
 	throw new CSSException(CSSException.SAC_NOT_SUPPORTED_ERR);
     }
+    
+    public InputSource getInputSource(){
+        return source;
+    }
 
     /**
      * Set the document handler for this parser
@@ -296,65 +300,70 @@ public class Parser implements org.w3c.css.sac.Parser {
 		}
 	    }
 	}
-	String encoding = "ASCII";
+	// use the encoding preset on the source, if any (null = detect below).
+	String encoding = source.getEncoding();
 	InputStream input = source.getByteStream();
-	char c = ' ';
-
 	if (!input.markSupported()) {
-	    input = new BufferedInputStream(input);
-	    source.setByteStream(input);
-	}
-	input.mark(100);
-	c = (char) input.read();
-
-	if (c == '@') {
-	    // hum, is it a charset ?
-	    int size   = 100;
-	    byte[] buf = new byte[size];
-	    input.read(buf, 0, 7);
-	    String keyword = new String(buf, 0, 7);
-	    if (keyword.equals("charset")) {
-		// Yes, this is the charset declaration !
-
-		// here I don't use the right declaration : white space are ' '.
-		while ((c = (char) input.read()) == ' ') {
-		    // find the first quote
-		}
-		char endChar = c;
-		int i = 0;
+            input = new BufferedInputStream(input);
+            source.setByteStream(input);
+        }
+        input.mark(100);
+	if(encoding == null){
+	    encoding = "ASCII";
+	
+	    char c = ' ';
+
+	    c = (char) input.read();
+
+	    if (c == '@') {
+	        // hum, is it a charset ?
+	        int size   = 100;
+	        byte[] buf = new byte[size];
+	        input.read(buf, 0, 7);
+	        String keyword = new String(buf, 0, 7);
+	        if (keyword.equals("charset")) {
+	            // Yes, this is the charset declaration !
+
+	            // here I don't use the right declaration : white space are ' '.
+	            while ((c = (char) input.read()) == ' ') {
+		        // find the first quote
+		    }
+		    char endChar = c;
+		    int i = 0;
 
-		if ((endChar != '"') && (endChar != '\'')) {
-		    // hum this is not a quote.
-		    throw new CSSException("invalid charset declaration");
-		}
+		    if ((endChar != '"') && (endChar != '\'')) {
+		        // hum this is not a quote.
+		        throw new CSSException("invalid charset declaration");
+		    }
 
-		while ((c = (char) input.read()) != endChar) {
-		    buf[i++] = (byte) c;
-		    if (i == size) {
-			byte[] old = buf;
-			buf = new byte[size + 100];
-			System.arraycopy(old, 0, buf, 0, size);
-			size += 100;
+		    while ((c = (char) input.read()) != endChar) {
+		        buf[i++] = (byte) c;
+		        if (i == size) {
+		            byte[] old = buf;
+			    buf = new byte[size + 100];
+			    System.arraycopy(old, 0, buf, 0, size);
+			    size += 100;
+		        }
 		    }
-		}
-		while ((c = (char) input.read()) == ' ') {
-		    // find the next relevant character
-		}
-		if (c != ';') {
-		    // no semi colon at the end ?
-		    throw new CSSException("invalid charset declaration: "
+    		    while ((c = (char) input.read()) == ' ') {
+    		        // find the next relevant character
+    		    }
+		    if (c != ';') {
+		        // no semi colon at the end ?
+		        throw new CSSException("invalid charset declaration: "
 					   + "missing semi colon");
-		}
-		encoding = new String(buf, 0, i);
-		if (source.getEncoding() != null) {
-		    // compare the two encoding informations.
-		    // For example, I don't accept to have ASCII and after UTF-8.
-		    // Is it really good ? That is the question.
-		    if (!encoding.equals(source.getEncoding())) {
-			throw new CSSException("invalid encoding information.");
 		    }
-		}
-	    } // else no charset declaration available
+		    encoding = new String(buf, 0, i);
+		    if (source.getEncoding() != null) {
+		        // compare the two encoding informations.
+		        // For example, I don't accept to have ASCII and after UTF-8.
+		        // Is it really good ? That is the question.
+		        if (!encoding.equals(source.getEncoding())) {
+		            throw new CSSException("invalid encoding information.");
+		        }
+		    }
+	        } // else no charset declaration available
+	    }
 	}
 	// ok set the real encoding of this source.
 	source.setEncoding(encoding);
@@ -2710,26 +2719,10 @@ String convertStringIndex(String s, int start, int len) {
 		case '5': case '6': case '7': case '8': case '9':
 		case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
 		case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
-		    int numValue = Character.digit(c, 16);
-		    int count = 0;
-		    int p = 16;
-
-		    while (index + 1 < len && count < 6) {
-			c = s.charAt(index+1);
-
-			if (Character.digit(c, 16) != -1) {
-			    numValue = (numValue * 16) + Character.digit(c, 16);
-			    p *= 16;
-			    index++;
-			} else {
-			    if (c == ' ') {
-				// skip the latest white space
-				index++;
-			    }
-			    break;
-			}
+		    buf.append('\\');
+		    while (index < len) {
+			buf.append(s.charAt(index++));
 		    }
-		    buf.append((char) numValue);
 		    break;
 		case '\n':
 		case '\f':
diff --git a/theme-compiler/src/com/vaadin/sass/internal/visitor/ImportNodeHandler.java b/theme-compiler/src/com/vaadin/sass/internal/visitor/ImportNodeHandler.java
index 946d56ba89..5593241297 100644
--- a/theme-compiler/src/com/vaadin/sass/internal/visitor/ImportNodeHandler.java
+++ b/theme-compiler/src/com/vaadin/sass/internal/visitor/ImportNodeHandler.java
@@ -48,8 +48,9 @@ public class ImportNodeHandler {
                             filePathBuilder.append(".scss");
                         }
 
-                        ScssStylesheet imported = ScssStylesheet
-                                .get(filePathBuilder.toString());
+                        // set parent's charset to imported node.
+                        ScssStylesheet imported = ScssStylesheet.get(
+                                filePathBuilder.toString(), node.getCharset());
                         if (imported == null) {
                             imported = ScssStylesheet.get(importNode.getUri());
                         }
diff --git a/theme-compiler/tests/resources/automatic/css/utf8.css b/theme-compiler/tests/resources/automatic/css/utf8.css
new file mode 100644
index 0000000000..b27d6cedf9
--- /dev/null
+++ b/theme-compiler/tests/resources/automatic/css/utf8.css
@@ -0,0 +1,5 @@
+@charset "UTF-8";
+.imported { content: "\1f4c5"; }
+.imported_raw_utf { content: "â¥"; }
+.bar { content: "\1f4c5"; }
+.raw_utf { content: "ð"; }
\ No newline at end of file
diff --git a/theme-compiler/tests/resources/automatic/scss/utf8-imported/to-be-imported-scss-file-contains-utf8.scss b/theme-compiler/tests/resources/automatic/scss/utf8-imported/to-be-imported-scss-file-contains-utf8.scss
new file mode 100644
index 0000000000..f8a08a4a96
--- /dev/null
+++ b/theme-compiler/tests/resources/automatic/scss/utf8-imported/to-be-imported-scss-file-contains-utf8.scss
@@ -0,0 +1,3 @@
+@charset "abc";
+.imported{content: '\1f4c5';}
+.imported_raw_utf{content: "â¥";}
diff --git a/theme-compiler/tests/resources/automatic/scss/utf8.scss b/theme-compiler/tests/resources/automatic/scss/utf8.scss
new file mode 100644
index 0000000000..b568674073
--- /dev/null
+++ b/theme-compiler/tests/resources/automatic/scss/utf8.scss
@@ -0,0 +1,4 @@
+@charset "UTF-8";
+@import "utf8-imported/to-be-imported-scss-file-contains-utf8";
+.bar {content: "\1f4c5";}
+.raw_utf {content: "ð";}
\ No newline at end of file
-- 
2.39.5