Add primitive support for writing unicode compressed text columns.

git-svn-id: https://svn.code.sf.net/p/jackcess/code/jackcess/trunk@364 f203690c-595d-4dc9-a70b-905162fa7fd2
author: James Ahlborn <jtahlborn@yahoo.com> 2008-07-22 01:35:04 +0000
committer: James Ahlborn <jtahlborn@yahoo.com> 2008-07-22 01:35:04 +0000
commit: d1359adfbf6199a305b062c0a8da0d05e49e84a7 (patch)
tree: 44f0b120ad5a87d55fb3764741cf8af949670a9a /src
parent: 050db7f0f79f0fa33220042814fcb61b538ba8dd (diff)
download: jackcess-d1359adfbf6199a305b062c0a8da0d05e49e84a7.tar.gz
jackcess-d1359adfbf6199a305b062c0a8da0d05e49e84a7.zip
3 files changed, 99 insertions, 6 deletions
diff --git a/src/changes/changes.xml b/src/changes/changes.xml
index 2aa3336..cef026e 100644
--- a/src/changes/changes.xml
+++ b/src/changes/changes.xml
@@ -22,6 +22,9 @@
         Share out-of-line long value pages in order to generate more compact
         database files.
       </action>
+      <action dev="jahlborn" type="add">
+        Add primitive support for writing unicode compressed text columns.
+      </action>
     </release>
     <release version="1.1.15" date="2008-06-27">
       <action dev="jahlborn" type="fix" issue="1998225">
diff --git a/src/java/com/healthmarketscience/jackcess/Column.java b/src/java/com/healthmarketscience/jackcess/Column.java
index 528ce12..8d27ae4 100644
--- a/src/java/com/healthmarketscience/jackcess/Column.java
+++ b/src/java/com/healthmarketscience/jackcess/Column.java
@@ -96,9 +96,15 @@ public class Column implements Comparable<Column> {
   
   /** mask for the unknown bit */
   public static final byte UNKNOWN_FLAG_MASK = (byte)0x02;
-  
+
+  /** pattern matching textual guid strings (allows for optional surrounding
+      '{' and '}') */
   private static final Pattern GUID_PATTERN = Pattern.compile("\\s*[{]?([\\p{XDigit}]{8})-([\\p{XDigit}]{4})-([\\p{XDigit}]{4})-([\\p{XDigit}]{4})-([\\p{XDigit}]{12})[}]?\\s*");
 
+  /** header used to indicate unicode text compression */
+  private static final byte[] TEXT_COMPRESSION_HEADER = 
+  { (byte)0xFF, (byte)0XFE };
+
   /** owning table */
   private final Table _table;
   /** For text columns, whether or not they are compressed */ 
@@ -266,6 +272,10 @@ public class Column implements Comparable<Column> {
     return _compressedUnicode;
   }
 
+  public void setCompressedUnicode(boolean newCompessedUnicode) {
+    _compressedUnicode = newCompessedUnicode;
+  }
+
   public byte getPrecision() {
     return _precision;
   }
@@ -357,6 +367,14 @@ public class Column implements Comparable<Column> {
             "Auto number column must be long integer");
       }
     }
+
+    if(isCompressedUnicode()) {
+      if((getType() != DataType.TEXT) &&
+         (getType() != DataType.MEMO)) {
+        throw new IllegalArgumentException(
+            "Only textual columns allow unicode compression (text/memo)");
+      }
+    }
   }
   
   /**
@@ -949,7 +967,7 @@ public class Column implements Comparable<Column> {
           throw new IOException("Text is too big for column, max " + maxChars
                                 + ", got " + text.length());
         }
-        byte[] encodedData = encodeUncompressedText(text, getFormat()).array();
+        byte[] encodedData = encodeTextValue(text).array();
         obj = encodedData;
         break;
         
@@ -972,7 +990,7 @@ public class Column implements Comparable<Column> {
       // should already be "encoded"
       break;
     case MEMO:
-      obj = encodeUncompressedText(toCharSequence(obj), getFormat()).array();
+      obj = encodeTextValue(toCharSequence(obj)).array();
       break;
     default:
       throw new RuntimeException("unexpected var length, long value type: " +
@@ -1052,8 +1070,9 @@ public class Column implements Comparable<Column> {
       // see if data is compressed.  the 0xFF, 0xFE sequence indicates that
       // compression is used (sort of, see algorithm below)
       boolean isCompressed = ((data.length > 1) &&
-                              (data[0] == (byte)0xFF) &&
-                              (data[1] == (byte)0xFE));
+                              (data[0] == TEXT_COMPRESSION_HEADER[0]) &&
+                              (data[1] == TEXT_COMPRESSION_HEADER[1]));
+
       if(isCompressed) {
 
         Expand expander = new Expand();
@@ -1063,7 +1082,7 @@ public class Column implements Comparable<Column> {
         // compressed mode)
         StringBuilder textBuf = new StringBuilder(data.length);
         // start after two bytes indicating compression use
-        int dataStart = 2;
+        int dataStart = TEXT_COMPRESSION_HEADER.length;
         int dataEnd = dataStart;
         boolean inCompressedMode = true;
         while(dataEnd < data.length) {
@@ -1136,6 +1155,64 @@ public class Column implements Comparable<Column> {
     return format.CHARSET.decode(ByteBuffer.wrap(textBytes, startPos, length));
   }  
 
+  /**
+   * Encodes a text value, possibly compressing.
+   */
+  private ByteBuffer encodeTextValue(CharSequence text)
+    throws IOException
+  {
+    // may only compress if column type allows it
+    if(isCompressedUnicode()) {
+
+      // for now, only do very simple compression (only compress text which is
+      // all ascii text)
+      if(isAsciiCompressible(text)) {
+
+        byte[] encodedChars = new byte[TEXT_COMPRESSION_HEADER.length + 
+                                       text.length()];
+        encodedChars[0] = TEXT_COMPRESSION_HEADER[0];
+        encodedChars[1] = TEXT_COMPRESSION_HEADER[1];
+        for(int i = 0; i < text.length(); ++i) {
+          encodedChars[i + TEXT_COMPRESSION_HEADER.length] = 
+            (byte)text.charAt(i);
+        }
+        return ByteBuffer.wrap(encodedChars);
+      }
+    }
+
+    return encodeUncompressedText(text, getFormat());
+  }
+
+  /**
+   * Returns {@code true} if the given text can be compressed using simple
+   * ASCII encoding, {@code false} otherwise.
+   */
+  private static boolean isAsciiCompressible(CharSequence text) {
+    // only attempt to compress > 2 chars (compressing less than 3 chars would
+    // not result in a space savings due to the 2 byte compression header)
+    if(text.length() <= TEXT_COMPRESSION_HEADER.length) {
+      return false;
+    }
+    // now, see if it is all printable ASCII
+    for(int i = 0; i < text.length(); ++i) {
+      char c = text.charAt(i);
+      if(!isAsciiCrLfOrTab(c)) {
+        return false;
+      }
+    }
+    return true;
+  }
+
+  /**
+   * Returns true if the character is in the ASCII range 0x20-0x7F, or is
+   * one of the control characters TAB (0x09), LF (0x0A) or CR (0x0D).
+   */
+  private static boolean isAsciiCrLfOrTab(int ch)
+  {
+    return ((ch >= 0x20 && ch <= 0x7F)                  // ASCII (non control)
+            || ch == 0x09 || ch == 0x0A || ch == 0x0D); // CR/LF or TAB
+  }
+
   @Override
   public String toString() {
     StringBuilder rtn = new StringBuilder();
diff --git a/src/java/com/healthmarketscience/jackcess/ColumnBuilder.java b/src/java/com/healthmarketscience/jackcess/ColumnBuilder.java
index b2ec988..97fca0a 100644
--- a/src/java/com/healthmarketscience/jackcess/ColumnBuilder.java
+++ b/src/java/com/healthmarketscience/jackcess/ColumnBuilder.java
@@ -48,6 +48,8 @@ public class ColumnBuilder {
   private Integer _scale;
   /** whether or not the column is auto-number */
   private boolean _autoNumber;
+  /** whether or not the column allows compressed unicode */
+  private Boolean _compressedUnicode;
 
   public ColumnBuilder(String name) {
     this(name, null);
@@ -130,6 +132,14 @@ public class ColumnBuilder {
   }
 
   /**
+   * Sets whether or not the new column allows unicode compression.
+   */
+  public ColumnBuilder setCompressedUnicode(boolean compressedUnicode) {
+    _compressedUnicode = compressedUnicode;
+    return this;
+  }
+
+  /**
    * Sets all attributes except name from the given Column template.
    */
   public ColumnBuilder setFromColumn(Column template) {
@@ -164,6 +174,9 @@ public class ColumnBuilder {
     if(_autoNumber) {
       col.setAutoNumber(true);
     }
+    if(_compressedUnicode != null) {
+      col.setCompressedUnicode(_compressedUnicode);
+    }
     return col;
   }
author	James Ahlborn <jtahlborn@yahoo.com>	2008-07-22 01:35:04 +0000
committer	James Ahlborn <jtahlborn@yahoo.com>	2008-07-22 01:35:04 +0000
commit	d1359adfbf6199a305b062c0a8da0d05e49e84a7 (patch)
tree	44f0b120ad5a87d55fb3764741cf8af949670a9a /src
parent	050db7f0f79f0fa33220042814fcb61b538ba8dd (diff)
download	jackcess-d1359adfbf6199a305b062c0a8da0d05e49e84a7.tar.gz jackcess-d1359adfbf6199a305b062c0a8da0d05e49e84a7.zip