Add primitive support for writing unicode compressed text columns.

author James Ahlborn <jtahlborn@yahoo.com>

Tue, 22 Jul 2008 01:35:04 +0000 (01:35 +0000)

committer James Ahlborn <jtahlborn@yahoo.com>

Tue, 22 Jul 2008 01:35:04 +0000 (01:35 +0000)
author James Ahlborn <jtahlborn@yahoo.com>
Tue, 22 Jul 2008 01:35:04 +0000 (01:35 +0000)
committer James Ahlborn <jtahlborn@yahoo.com>
Tue, 22 Jul 2008 01:35:04 +0000 (01:35 +0000)
diff --git a/TODO.txt b/TODO.txt

index dcd75b503bfd8d30f16bbde70806ae58c6fdd44c..a0e6e58c53a2023f9c1016d584751a0c77a7cd00 100644 (file)
--- a/TODO.txt
+++ b/TODO.txt
@@ -2,9 +2,6 @@ Missing pieces:
  
  - handle more text index entry types (extended charsets, alternate encodings)
    * MEDIUM
-- determine when/when not to inline memo/ole data for a row of data depending
-  on how much will be written in the current row
-  * MEDIUM
  - handle row "updates": write row data back to current row, handling overflow
    * MEDIUM
  - implement index creation
@@ -14,4 +11,4 @@ Missing pieces:
    * EASY
  - implement table, column, index deletion
    * EASY - MEDIUM
-
+- implement more comprehensive unicode compression write support
diff --git a/src/changes/changes.xml b/src/changes/changes.xml

index 2aa3336b3d4af6850b084a6f7915223ecb587522..cef026e1022c3bbd51fb8fda7875fd2d2d7c3d49 100644 (file)
--- a/src/changes/changes.xml
+++ b/src/changes/changes.xml
@@ -22,6 +22,9 @@
          Share out-of-line long value pages in order to generate more compact
          database files.
        </action>
+      <action dev="jahlborn" type="add">
+        Add primitive support for writing unicode compressed text columns.
+      </action>
      </release>
      <release version="1.1.15" date="2008-06-27">
        <action dev="jahlborn" type="fix" issue="1998225">
diff --git a/src/java/com/healthmarketscience/jackcess/Column.java b/src/java/com/healthmarketscience/jackcess/Column.java

index 528ce12441c35a1f251c87912fb4df21e0f2e163..8d27ae4361a7ab993a3e63e86c4739973ac7a5d1 100644 (file)
--- a/src/java/com/healthmarketscience/jackcess/Column.java
+++ b/src/java/com/healthmarketscience/jackcess/Column.java
@@ -96,9 +96,15 @@ public class Column implements Comparable<Column> {
    
    /** mask for the unknown bit */
    public static final byte UNKNOWN_FLAG_MASK = (byte)0x02;
-  
+
+  /** pattern matching textual guid strings (allows for optional surrounding
+      '{' and '}') */
    private static final Pattern GUID_PATTERN = Pattern.compile("\\s*[{]?([\\p{XDigit}]{8})-([\\p{XDigit}]{4})-([\\p{XDigit}]{4})-([\\p{XDigit}]{4})-([\\p{XDigit}]{12})[}]?\\s*");
  
+  /** header used to indicate unicode text compression */
+  private static final byte[] TEXT_COMPRESSION_HEADER = 
+  { (byte)0xFF, (byte)0XFE };
+
    /** owning table */
    private final Table _table;
    /** For text columns, whether or not they are compressed */ 
@@ -266,6 +272,10 @@ public class Column implements Comparable<Column> {
      return _compressedUnicode;
    }
  
+  public void setCompressedUnicode(boolean newCompessedUnicode) {
+    _compressedUnicode = newCompessedUnicode;
+  }
+
    public byte getPrecision() {
      return _precision;
    }
@@ -357,6 +367,14 @@ public class Column implements Comparable<Column> {
              "Auto number column must be long integer");
        }
      }
+
+    if(isCompressedUnicode()) {
+      if((getType() != DataType.TEXT) &&
+         (getType() != DataType.MEMO)) {
+        throw new IllegalArgumentException(
+            "Only textual columns allow unicode compression (text/memo)");
+      }
+    }
    }
    
    /**
@@ -949,7 +967,7 @@ public class Column implements Comparable<Column> {
            throw new IOException("Text is too big for column, max " + maxChars
                                  + ", got " + text.length());
          }
-        byte[] encodedData = encodeUncompressedText(text, getFormat()).array();
+        byte[] encodedData = encodeTextValue(text).array();
          obj = encodedData;
          break;
          
@@ -972,7 +990,7 @@ public class Column implements Comparable<Column> {
        // should already be "encoded"
        break;
      case MEMO:
-      obj = encodeUncompressedText(toCharSequence(obj), getFormat()).array();
+      obj = encodeTextValue(toCharSequence(obj)).array();
        break;
      default:
        throw new RuntimeException("unexpected var length, long value type: " +
@@ -1052,8 +1070,9 @@ public class Column implements Comparable<Column> {
        // see if data is compressed.  the 0xFF, 0xFE sequence indicates that
        // compression is used (sort of, see algorithm below)
        boolean isCompressed = ((data.length > 1) &&
-                              (data[0] == (byte)0xFF) &&
-                              (data[1] == (byte)0xFE));
+                              (data[0] == TEXT_COMPRESSION_HEADER[0]) &&
+                              (data[1] == TEXT_COMPRESSION_HEADER[1]));
+
        if(isCompressed) {
  
          Expand expander = new Expand();
@@ -1063,7 +1082,7 @@ public class Column implements Comparable<Column> {
          // compressed mode)
          StringBuilder textBuf = new StringBuilder(data.length);
          // start after two bytes indicating compression use
-        int dataStart = 2;
+        int dataStart = TEXT_COMPRESSION_HEADER.length;
          int dataEnd = dataStart;
          boolean inCompressedMode = true;
          while(dataEnd < data.length) {
@@ -1136,6 +1155,64 @@ public class Column implements Comparable<Column> {
      return format.CHARSET.decode(ByteBuffer.wrap(textBytes, startPos, length));
    }  
  
+  /**
+   * Encodes a text value, possibly compressing.
+   */
+  private ByteBuffer encodeTextValue(CharSequence text)
+    throws IOException
+  {
+    // may only compress if column type allows it
+    if(isCompressedUnicode()) {
+
+      // for now, only do very simple compression (only compress text which is
+      // all ascii text)
+      if(isAsciiCompressible(text)) {
+
+        byte[] encodedChars = new byte[TEXT_COMPRESSION_HEADER.length + 
+                                       text.length()];
+        encodedChars[0] = TEXT_COMPRESSION_HEADER[0];
+        encodedChars[1] = TEXT_COMPRESSION_HEADER[1];
+        for(int i = 0; i < text.length(); ++i) {
+          encodedChars[i + TEXT_COMPRESSION_HEADER.length] = 
+            (byte)text.charAt(i);
+        }
+        return ByteBuffer.wrap(encodedChars);
+      }
+    }
+
+    return encodeUncompressedText(text, getFormat());
+  }
+
+  /**
+   * Returns {@code true} if the given text can be compressed using simple
+   * ASCII encoding, {@code false} otherwise.
+   */
+  private static boolean isAsciiCompressible(CharSequence text) {
+    // only attempt to compress > 2 chars (compressing less than 3 chars would
+    // not result in a space savings due to the 2 byte compression header)
+    if(text.length() <= TEXT_COMPRESSION_HEADER.length) {
+      return false;
+    }
+    // now, see if every char is simple ASCII text (see isAsciiCrLfOrTab)
+    for(int i = 0; i < text.length(); ++i) {
+      char c = text.charAt(i);
+      if(!isAsciiCrLfOrTab(c)) {
+        return false;
+      }
+    }
+    return true;
+  }
+
+  /**
+   * Returns true if the character is in the ASCII range 0x20-0x7F (which
+   * includes DEL), or is one of the control characters CR, LF and TAB.
+   */
+  private static boolean isAsciiCrLfOrTab(int ch)
+  {
+    return ((ch >= 0x20 && ch <= 0x7F)                  // ASCII (non control)
+            || ch == 0x09 || ch == 0x0A || ch == 0x0D); // CR/LF or TAB
+  }
+
    @Override
    public String toString() {
      StringBuilder rtn = new StringBuilder();
diff --git a/src/java/com/healthmarketscience/jackcess/ColumnBuilder.java b/src/java/com/healthmarketscience/jackcess/ColumnBuilder.java

index b2ec9883a68c2e4a8574fb3320ecfc6157199b43..97fca0ab3b92f18cbd59b5f6abc7d47c7d4c796d 100644 (file)
--- a/src/java/com/healthmarketscience/jackcess/ColumnBuilder.java
+++ b/src/java/com/healthmarketscience/jackcess/ColumnBuilder.java
@@ -48,6 +48,8 @@ public class ColumnBuilder {
    private Integer _scale;
    /** whether or not the column is auto-number */
    private boolean _autoNumber;
+  /** whether or not the column allows compressed unicode */
+  private Boolean _compressedUnicode;
  
    public ColumnBuilder(String name) {
      this(name, null);
@@ -129,6 +131,14 @@ public class ColumnBuilder {
      return this;
    }
  
+  /**
+   * Sets whether or not the new column allows unicode compression.
+   */
+  public ColumnBuilder setCompressedUnicode(boolean compressedUnicode) {
+    _compressedUnicode = compressedUnicode;
+    return this;
+  }
+
    /**
     * Sets all attributes except name from the given Column template.
     */
@@ -164,6 +174,9 @@ public class ColumnBuilder {
      if(_autoNumber) {
        col.setAutoNumber(true);
      }
+    if(_compressedUnicode != null) {
+      col.setCompressedUnicode(_compressedUnicode);
+    }
      return col;
    }
    
diff --git a/test/src/java/com/healthmarketscience/jackcess/TableTest.java b/test/src/java/com/healthmarketscience/jackcess/TableTest.java

index 85b6bf7fc8acc2194bb1023f32c75fb196046e2f..de8de5cf1af5456a5397ebd2fbaf834c6c40f451 100644 (file)
--- a/test/src/java/com/healthmarketscience/jackcess/TableTest.java
+++ b/test/src/java/com/healthmarketscience/jackcess/TableTest.java
@@ -27,8 +27,10 @@ King of Prussia, PA 19406
  
  package com.healthmarketscience.jackcess;
  
+import java.io.IOException;
  import java.nio.ByteBuffer;
  import java.util.ArrayList;
+import java.util.Arrays;
  import java.util.List;
  
  import junit.framework.TestCase;
@@ -37,37 +39,30 @@ import junit.framework.TestCase;
   * @author Tim McCune
   */
  public class TableTest extends TestCase {
+
+  private final PageChannel _pageChannel = new PageChannel(true);
+  private List<Column> _columns = new ArrayList<Column>();
+  private Table _testTable;
    
    public TableTest(String name) {
      super(name);
    }
    
    public void testCreateRow() throws Exception {
-    final JetFormat format = JetFormat.VERSION_4;
-    final PageChannel pageChannel = new PageChannel(true);
-    List<Column> columns = new ArrayList<Column>();
-    Column col = newTestColumn(pageChannel);
+    Column col = newTestColumn();
      col.setType(DataType.INT);
-    columns.add(col);
-    col = newTestColumn(pageChannel);
+    _columns.add(col);
+    col = newTestColumn();
      col.setType(DataType.TEXT);
-    columns.add(col);
-    col = newTestColumn(pageChannel);
+    _columns.add(col);
+    col = newTestColumn();
      col.setType(DataType.TEXT);
-    columns.add(col);
-    Table table = new Table(true, columns) {
-        @Override
-        public PageChannel getPageChannel() {
-          return pageChannel;
-        }
-      };
-    int colCount = 3;
-    Object[] row = new Object[colCount];
-    row[0] = new Short((short) 9);
-    row[1] = "Tim";
-    row[2] = "McCune";
-    ByteBuffer buffer = table.createRow(row, format.MAX_ROW_SIZE,
-                                        pageChannel.createPageBuffer());
+    _columns.add(col);
+    newTestTable();
+    
+    int colCount = _columns.size();
+    ByteBuffer buffer = createRow(9, "Tim", "McCune");
+
      assertEquals((short) colCount, buffer.getShort());
      assertEquals((short) 9, buffer.getShort());
      assertEquals((byte) 'T', buffer.get());
@@ -78,17 +73,107 @@ public class TableTest extends TestCase {
      assertEquals((byte) 7, buffer.get(30));
    }
  
-  private static Column newTestColumn(final PageChannel pageChannel) {
-    return new Column(true) {
+  public void testUnicodeCompression() throws Exception {
+    Column col = newTestColumn();
+    col = newTestColumn();
+    col.setType(DataType.TEXT);
+    _columns.add(col);
+    col = newTestColumn();
+    col.setType(DataType.MEMO);
+    _columns.add(col);
+    newTestTable();
+
+    String small = "this is a string";
+    String smallNotAscii = "this is a string\0";
+    String large = DatabaseTest.createString(30);
+    String largeNotAscii = large + "\0";
+
+    ByteBuffer[] buf1 = encodeColumns(small, large);
+    ByteBuffer[] buf2 = encodeColumns(smallNotAscii, largeNotAscii);
+
+    for(Column tmp : _columns) {
+      tmp.setCompressedUnicode(true);
+    }
+    
+    ByteBuffer[] bufCmp1 = encodeColumns(small, large);
+    ByteBuffer[] bufCmp2 = encodeColumns(smallNotAscii, largeNotAscii);
+    
+    assertEquals(buf1[0].remaining(),
+                 (bufCmp1[0].remaining() + small.length() - 2));
+    assertEquals(buf1[1].remaining(),
+                 (bufCmp1[1].remaining() + large.length() - 2));
+
+    for(int i = 0; i < buf2.length; ++i) {
+      assertTrue(Arrays.equals(toBytes(buf2[i]), toBytes(bufCmp2[i])));
+    }
+
+    assertEquals(Arrays.asList(small, large),
+                 Arrays.asList(decodeColumns(bufCmp1)));
+    assertEquals(Arrays.asList(smallNotAscii, largeNotAscii),
+                 Arrays.asList(decodeColumns(bufCmp2)));
+
+  }
+
+  private ByteBuffer createRow(Object... row) 
+    throws IOException
+  {
+    return _testTable.createRow(
+        row, _testTable.getFormat().MAX_ROW_SIZE,
+        _testTable.getPageChannel().createPageBuffer());
+  }
+
+  private ByteBuffer[] encodeColumns(Object... row)
+    throws IOException
+  {
+    ByteBuffer[] result = new ByteBuffer[_columns.size()];
+    for(int i = 0; i < _columns.size(); ++i) {
+      Column col = _columns.get(i);
+      result[i] = col.write(row[i], _testTable.getFormat().MAX_ROW_SIZE);
+    }
+    return result;
+  }
+
+  private Object[] decodeColumns(ByteBuffer[] buffers)
+    throws IOException
+  {
+    Object[] result = new Object[_columns.size()];
+    for(int i = 0; i < _columns.size(); ++i) {
+      Column col = _columns.get(i);
+      result[i] = col.read(toBytes(buffers[i]));
+    }
+    return result;
+  }
+
+  private static byte[] toBytes(ByteBuffer buffer) {
+    buffer.rewind();
+    byte[] b = new byte[buffer.remaining()];
+    buffer.get(b);
+    return b;
+  }
+
+  private Table newTestTable() 
+    throws Exception
+  {
+    _testTable = new Table(true, _columns) {
          @Override
          public PageChannel getPageChannel() {
-          return pageChannel;
+          return _pageChannel;
          }
          @Override
          public JetFormat getFormat() {
            return JetFormat.VERSION_4;
          }
        };
+    return _testTable;
+  }
+
+  private Column newTestColumn() {
+    return new Column(true) {
+        @Override
+        public Table getTable() {
+          return _testTable;
+        }
+      };
    }
    
  }
author	James Ahlborn <jtahlborn@yahoo.com>
	Tue, 22 Jul 2008 01:35:04 +0000 (01:35 +0000)
committer	James Ahlborn <jtahlborn@yahoo.com>
	Tue, 22 Jul 2008 01:35:04 +0000 (01:35 +0000)
TODO.txt		patch \| blob \| history
src/changes/changes.xml		patch \| blob \| history
src/java/com/healthmarketscience/jackcess/Column.java		patch \| blob \| history
src/java/com/healthmarketscience/jackcess/ColumnBuilder.java		patch \| blob \| history
test/src/java/com/healthmarketscience/jackcess/TableTest.java		patch \| blob \| history