diff options
-rw-r--r-- | TODO.txt | 5 | ||||
-rw-r--r-- | src/changes/changes.xml | 3 | ||||
-rw-r--r-- | src/java/com/healthmarketscience/jackcess/Column.java | 89 | ||||
-rw-r--r-- | src/java/com/healthmarketscience/jackcess/ColumnBuilder.java | 13 | ||||
-rw-r--r-- | test/src/java/com/healthmarketscience/jackcess/TableTest.java | 135 |
5 files changed, 210 insertions, 35 deletions
@@ -2,9 +2,6 @@ Missing pieces: - handle more text index entry types (extended charsets, alternate encodings) * MEDIUM -- determine when/when not to inline memo/ole data for a row of data depending - on how much will be written in the current row - * MEDIUM - handle row "updates": write row data back to current row, handling overflow * MEDIUM - implement index creation @@ -14,4 +11,4 @@ Missing pieces: * EASY - implement table, column, index deletion * EASY - MEDIUM - +- implement more comprehensive unicode compression write support diff --git a/src/changes/changes.xml b/src/changes/changes.xml index 2aa3336..cef026e 100644 --- a/src/changes/changes.xml +++ b/src/changes/changes.xml @@ -22,6 +22,9 @@ Share out-of-line long value pages in order to generate more compact database files. </action> + <action dev="jahlborn" type="add"> + Add primitive support for writing unicode compressed text columns. + </action> </release> <release version="1.1.15" date="2008-06-27"> <action dev="jahlborn" type="fix" issue="1998225"> diff --git a/src/java/com/healthmarketscience/jackcess/Column.java b/src/java/com/healthmarketscience/jackcess/Column.java index 528ce12..8d27ae4 100644 --- a/src/java/com/healthmarketscience/jackcess/Column.java +++ b/src/java/com/healthmarketscience/jackcess/Column.java @@ -96,9 +96,15 @@ public class Column implements Comparable<Column> { /** mask for the unknown bit */ public static final byte UNKNOWN_FLAG_MASK = (byte)0x02; - + + /** pattern matching textual guid strings (allows for optional surrounding + '{' and '}') */ private static final Pattern GUID_PATTERN = Pattern.compile("\\s*[{]?([\\p{XDigit}]{8})-([\\p{XDigit}]{4})-([\\p{XDigit}]{4})-([\\p{XDigit}]{4})-([\\p{XDigit}]{12})[}]?\\s*"); + /** header used to indicate unicode text compression */ + private static final byte[] TEXT_COMPRESSION_HEADER = + { (byte)0xFF, (byte)0XFE }; + /** owning table */ private final Table _table; /** For text columns, whether or not they are compressed */ @@ 
-266,6 +272,10 @@ public class Column implements Comparable<Column> { return _compressedUnicode; } + public void setCompressedUnicode(boolean newCompessedUnicode) { + _compressedUnicode = newCompessedUnicode; + } + public byte getPrecision() { return _precision; } @@ -357,6 +367,14 @@ public class Column implements Comparable<Column> { "Auto number column must be long integer"); } } + + if(isCompressedUnicode()) { + if((getType() != DataType.TEXT) && + (getType() != DataType.MEMO)) { + throw new IllegalArgumentException( + "Only textual columns allow unicode compression (text/memo)"); + } + } } /** @@ -949,7 +967,7 @@ public class Column implements Comparable<Column> { throw new IOException("Text is too big for column, max " + maxChars + ", got " + text.length()); } - byte[] encodedData = encodeUncompressedText(text, getFormat()).array(); + byte[] encodedData = encodeTextValue(text).array(); obj = encodedData; break; @@ -972,7 +990,7 @@ public class Column implements Comparable<Column> { // should already be "encoded" break; case MEMO: - obj = encodeUncompressedText(toCharSequence(obj), getFormat()).array(); + obj = encodeTextValue(toCharSequence(obj)).array(); break; default: throw new RuntimeException("unexpected var length, long value type: " + @@ -1052,8 +1070,9 @@ public class Column implements Comparable<Column> { // see if data is compressed. 
the 0xFF, 0xFE sequence indicates that // compression is used (sort of, see algorithm below) boolean isCompressed = ((data.length > 1) && - (data[0] == (byte)0xFF) && - (data[1] == (byte)0xFE)); + (data[0] == TEXT_COMPRESSION_HEADER[0]) && + (data[1] == TEXT_COMPRESSION_HEADER[1])); + if(isCompressed) { Expand expander = new Expand(); @@ -1063,7 +1082,7 @@ public class Column implements Comparable<Column> { // compressed mode) StringBuilder textBuf = new StringBuilder(data.length); // start after two bytes indicating compression use - int dataStart = 2; + int dataStart = TEXT_COMPRESSION_HEADER.length; int dataEnd = dataStart; boolean inCompressedMode = true; while(dataEnd < data.length) { @@ -1136,6 +1155,64 @@ public class Column implements Comparable<Column> { return format.CHARSET.decode(ByteBuffer.wrap(textBytes, startPos, length)); } + /** + * Encodes a text value, possibly compressing. + */ + private ByteBuffer encodeTextValue(CharSequence text) + throws IOException + { + // may only compress if column type allows it + if(isCompressedUnicode()) { + + // for now, only do very simple compression (only compress text which is + // all ascii text) + if(isAsciiCompressible(text)) { + + byte[] encodedChars = new byte[TEXT_COMPRESSION_HEADER.length + + text.length()]; + encodedChars[0] = TEXT_COMPRESSION_HEADER[0]; + encodedChars[1] = TEXT_COMPRESSION_HEADER[1]; + for(int i = 0; i < text.length(); ++i) { + encodedChars[i + TEXT_COMPRESSION_HEADER.length] = + (byte)text.charAt(i); + } + return ByteBuffer.wrap(encodedChars); + } + } + + return encodeUncompressedText(text, getFormat()); + } + + /** + * Returns {@code true} if the given text can be compressed using simple + * ASCII encoding, {@code false} otherwise. 
+ */ + private static boolean isAsciiCompressible(CharSequence text) { + // only attempt to compress > 2 chars (compressing less than 3 chars would + // not result in a space savings due to the 2 byte compression header) + if(text.length() <= TEXT_COMPRESSION_HEADER.length) { + return false; + } + // now, see if it is all printable ASCII + for(int i = 0; i < text.length(); ++i) { + char c = text.charAt(i); + if(!isAsciiCrLfOrTab(c)) { + return false; + } + } + return true; + } + + /** + * Returns true if the character is ASCII, but not a control other than + * CR, LF and TAB + */ + private static boolean isAsciiCrLfOrTab(int ch) + { + return ((ch >= 0x20 && ch <= 0x7F) // ASCII (non control) + || ch == 0x09 || ch == 0x0A || ch == 0x0D); // CR/LF or TAB + } + @Override public String toString() { StringBuilder rtn = new StringBuilder(); diff --git a/src/java/com/healthmarketscience/jackcess/ColumnBuilder.java b/src/java/com/healthmarketscience/jackcess/ColumnBuilder.java index b2ec988..97fca0a 100644 --- a/src/java/com/healthmarketscience/jackcess/ColumnBuilder.java +++ b/src/java/com/healthmarketscience/jackcess/ColumnBuilder.java @@ -48,6 +48,8 @@ public class ColumnBuilder { private Integer _scale; /** whether or not the column is auto-number */ private boolean _autoNumber; + /** whether or not the column allows compressed unicode */ + private Boolean _compressedUnicode; public ColumnBuilder(String name) { this(name, null); @@ -130,6 +132,14 @@ public class ColumnBuilder { } /** + * Sets whether or not the new column allows unicode compression. + */ + public ColumnBuilder setCompressedUnicode(boolean compressedUnicode) { + _compressedUnicode = compressedUnicode; + return this; + } + + /** * Sets all attributes except name from the given Column template. 
*/ public ColumnBuilder setFromColumn(Column template) { @@ -164,6 +174,9 @@ public class ColumnBuilder { if(_autoNumber) { col.setAutoNumber(true); } + if(_compressedUnicode != null) { + col.setCompressedUnicode(_compressedUnicode); + } return col; } diff --git a/test/src/java/com/healthmarketscience/jackcess/TableTest.java b/test/src/java/com/healthmarketscience/jackcess/TableTest.java index 85b6bf7..de8de5c 100644 --- a/test/src/java/com/healthmarketscience/jackcess/TableTest.java +++ b/test/src/java/com/healthmarketscience/jackcess/TableTest.java @@ -27,8 +27,10 @@ King of Prussia, PA 19406 package com.healthmarketscience.jackcess; +import java.io.IOException; import java.nio.ByteBuffer; import java.util.ArrayList; +import java.util.Arrays; import java.util.List; import junit.framework.TestCase; @@ -37,37 +39,30 @@ import junit.framework.TestCase; * @author Tim McCune */ public class TableTest extends TestCase { + + private final PageChannel _pageChannel = new PageChannel(true); + private List<Column> _columns = new ArrayList<Column>(); + private Table _testTable; public TableTest(String name) { super(name); } public void testCreateRow() throws Exception { - final JetFormat format = JetFormat.VERSION_4; - final PageChannel pageChannel = new PageChannel(true); - List<Column> columns = new ArrayList<Column>(); - Column col = newTestColumn(pageChannel); + Column col = newTestColumn(); col.setType(DataType.INT); - columns.add(col); - col = newTestColumn(pageChannel); + _columns.add(col); + col = newTestColumn(); col.setType(DataType.TEXT); - columns.add(col); - col = newTestColumn(pageChannel); + _columns.add(col); + col = newTestColumn(); col.setType(DataType.TEXT); - columns.add(col); - Table table = new Table(true, columns) { - @Override - public PageChannel getPageChannel() { - return pageChannel; - } - }; - int colCount = 3; - Object[] row = new Object[colCount]; - row[0] = new Short((short) 9); - row[1] = "Tim"; - row[2] = "McCune"; - ByteBuffer buffer = 
table.createRow(row, format.MAX_ROW_SIZE, - pageChannel.createPageBuffer()); + _columns.add(col); + newTestTable(); + + int colCount = _columns.size(); + ByteBuffer buffer = createRow(9, "Tim", "McCune"); + assertEquals((short) colCount, buffer.getShort()); assertEquals((short) 9, buffer.getShort()); assertEquals((byte) 'T', buffer.get()); @@ -78,17 +73,107 @@ public class TableTest extends TestCase { assertEquals((byte) 7, buffer.get(30)); } - private static Column newTestColumn(final PageChannel pageChannel) { - return new Column(true) { + public void testUnicodeCompression() throws Exception { + Column col = newTestColumn(); + col = newTestColumn(); + col.setType(DataType.TEXT); + _columns.add(col); + col = newTestColumn(); + col.setType(DataType.MEMO); + _columns.add(col); + newTestTable(); + + String small = "this is a string"; + String smallNotAscii = "this is a string\0"; + String large = DatabaseTest.createString(30); + String largeNotAscii = large + "\0"; + + ByteBuffer[] buf1 = encodeColumns(small, large); + ByteBuffer[] buf2 = encodeColumns(smallNotAscii, largeNotAscii); + + for(Column tmp : _columns) { + tmp.setCompressedUnicode(true); + } + + ByteBuffer[] bufCmp1 = encodeColumns(small, large); + ByteBuffer[] bufCmp2 = encodeColumns(smallNotAscii, largeNotAscii); + + assertEquals(buf1[0].remaining(), + (bufCmp1[0].remaining() + small.length() - 2)); + assertEquals(buf1[1].remaining(), + (bufCmp1[1].remaining() + large.length() - 2)); + + for(int i = 0; i < buf2.length; ++i) { + assertTrue(Arrays.equals(toBytes(buf2[i]), toBytes(bufCmp2[i]))); + } + + assertEquals(Arrays.asList(small, large), + Arrays.asList(decodeColumns(bufCmp1))); + assertEquals(Arrays.asList(smallNotAscii, largeNotAscii), + Arrays.asList(decodeColumns(bufCmp2))); + + } + + private ByteBuffer createRow(Object... 
row) + throws IOException + { + return _testTable.createRow( + row, _testTable.getFormat().MAX_ROW_SIZE, + _testTable.getPageChannel().createPageBuffer()); + } + + private ByteBuffer[] encodeColumns(Object... row) + throws IOException + { + ByteBuffer[] result = new ByteBuffer[_columns.size()]; + for(int i = 0; i < _columns.size(); ++i) { + Column col = _columns.get(i); + result[i] = col.write(row[i], _testTable.getFormat().MAX_ROW_SIZE); + } + return result; + } + + private Object[] decodeColumns(ByteBuffer[] buffers) + throws IOException + { + Object[] result = new Object[_columns.size()]; + for(int i = 0; i < _columns.size(); ++i) { + Column col = _columns.get(i); + result[i] = col.read(toBytes(buffers[i])); + } + return result; + } + + private static byte[] toBytes(ByteBuffer buffer) { + buffer.rewind(); + byte[] b = new byte[buffer.remaining()]; + buffer.get(b); + return b; + } + + private Table newTestTable() + throws Exception + { + _testTable = new Table(true, _columns) { @Override public PageChannel getPageChannel() { - return pageChannel; + return _pageChannel; } @Override public JetFormat getFormat() { return JetFormat.VERSION_4; } }; + return _testTable; + } + + private Column newTestColumn() { + return new Column(true) { + @Override + public Table getTable() { + return _testTable; + } + }; } } |