- handle more text index entry types (extended charsets, alternate encodings)
* MEDIUM
-- determine when/when not to inline memo/ole data for a row of data depending
- on how much will be written in the current row
- * MEDIUM
- handle row "updates": write row data back to current row, handling overflow
* MEDIUM
- implement index creation
* EASY
- implement table, column, index deletion
* EASY - MEDIUM
-
+- implement more comprehensive unicode compression write support
Share out-of-line long value pages in order to generate more compact
database files.
</action>
+ <action dev="jahlborn" type="add">
+ Add primitive support for writing unicode compressed text columns.
+ </action>
</release>
<release version="1.1.15" date="2008-06-27">
<action dev="jahlborn" type="fix" issue="1998225">
/** mask for the unknown bit */
public static final byte UNKNOWN_FLAG_MASK = (byte)0x02;
-
+
+ /** pattern matching textual guid strings (allows for optional surrounding
+ '{' and '}') */
private static final Pattern GUID_PATTERN = Pattern.compile("\\s*[{]?([\\p{XDigit}]{8})-([\\p{XDigit}]{4})-([\\p{XDigit}]{4})-([\\p{XDigit}]{4})-([\\p{XDigit}]{12})[}]?\\s*");
+ /** header used to indicate unicode text compression */
+ private static final byte[] TEXT_COMPRESSION_HEADER =
+ { (byte)0xFF, (byte)0xFE };
+
/** owning table */
private final Table _table;
/** For text columns, whether or not they are compressed */
return _compressedUnicode;
}
+ public void setCompressedUnicode(boolean newCompressedUnicode) {
+   _compressedUnicode = newCompressedUnicode;
+ }
+
public byte getPrecision() {
return _precision;
}
"Auto number column must be long integer");
}
}
+
+ if(isCompressedUnicode()) {
+ if((getType() != DataType.TEXT) &&
+ (getType() != DataType.MEMO)) {
+ throw new IllegalArgumentException(
+ "Only textual columns allow unicode compression (text/memo)");
+ }
+ }
}
/**
throw new IOException("Text is too big for column, max " + maxChars
+ ", got " + text.length());
}
- byte[] encodedData = encodeUncompressedText(text, getFormat()).array();
+ byte[] encodedData = encodeTextValue(text).array();
obj = encodedData;
break;
// should already be "encoded"
break;
case MEMO:
- obj = encodeUncompressedText(toCharSequence(obj), getFormat()).array();
+ obj = encodeTextValue(toCharSequence(obj)).array();
break;
default:
throw new RuntimeException("unexpected var length, long value type: " +
// see if data is compressed. the 0xFF, 0xFE sequence indicates that
// compression is used (sort of, see algorithm below)
boolean isCompressed = ((data.length > 1) &&
- (data[0] == (byte)0xFF) &&
- (data[1] == (byte)0xFE));
+ (data[0] == TEXT_COMPRESSION_HEADER[0]) &&
+ (data[1] == TEXT_COMPRESSION_HEADER[1]));
+
if(isCompressed) {
Expand expander = new Expand();
// compressed mode)
StringBuilder textBuf = new StringBuilder(data.length);
// start after two bytes indicating compression use
- int dataStart = 2;
+ int dataStart = TEXT_COMPRESSION_HEADER.length;
int dataEnd = dataStart;
boolean inCompressedMode = true;
while(dataEnd < data.length) {
return format.CHARSET.decode(ByteBuffer.wrap(textBytes, startPos, length));
}
+ /**
+ * Encodes a text value, possibly compressing.
+ */
+ private ByteBuffer encodeTextValue(CharSequence text)
+ throws IOException
+ {
+ // may only compress if column type allows it
+ if(isCompressedUnicode()) {
+
+ // for now, only do very simple compression (only compress text which is
+ // entirely ASCII)
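+ // e.g. "abc" is stored as FF FE 61 62 63 (2-byte header + 1 byte/char)
+ // instead of the uncompressed UTF-16LE form 61 00 62 00 63 00 (2 bytes/char)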
+ if(isAsciiCompressible(text)) {
+
+ byte[] encodedChars = new byte[TEXT_COMPRESSION_HEADER.length +
+ text.length()];
+ encodedChars[0] = TEXT_COMPRESSION_HEADER[0];
+ encodedChars[1] = TEXT_COMPRESSION_HEADER[1];
+ for(int i = 0; i < text.length(); ++i) {
+ encodedChars[i + TEXT_COMPRESSION_HEADER.length] =
+ (byte)text.charAt(i);
+ }
+ return ByteBuffer.wrap(encodedChars);
+ }
+ }
+
+ return encodeUncompressedText(text, getFormat());
+ }
+
+ /**
+ * Returns {@code true} if the given text can be compressed using simple
+ * ASCII encoding, {@code false} otherwise.
+ */
+ private static boolean isAsciiCompressible(CharSequence text) {
+ // only attempt to compress > 2 chars (compressing fewer than 3 chars would
+ // not result in any space savings due to the 2 byte compression header)
+ if(text.length() <= TEXT_COMPRESSION_HEADER.length) {
+ return false;
+ }
+ // now, see if it is all printable ASCII
+ for(int i = 0; i < text.length(); ++i) {
+ char c = text.charAt(i);
+ if(!isAsciiCrLfOrTab(c)) {
+ return false;
+ }
+ }
+ return true;
+ }
+
+ /**
+ * Returns {@code true} if the character is non-control ASCII or one of
+ * CR, LF, or TAB, {@code false} otherwise.
+ */
+ private static boolean isAsciiCrLfOrTab(int ch)
+ {
+ return ((ch >= 0x20 && ch <= 0x7F) // ASCII (non control)
+ || ch == 0x09 || ch == 0x0A || ch == 0x0D); // CR/LF or TAB
+ }
+
@Override
public String toString() {
StringBuilder rtn = new StringBuilder();
private Integer _scale;
/** whether or not the column is auto-number */
private boolean _autoNumber;
+ /** whether or not the column allows compressed unicode */
+ private Boolean _compressedUnicode;
public ColumnBuilder(String name) {
this(name, null);
return this;
}
+ /**
+ * Sets whether or not the new column allows unicode compression.
+ */
+ public ColumnBuilder setCompressedUnicode(boolean compressedUnicode) {
+ _compressedUnicode = compressedUnicode;
+ return this;
+ }
+
/**
* Sets all attributes except name from the given Column template.
*/
if(_autoNumber) {
col.setAutoNumber(true);
}
+ if(_compressedUnicode != null) {
+ col.setCompressedUnicode(_compressedUnicode);
+ }
return col;
}
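As a usage sketch (not part of this change; the setType call and the column name are only assumed for illustration), the new builder option above could be wired up like this:

    Column notesCol = new ColumnBuilder("Notes")
      .setType(DataType.MEMO)            // assumed existing builder method
      .setCompressedUnicode(true)        // added by this change
      .toColumn();

Values written to such a column go through the new Column.encodeTextValue(), so all-ASCII text is stored compressed and any other value falls back to the normal uncompressed encoding.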
package com.healthmarketscience.jackcess;
+import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.ArrayList;
+import java.util.Arrays;
import java.util.List;
import junit.framework.TestCase;
* @author Tim McCune
*/
public class TableTest extends TestCase {
+
+ private final PageChannel _pageChannel = new PageChannel(true);
+ private List<Column> _columns = new ArrayList<Column>();
+ private Table _testTable;
public TableTest(String name) {
super(name);
}
public void testCreateRow() throws Exception {
- final JetFormat format = JetFormat.VERSION_4;
- final PageChannel pageChannel = new PageChannel(true);
- List<Column> columns = new ArrayList<Column>();
- Column col = newTestColumn(pageChannel);
+ Column col = newTestColumn();
col.setType(DataType.INT);
- columns.add(col);
- col = newTestColumn(pageChannel);
+ _columns.add(col);
+ col = newTestColumn();
col.setType(DataType.TEXT);
- columns.add(col);
- col = newTestColumn(pageChannel);
+ _columns.add(col);
+ col = newTestColumn();
col.setType(DataType.TEXT);
- columns.add(col);
- Table table = new Table(true, columns) {
- @Override
- public PageChannel getPageChannel() {
- return pageChannel;
- }
- };
- int colCount = 3;
- Object[] row = new Object[colCount];
- row[0] = new Short((short) 9);
- row[1] = "Tim";
- row[2] = "McCune";
- ByteBuffer buffer = table.createRow(row, format.MAX_ROW_SIZE,
- pageChannel.createPageBuffer());
+ _columns.add(col);
+ newTestTable();
+
+ int colCount = _columns.size();
+ ByteBuffer buffer = createRow(9, "Tim", "McCune");
+
assertEquals((short) colCount, buffer.getShort());
assertEquals((short) 9, buffer.getShort());
assertEquals((byte) 'T', buffer.get());
assertEquals((byte) 7, buffer.get(30));
}
- private static Column newTestColumn(final PageChannel pageChannel) {
- return new Column(true) {
+ public void testUnicodeCompression() throws Exception {
+ Column col = newTestColumn();
+ col.setType(DataType.TEXT);
+ _columns.add(col);
+ col = newTestColumn();
+ col.setType(DataType.MEMO);
+ _columns.add(col);
+ newTestTable();
+
+ String small = "this is a string";
+ String smallNotAscii = "this is a string\0";
+ String large = DatabaseTest.createString(30);
+ String largeNotAscii = large + "\0";
+
+ ByteBuffer[] buf1 = encodeColumns(small, large);
+ ByteBuffer[] buf2 = encodeColumns(smallNotAscii, largeNotAscii);
+
+ for(Column tmp : _columns) {
+ tmp.setCompressedUnicode(true);
+ }
+
+ ByteBuffer[] bufCmp1 = encodeColumns(small, large);
+ ByteBuffer[] bufCmp2 = encodeColumns(smallNotAscii, largeNotAscii);
+
+ assertEquals(buf1[0].remaining(),
+ (bufCmp1[0].remaining() + small.length() - 2));
+ assertEquals(buf1[1].remaining(),
+ (bufCmp1[1].remaining() + large.length() - 2));
+
+ for(int i = 0; i < buf2.length; ++i) {
+ assertTrue(Arrays.equals(toBytes(buf2[i]), toBytes(bufCmp2[i])));
+ }
+
+ assertEquals(Arrays.asList(small, large),
+ Arrays.asList(decodeColumns(bufCmp1)));
+ assertEquals(Arrays.asList(smallNotAscii, largeNotAscii),
+ Arrays.asList(decodeColumns(bufCmp2)));
+
+ }
+
+ private ByteBuffer createRow(Object... row)
+ throws IOException
+ {
+ return _testTable.createRow(
+ row, _testTable.getFormat().MAX_ROW_SIZE,
+ _testTable.getPageChannel().createPageBuffer());
+ }
+
+ private ByteBuffer[] encodeColumns(Object... row)
+ throws IOException
+ {
+ ByteBuffer[] result = new ByteBuffer[_columns.size()];
+ for(int i = 0; i < _columns.size(); ++i) {
+ Column col = _columns.get(i);
+ result[i] = col.write(row[i], _testTable.getFormat().MAX_ROW_SIZE);
+ }
+ return result;
+ }
+
+ private Object[] decodeColumns(ByteBuffer[] buffers)
+ throws IOException
+ {
+ Object[] result = new Object[_columns.size()];
+ for(int i = 0; i < _columns.size(); ++i) {
+ Column col = _columns.get(i);
+ result[i] = col.read(toBytes(buffers[i]));
+ }
+ return result;
+ }
+
+ private static byte[] toBytes(ByteBuffer buffer) {
+ buffer.rewind();
+ byte[] b = new byte[buffer.remaining()];
+ buffer.get(b);
+ return b;
+ }
+
+ private Table newTestTable()
+ throws Exception
+ {
+ _testTable = new Table(true, _columns) {
@Override
public PageChannel getPageChannel() {
- return pageChannel;
+ return _pageChannel;
}
@Override
public JetFormat getFormat() {
return JetFormat.VERSION_4;
}
};
+ return _testTable;
+ }
+
+ private Column newTestColumn() {
+ return new Column(true) {
+ @Override
+ public Table getTable() {
+ return _testTable;
+ }
+ };
}
}