From a0c7c8dc6840d10fc1aa3d27881ed76434b15ecb Mon Sep 17 00:00:00 2001 From: James Ahlborn Date: Sat, 15 Nov 2014 04:06:17 +0000 Subject: [PATCH] rework unicode compression support, fixes issue 111 git-svn-id: https://svn.code.sf.net/p/jackcess/code/jackcess/trunk@892 f203690c-595d-4dc9-a70b-905162fa7fd2 --- pom.xml | 4 - src/changes/changes.xml | 6 + .../jackcess/impl/ColumnImpl.java | 141 ++-- .../jackcess/impl/scsu/Compress.java | 628 ------------------ .../jackcess/impl/scsu/Debug.java | 151 ----- .../impl/scsu/EndOfInputException.java | 49 -- .../impl/scsu/EndOfOutputException.java | 48 -- .../jackcess/impl/scsu/Expand.java | 431 ------------ .../impl/scsu/IllegalInputException.java | 48 -- .../jackcess/impl/scsu/SCSU.java | 252 ------- src/test/data/V2003/testUnicodeCompV2003.mdb | Bin 0 -> 294912 bytes .../jackcess/DatabaseTest.java | 31 +- .../jackcess/impl/scsu/CompressMain.java | 574 ---------------- .../jackcess/impl/scsu/CompressTest.java | 47 -- 14 files changed, 99 insertions(+), 2311 deletions(-) delete mode 100644 src/main/java/com/healthmarketscience/jackcess/impl/scsu/Compress.java delete mode 100644 src/main/java/com/healthmarketscience/jackcess/impl/scsu/Debug.java delete mode 100644 src/main/java/com/healthmarketscience/jackcess/impl/scsu/EndOfInputException.java delete mode 100644 src/main/java/com/healthmarketscience/jackcess/impl/scsu/EndOfOutputException.java delete mode 100644 src/main/java/com/healthmarketscience/jackcess/impl/scsu/Expand.java delete mode 100644 src/main/java/com/healthmarketscience/jackcess/impl/scsu/IllegalInputException.java delete mode 100644 src/main/java/com/healthmarketscience/jackcess/impl/scsu/SCSU.java create mode 100644 src/test/data/V2003/testUnicodeCompV2003.mdb delete mode 100644 src/test/java/com/healthmarketscience/jackcess/impl/scsu/CompressMain.java delete mode 100644 src/test/java/com/healthmarketscience/jackcess/impl/scsu/CompressTest.java diff --git a/pom.xml b/pom.xml index 1016d38..e24a0ac 100644 --- a/pom.xml +++ b/pom.xml @@ -160,9 +160,6 @@ cobertura-maven-plugin - - com/healthmarketscience/jackcess/impl/scsu/** - @@ -269,7 +266,6 @@ http://docs.oracle.com/javaee/5/api/ 1.5 - com.healthmarketscience.jackcess.impl.scsu public ${basedir}/src/site/javadoc/stylesheet.css diff --git a/src/changes/changes.xml b/src/changes/changes.xml index 37d98a9..22ab6ae 100644 --- a/src/changes/changes.xml +++ b/src/changes/changes.xml @@ -4,6 +4,12 @@ Tim McCune + + + Unicode compression support was not correct for all possibly + compressed characters. + + IndexCursor can early exit when searching based on indexed values. diff --git a/src/main/java/com/healthmarketscience/jackcess/impl/ColumnImpl.java b/src/main/java/com/healthmarketscience/jackcess/impl/ColumnImpl.java index 5e3fe88..224348a 100644 --- a/src/main/java/com/healthmarketscience/jackcess/impl/ColumnImpl.java +++ b/src/main/java/com/healthmarketscience/jackcess/impl/ColumnImpl.java @@ -60,10 +60,6 @@ import com.healthmarketscience.jackcess.complex.ComplexColumnInfo; import com.healthmarketscience.jackcess.complex.ComplexValue; import com.healthmarketscience.jackcess.complex.ComplexValueForeignKey; import com.healthmarketscience.jackcess.impl.complex.ComplexValueForeignKeyImpl; -import com.healthmarketscience.jackcess.impl.scsu.Compress; -import com.healthmarketscience.jackcess.impl.scsu.EndOfInputException; -import com.healthmarketscience.jackcess.impl.scsu.Expand; -import com.healthmarketscience.jackcess.impl.scsu.IllegalInputException; import com.healthmarketscience.jackcess.util.ColumnValidator; import com.healthmarketscience.jackcess.util.SimpleColumnValidator; import org.apache.commons.lang.builder.ToStringBuilder; @@ -163,6 +159,8 @@ public class ColumnImpl implements Column, Comparable { /** header used to indicate unicode text compression */ private static final byte[] TEXT_COMPRESSION_HEADER = { (byte)0xFF, (byte)0XFE }; + private static final char MIN_COMPRESS_CHAR = 1; + private static final char MAX_COMPRESS_CHAR = 0xFF; /** owning table */ @@ -1110,57 +1108,44 @@ public class ColumnImpl implements Column, Comparable { String decodeTextValue(byte[] data) throws IOException { - try { - - // see if data is compressed. the 0xFF, 0xFE sequence indicates that - // compression is used (sort of, see algorithm below) - boolean isCompressed = ((data.length > 1) && - (data[0] == TEXT_COMPRESSION_HEADER[0]) && - (data[1] == TEXT_COMPRESSION_HEADER[1])); - - if(isCompressed) { + // see if data is compressed. the 0xFF, 0xFE sequence indicates that + // compression is used (sort of, see algorithm below) + boolean isCompressed = ((data.length > 1) && + (data[0] == TEXT_COMPRESSION_HEADER[0]) && + (data[1] == TEXT_COMPRESSION_HEADER[1])); - Expand expander = new Expand(); + if(isCompressed) { - // this is a whacky compression combo that switches back and forth - // between compressed/uncompressed using a 0x00 byte (starting in - // compressed mode) - StringBuilder textBuf = new StringBuilder(data.length); - // start after two bytes indicating compression use - int dataStart = TEXT_COMPRESSION_HEADER.length; - int dataEnd = dataStart; - boolean inCompressedMode = true; - while(dataEnd < data.length) { - if(data[dataEnd] == (byte)0x00) { - - // handle current segment - decodeTextSegment(data, dataStart, dataEnd, inCompressedMode, - expander, textBuf); - inCompressedMode = !inCompressedMode; - ++dataEnd; - dataStart = dataEnd; + // this is a whacky compression combo that switches back and forth + // between compressed/uncompressed using a 0x00 byte (starting in + // compressed mode) + StringBuilder textBuf = new StringBuilder(data.length); + // start after two bytes indicating compression use + int dataStart = TEXT_COMPRESSION_HEADER.length; + int dataEnd = dataStart; + boolean inCompressedMode = true; + while(dataEnd < data.length) { + if(data[dataEnd] == (byte)0x00) { + + // handle current segment + decodeTextSegment(data, dataStart, dataEnd, inCompressedMode, + textBuf); + inCompressedMode = !inCompressedMode; + ++dataEnd; + dataStart = dataEnd; - } else { - ++dataEnd; - } + } else { + ++dataEnd; } - // handle last segment - decodeTextSegment(data, dataStart, dataEnd, inCompressedMode, - expander, textBuf); + } + // handle last segment + decodeTextSegment(data, dataStart, dataEnd, inCompressedMode, textBuf); - return textBuf.toString(); + return textBuf.toString(); - } - - return decodeUncompressedText(data, getCharset()); - - } catch (IllegalInputException e) { - throw (IOException) - new IOException("Can't expand text column").initCause(e); - } catch (EndOfInputException e) { - throw (IOException) - new IOException("Can't expand text column").initCause(e); } + + return decodeUncompressedText(data, getCharset()); } /** @@ -1168,25 +1153,29 @@ public class ColumnImpl implements Column, Comparable { * given status of the segment (compressed/uncompressed). */ private void decodeTextSegment(byte[] data, int dataStart, int dataEnd, - boolean inCompressedMode, Expand expander, + boolean inCompressedMode, StringBuilder textBuf) - throws IllegalInputException, EndOfInputException { if(dataEnd <= dataStart) { // no data return; } int dataLength = dataEnd - dataStart; + if(inCompressedMode) { - // handle compressed data - byte[] tmpData = ByteUtil.copyOf(data, dataStart, dataLength); - expander.reset(); - textBuf.append(expander.expand(tmpData)); - } else { - // handle uncompressed data - textBuf.append(decodeUncompressedText(data, dataStart, dataLength, - getCharset())); + byte[] tmpData = new byte[dataLength * 2]; + int tmpIdx = 0; + for(int i = dataStart; i < dataEnd; ++i) { + tmpData[tmpIdx] = data[i]; + tmpIdx += 2; + } + data = tmpData; + dataStart = 0; + dataLength = data.length; } + + textBuf.append(decodeUncompressedText(data, dataStart, dataLength, + getCharset())); } /** @@ -1215,41 +1204,37 @@ public class ColumnImpl implements Column, Comparable { // may only compress if column type allows it if(!forceUncompressed && isCompressedUnicode() && - (text.length() <= getFormat().MAX_COMPRESSED_UNICODE_SIZE)) { - - // for now, only do very simple compression (only compress text which is - // all ascii text) - if(isAsciiCompressible(text)) { - - byte[] encodedChars = new byte[TEXT_COMPRESSION_HEADER.length + - text.length()]; - encodedChars[0] = TEXT_COMPRESSION_HEADER[0]; - encodedChars[1] = TEXT_COMPRESSION_HEADER[1]; - for(int i = 0; i < text.length(); ++i) { - encodedChars[i + TEXT_COMPRESSION_HEADER.length] = - (byte)text.charAt(i); - } - return ByteBuffer.wrap(encodedChars); + (text.length() <= getFormat().MAX_COMPRESSED_UNICODE_SIZE) && + isUnicodeCompressible(text)) { + + byte[] encodedChars = new byte[TEXT_COMPRESSION_HEADER.length + + text.length()]; + encodedChars[0] = TEXT_COMPRESSION_HEADER[0]; + encodedChars[1] = TEXT_COMPRESSION_HEADER[1]; + for(int i = 0; i < text.length(); ++i) { + encodedChars[i + TEXT_COMPRESSION_HEADER.length] = + (byte)text.charAt(i); } + return ByteBuffer.wrap(encodedChars); } return encodeUncompressedText(text, getCharset()); } /** - * Returns {@code true} if the given text can be compressed using simple - * ASCII encoding, {@code false} otherwise. + * Returns {@code true} if the given text can be compressed using compressed + * unicode, {@code false} otherwise. */ - private static boolean isAsciiCompressible(CharSequence text) { + private static boolean isUnicodeCompressible(CharSequence text) { // only attempt to compress > 2 chars (compressing less than 3 chars would // not result in a space savings due to the 2 byte compression header) if(text.length() <= TEXT_COMPRESSION_HEADER.length) { return false; } - // now, see if it is all printable ASCII + // now, see if it is all compressible characters for(int i = 0; i < text.length(); ++i) { char c = text.charAt(i); - if(!Compress.isAsciiCrLfOrTab(c)) { + if((c < MIN_COMPRESS_CHAR) || (c > MAX_COMPRESS_CHAR)) { return false; } } diff --git a/src/main/java/com/healthmarketscience/jackcess/impl/scsu/Compress.java b/src/main/java/com/healthmarketscience/jackcess/impl/scsu/Compress.java deleted file mode 100644 index 9428075..0000000 --- a/src/main/java/com/healthmarketscience/jackcess/impl/scsu/Compress.java +++ /dev/null @@ -1,628 +0,0 @@ -package com.healthmarketscience.jackcess.impl.scsu; - -/** - * This sample software accompanies Unicode Technical Report #6 and - * distributed as is by Unicode, Inc., subject to the following: - * - * Copyright 1996-1997 Unicode, Inc.. All Rights Reserved. - * - * Permission to use, copy, modify, and distribute this software - * without fee is hereby granted provided that this copyright notice - * appears in all copies. - * - * UNICODE, INC. MAKES NO REPRESENTATIONS OR WARRANTIES ABOUT THE - * SUITABILITY OF THE SOFTWARE, EITHER EXPRESS OR IMPLIED, INCLUDING - * BUT NOT LIMITED TO THE IMPLIED WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT. - * UNICODE, INC., SHALL NOT BE LIABLE FOR ANY ERRORS OR OMISSIONS, AND - * SHALL NOT BE LIABLE FOR ANY DAMAGES, INCLUDING CONSEQUENTIAL AND - * INCIDENTAL DAMAGES, SUFFERED BY YOU AS A RESULT OF USING, MODIFYING - * OR DISTRIBUTING THIS SOFTWARE OR ITS DERIVATIVES. - * - * @author Asmus Freytag - * - * @version 001 Dec 25 1996 - * @version 002 Jun 25 1997 - * @version 003 Jul 25 1997 - * @version 004 Aug 25 1997 - * - * Unicode and the Unicode logo are trademarks of Unicode, Inc., - * and are registered in some jurisdictions. - **/ - -/** - This class implements a simple compression algorithm - **/ -/* - Note on exception handling - This compressor is designed so that it can be restarted after - an exception. All operations advancing input and/or output cursor - (iIn and iOut) either complete an action, or set a state (fUnicodeMode) - before updating the cursors. -*/ -public class Compress extends SCSU -{ - - /** next input character to be read **/ - private int iIn; - - /** next output byte to be written **/ - private int iOut; - - /** start index of Unicode mode in output array, or -1 if in single byte mode **/ - private int iSCU = -1; - - /** true if the next command byte is of the Uxx family */ - private boolean fUnicodeMode = false; - - /** locate a window for a character given a table of offsets - @param ch - character - @param offsetTable - table of window offsets - @return true if the character fits a window from the table of windows */ - private boolean locateWindow(int ch, int[] offsetTable) - { - // always try the current window first - int iWin = getCurrentWindow(); - - // if the character fits the current window - // just use the current window - if (iWin != - 1 && ch >= offsetTable[iWin] && ch < offsetTable[iWin] + 0x80) - { - return true; - } - - // try all windows in order - for (iWin = 0; iWin < offsetTable.length; iWin++) - { - if (ch >= offsetTable[iWin] && ch < offsetTable[iWin] + 0x80) - { - selectWindow(iWin); - return true; - } - } - // none found - return false; - } - - /** returns true if the character is ASCII, but not a control other than CR, LF and TAB */ - public static boolean isAsciiCrLfOrTab(int ch) - { - return (ch >= 0x20 && ch <= 0x7F) // ASCII - || ch == 0x09 || ch == 0x0A || ch == 0x0D; // CR/LF or TAB - - } - - /** output a run of characters in single byte mode - In single byte mode pass through characters in the ASCII range, but - quote characters overlapping with compression command codes. Runs - of characters fitting the current window are output as runs of bytes - in the range 0x80-0xFF. Checks for and validates Surrogate Pairs. - Uses and updates the current input and output cursors store in - the instance variables iIn and iOut. - @param in - input character array - @param out - output byte array - @return the next chaacter to be processed. This may be an extended character. - **/ - @SuppressWarnings("fallthrough") - public int outputSingleByteRun(char [] in, byte [] out) - throws EndOfOutputException, EndOfInputException, IllegalInputException - { - int iWin = getCurrentWindow(); - while(iIn < in.length) - { - int outlen = 0; - byte byte1 = 0; - byte byte2 = 0; - - // get the input character - int ch = in[iIn]; - - int inlen = 1; - - // Check input for Surrogate pair - if ( (ch & 0xF800) == 0xD800 ) - { - if ( (ch & 0xFC00) == 0xDC00 ) - { - // low surrogate out of order - throw new IllegalInputException("Unpaired low surrogate: "+iIn); - } - else - { - // have high surrogate now get low surrogate - if ( iIn >= in.length-1) - { - // premature end of input - throw new EndOfInputException(); - } - // get the char - int ch2 = in[iIn+1]; - - // make sure it's a low surrogate - if ( (ch2 & 0xFC00) != 0xDC00 ) - { - // a low surrogate was required - throw new IllegalInputException("Unpaired high surrogate: "+(iIn+1)); - } - - // combine the two values - ch = ((ch - 0xD800)<<10 | (ch2-0xDC00))+0x10000; - // ch = ch<<10 + ch2 - 0x36F0000; - - inlen = 2; - } - } - - // ASCII Letter, NUL, CR, LF and TAB are always passed through - if (isAsciiCrLfOrTab(ch) || ch == 0) - { - // pass through directcly - byte2 = (byte)(ch & 0x7F); - outlen = 1; - } - - // All other control codes must be quoted - else if (ch < 0x20) - { - byte1 = SQ0; - byte2 = (byte)(ch); - outlen = 2; - } - - // Letters that fit the current dynamic window - else if (ch >= dynamicOffset[iWin] && ch < dynamicOffset[iWin] + 0x80) - { - ch -= dynamicOffset[iWin]; - byte2 = (byte)(ch | 0x80); - outlen = 1; - } - - // check for room in the output array - if (iOut + outlen >= out.length) - { - throw new EndOfOutputException(); - } - - switch(outlen) - { - default: - // need to use some other compression mode for this - // character so we terminate this loop - - return ch; // input not finished - - // output the characters - case 2: - out[iOut++] = byte1; - // fall through - case 1: - out[iOut++] = byte2; - break; - } - // advance input pointer - iIn += inlen; - } - return 0; // input all used up - } - - /** quote a single character in single byte mode - Quoting a character (aka 'non-locking shift') gives efficient access - to characters that occur in isolation--usually punctuation characters. - When quoting a character from a dynamic window use 0x80 - 0xFF, when - quoting a character from a static window use 0x00-0x7f. - @param ch - character to be quoted - @param out - output byte array - **/ - - private void quoteSingleByte(int ch, byte [] out) - throws EndOfOutputException - { - Debug.out("Quoting SingleByte ", ch); - int iWin = getCurrentWindow(); - - // check for room in the output array - if (iOut >= out.length -2) - { - throw new EndOfOutputException(); - } - - // Output command byte followed by - out[iOut++] = (byte)(SQ0 + iWin); - - // Letter that fits the current dynamic window - if (ch >= dynamicOffset[iWin] && ch < dynamicOffset[iWin] + 0x80) - { - ch -= dynamicOffset[iWin]; - out[iOut++] = (byte)(ch | 0x80); - } - - // Letter that fits the current static window - else if (ch >= staticOffset[iWin] && ch < staticOffset[iWin] + 0x80) - { - ch -= staticOffset[iWin]; - out[iOut++] = (byte)ch; - } - else - { - throw new IllegalStateException("ch = "+ch+" not valid in quoteSingleByte. Internal Compressor Error"); - } - // advance input pointer - iIn ++; - Debug.out("New input: ", iIn); - } - - /** output a run of characters in Unicode mode - A run of Unicode mode consists of characters which are all in the - range of non-compressible characters or isolated occurrence - of any other characters. Characters in the range 0xE00-0xF2FF must - be quoted to avoid overlap with the Unicode mode compression command codes. - Uses and updates the current input and output cursors store in - the instance variables iIn and iOut. - NOTE: Characters from surrogate pairs are passed through and unlike single - byte mode no checks are made for unpaired surrogate characters. - @param in - input character array - @param out - output byte array - @return the next input character to be processed - **/ - public char outputUnicodeRun(char [] in, byte [] out) - throws EndOfOutputException - { - // current character - char ch = 0; - - while(iIn < in.length) - { - // get current input and set default output length - ch = in[iIn]; - int outlen = 2; - - // Characters in these ranges could potentially be compressed. - // We require 2 or more compressible characters to break the run - if (isCompressible(ch)) - { - // check whether we can look ahead - if( iIn < in.length - 1) - { - // DEBUG - Debug.out("is-comp: ",ch); - char ch2 = in[iIn + 1]; - if (isCompressible(ch2)) - { - // at least 2 characters are compressible - // break the run - break; - } - //DEBUG - Debug.out("no-comp: ",ch2); - } - // If we get here, the current character is only character - // left in the input or it is followed by a non-compressible - // character. In neither case do we gain by breaking the - // run, so we proceed to output the character. - if (ch >= 0xE000 && ch <= 0xF2FF) - { - // Characters in this range need to be escaped - outlen = 3; - } - - } - // check that there is enough room to output the character - if(iOut >= out.length - outlen) - { - // DEBUG - Debug.out("End of Output @", iOut); - // if we got here, we ran out of space in the output array - throw new EndOfOutputException(); - } - - // output any characters that cannot be compressed, - if (outlen == 3) - { - // output the quote character - out[iOut++] = UQU; - } - // pass the Unicode character in MSB,LSB order - out[iOut++] = (byte)(ch >>> 8); - out[iOut++] = (byte)(ch & 0xFF); - - // advance input cursor - iIn++; - } - - // return the last character - return ch; - } - - static int iNextWindow = 3; - - /** redefine a window so it surrounds a given character value - For now, this function uses window 3 exclusively (window 4 - for extended windows); - @return true if a window was successfully defined - @param ch - character around which window is positioned - @param out - output byte array - @param fCurUnicodeMode - type of window - **/ - private boolean positionWindow(int ch, byte [] out, boolean fCurUnicodeMode) - throws IllegalInputException, EndOfOutputException - { - int iWin = iNextWindow % 8; // simple LRU - int iPosition = 0; - - // iPosition 0 is a reserved value - if (ch < 0x80) - { - throw new IllegalStateException("ch < 0x80"); - //return false; - } - - // Check the fixed offsets - for (int i = 0; i < fixedOffset.length; i++) - { - if (ch >= fixedOffset[i] && ch < fixedOffset[i] + 0x80) - { - iPosition = i; - break; - } - } - - if (iPosition != 0) - { - // DEBUG - Debug.out("FIXED position is ", iPosition + 0xF9); - - // ch fits in a fixed offset window position - dynamicOffset[iWin] = fixedOffset[iPosition]; - iPosition += 0xF9; - } - else if (ch < 0x3400) - { - // calculate a window position command and set the offset - iPosition = ch >>> 7; - dynamicOffset[iWin] = ch & 0xFF80; - - Debug.out("Offset="+dynamicOffset[iWin]+", iPosition="+iPosition+" for char", ch); - } - else if (ch < 0xE000) - { - // attempt to place a window where none can go - return false; - } - else if (ch <= 0xFFFF) - { - // calculate a window position command, accounting - // for the gap in position values, and set the offset - iPosition = ((ch - gapOffset)>>> 7); - - dynamicOffset[iWin] = ch & 0xFF80; - - Debug.out("Offset="+dynamicOffset[iWin]+", iPosition="+iPosition+" for char", ch); - } - else - { - // if we get here, the character is in the extended range. - // Always use Window 4 to define an extended window - - iPosition = (ch - 0x10000) >>> 7; - // DEBUG - Debug.out("Try position Window at ", iPosition); - - iPosition |= iWin << 13; - dynamicOffset[iWin] = ch & 0x1FFF80; - } - - // Outputting window defintion command for the general cases - if ( iPosition < 0x100 && iOut < out.length-1) - { - out[iOut++] = (byte) ((fCurUnicodeMode ? UD0 : SD0) + iWin); - out[iOut++] = (byte) (iPosition & 0xFF); - } - // Output an extended window definiton command - else if ( iPosition >= 0x100 && iOut < out.length - 2) - { - - Debug.out("Setting extended window at ", iPosition); - out[iOut++] = (fCurUnicodeMode ? UDX : SDX); - out[iOut++] = (byte) ((iPosition >>> 8) & 0xFF); - out[iOut++] = (byte) (iPosition & 0xFF); - } - else - { - throw new EndOfOutputException(); - } - selectWindow(iWin); - iNextWindow++; - return true; - } - - /** - compress a Unicode character array with some simplifying assumptions - **/ - public int simpleCompress(char [] in, int iStartIn, byte[] out, int iStartOut) - throws IllegalInputException, EndOfInputException, EndOfOutputException - { - iIn = iStartIn; - iOut = iStartOut; - - - while (iIn < in.length) - { - int ch; - - // previously we switched to a Unicode run - if (iSCU != -1) - { - - Debug.out("Remaining", in, iIn); - Debug.out("Output until ["+iOut+"]: ", out); - - // output characters as Unicode - ch = outputUnicodeRun(in, out); - - // for single character Unicode runs (3 bytes) use quote - if (iOut - iSCU == 3 ) - { - // go back and fix up the SCU to an SQU instead - out[iSCU] = SQU; - iSCU = -1; - continue; - } - else - { - iSCU = -1; - fUnicodeMode = true; - } - } - // next, try to output characters as single byte run - else - { - ch = outputSingleByteRun(in, out); - } - - // check whether we still have input - if (iIn == in.length) - { - break; // no more input - } - - // if we get here, we have a consistent value for ch, whether or - // not it is an regular or extended character. Locate or define a - // Window for the current character - - Debug.out("Output so far: ", out); - Debug.out("Routing ch="+ch+" for Input", in, iIn); - - // Check that we have enough room to output the command byte - if (iOut >= out.length - 1) - { - throw new EndOfOutputException(); - } - - // In order to switch away from Unicode mode, it is necessary - // to select (or define) a window. If the characters that follow - // the Unicode range are ASCII characters, we can't use them - // to decide which window to select, since ASCII characters don't - // influence window settings. This loop looks ahead until it finds - // one compressible character that isn't in the ASCII range. - for (int ich = iIn; ch < 0x80; ich++) - { - if (ich == in.length || !isCompressible(in[ich])) - { - // if there are only ASCII characters left, - ch = in[iIn]; - break; - } - ch = in[ich]; // lookahead for next non-ASCII char - } - // The character value contained in ch here will only be used to select - // output modes. Actual output of characters starts with in[iIn] and - // only takes place near the top of the loop. - - int iprevWindow = getCurrentWindow(); - - // try to locate a dynamic window - if (ch < 0x80 || locateWindow(ch, dynamicOffset)) - { - Debug.out("located dynamic window "+getCurrentWindow()+" at ", iOut+1); - // lookahead to use SQn instead of SCn for single - // character interruptions of runs in current window - if(!fUnicodeMode && iIn < in.length -1) - { - char ch2 = in[iIn+1]; - if (ch2 >= dynamicOffset[iprevWindow] && - ch2 < dynamicOffset[iprevWindow] + 0x80) - { - quoteSingleByte(ch, out); - selectWindow(iprevWindow); - continue; - } - } - - out[iOut++] = (byte)((fUnicodeMode ? UC0 : SC0) + getCurrentWindow()); - fUnicodeMode = false; - } - // try to locate a static window - else if (!fUnicodeMode && locateWindow(ch, staticOffset)) - { - // static windows are not accessible from Unicode mode - Debug.out("located a static window", getCurrentWindow()); - quoteSingleByte(ch, out); - selectWindow(iprevWindow); // restore current Window settings - continue; - } - // try to define a window around ch - else if (positionWindow(ch, out, fUnicodeMode) ) - { - fUnicodeMode = false; - } - // If all else fails, start a Unicode run - else - { - iSCU = iOut; - out[iOut++] = SCU; - continue; - } - } - - return iOut - iStartOut; - } - - public byte[] compress(String inStr) - throws IllegalInputException, EndOfInputException - { - // Running out of room for output can cause non-optimal - // compression. In order to not slow down compression too - // much, not all intermediate state is constantly saved. - - byte [] out = new byte[inStr.length() * 2]; - char [] in = inStr.toCharArray(); - //DEBUG - Debug.out("compress input: ",in); - reset(); - while(true) - { - try - { - simpleCompress(in, charsRead(), out, bytesWritten()); - // if we get here things went fine. - break; - } - catch (EndOfOutputException e) - { - // create a larger output buffer and continue - byte [] largerOut = new byte[out.length * 2]; - System.arraycopy(out, 0, largerOut, 0, out.length); - out = largerOut; - } - } - byte [] trimmedOut = new byte[bytesWritten()]; - System.arraycopy(out, 0, trimmedOut, 0, trimmedOut.length); - out = trimmedOut; - - Debug.out("compress output: ", out); - return out; - } - - /** reset is only needed to bail out after an exception and - restart with new input */ - @Override - public void reset() - { - super.reset(); - fUnicodeMode = false; - iSCU = - 1; - } - - /** returns the number of bytes written **/ - public int bytesWritten() - { - return iOut; - } - - /** returns the number of bytes written **/ - public int charsRead() - { - return iIn; - } - -} diff --git a/src/main/java/com/healthmarketscience/jackcess/impl/scsu/Debug.java b/src/main/java/com/healthmarketscience/jackcess/impl/scsu/Debug.java deleted file mode 100644 index c973765..0000000 --- a/src/main/java/com/healthmarketscience/jackcess/impl/scsu/Debug.java +++ /dev/null @@ -1,151 +0,0 @@ -package com.healthmarketscience.jackcess.impl.scsu; - -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; - -/* - * This sample software accompanies Unicode Technical Report #6 and - * distributed as is by Unicode, Inc., subject to the following: - * - * Copyright 1996-1997 Unicode, Inc.. All Rights Reserved. - * - * Permission to use, copy, modify, and distribute this software - * without fee is hereby granted provided that this copyright notice - * appears in all copies. - * - * UNICODE, INC. MAKES NO REPRESENTATIONS OR WARRANTIES ABOUT THE - * SUITABILITY OF THE SOFTWARE, EITHER EXPRESS OR IMPLIED, INCLUDING - * BUT NOT LIMITED TO THE IMPLIED WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT. - * UNICODE, INC., SHALL NOT BE LIABLE FOR ANY ERRORS OR OMISSIONS, AND - * SHALL NOT BE LIABLE FOR ANY DAMAGES, INCLUDING CONSEQUENTIAL AND - * INCIDENTAL DAMAGES, SUFFERED BY YOU AS A RESULT OF USING, MODIFYING - * OR DISTRIBUTING THIS SOFTWARE OR ITS DERIVATIVES. - * - * @author Asmus Freytag - * - * @version 001 Dec 25 1996 - * @version 002 Jun 25 1997 - * @version 003 Jul 25 1997 - * @version 004 Aug 25 1997 - * - * Unicode and the Unicode logo are trademarks of Unicode, Inc., - * and are registered in some jurisdictions. - **/ - -/** - * A number of helpful output routines for debugging. Output can be - * centrally enabled or disabled by calling Debug.set(true/false); - * All methods are statics; - */ - -public class Debug -{ - - private static final Log LOG = LogFactory.getLog(Debug.class); - - // debugging helper - public static void out(char [] chars) - { - out(chars, 0); - } - - public static void out(char [] chars, int iStart) - { - if (!LOG.isDebugEnabled()) return; - StringBuilder msg = new StringBuilder(); - - for (int i = iStart; i < chars.length; i++) - { - if (chars[i] >= 0 && chars[i] <= 26) - { - msg.append("^"+(char)(chars[i]+0x40)); - } - else if (chars[i] <= 255) - { - msg.append(chars[i]); - } - else - { - msg.append("\\u"+Integer.toString(chars[i],16)); - } - } - LOG.debug(msg.toString()); - } - - public static void out(byte [] bytes) - { - out(bytes, 0); - } - public static void out(byte [] bytes, int iStart) - { - if (!LOG.isDebugEnabled()) return; - StringBuilder msg = new StringBuilder(); - - for (int i = iStart; i < bytes.length; i++) - { - msg.append(bytes[i]+","); - } - LOG.debug(msg.toString()); - } - - public static void out(String str) - { - if (!LOG.isDebugEnabled()) return; - - LOG.debug(str); - } - - public static void out(String msg, int iData) - { - if (!LOG.isDebugEnabled()) return; - - LOG.debug(msg + iData); - } - public static void out(String msg, char ch) - { - if (!LOG.isDebugEnabled()) return; - - LOG.debug(msg + "[U+"+Integer.toString(ch,16)+"]" + ch); - } - public static void out(String msg, byte bData) - { - if (!LOG.isDebugEnabled()) return; - - LOG.debug(msg + bData); - } - public static void out(String msg, String str) - { - if (!LOG.isDebugEnabled()) return; - - LOG.debug(msg + str); - } - public static void out(String msg, char [] data) - { - if (!LOG.isDebugEnabled()) return; - - LOG.debug(msg); - out(data); - } - public static void out(String msg, byte [] data) - { - if (!LOG.isDebugEnabled()) return; - - LOG.debug(msg); - out(data); - } - public static void out(String msg, char [] data, int iStart) - { - if (!LOG.isDebugEnabled()) return; - - LOG.debug(msg +"("+iStart+"): "); - out(data, iStart); - } - public static void out(String msg, byte [] data, int iStart) - { - if (!LOG.isDebugEnabled()) return; - - LOG.debug(msg+"("+iStart+"): "); - out(data, iStart); - } -} diff --git a/src/main/java/com/healthmarketscience/jackcess/impl/scsu/EndOfInputException.java b/src/main/java/com/healthmarketscience/jackcess/impl/scsu/EndOfInputException.java deleted file mode 100644 index b3148a7..0000000 --- a/src/main/java/com/healthmarketscience/jackcess/impl/scsu/EndOfInputException.java +++ /dev/null @@ -1,49 +0,0 @@ -package com.healthmarketscience.jackcess.impl.scsu; - -/** - * This sample software accompanies Unicode Technical Report #6 and - * distributed as is by Unicode, Inc., subject to the following: - * - * Copyright 1996-1997 Unicode, Inc.. All Rights Reserved. - * - * Permission to use, copy, modify, and distribute this software - * without fee is hereby granted provided that this copyright notice - * appears in all copies. - * - * UNICODE, INC. MAKES NO REPRESENTATIONS OR WARRANTIES ABOUT THE - * SUITABILITY OF THE SOFTWARE, EITHER EXPRESS OR IMPLIED, INCLUDING - * BUT NOT LIMITED TO THE IMPLIED WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT. - * UNICODE, INC., SHALL NOT BE LIABLE FOR ANY ERRORS OR OMISSIONS, AND - * SHALL NOT BE LIABLE FOR ANY DAMAGES, INCLUDING CONSEQUENTIAL AND - * INCIDENTAL DAMAGES, SUFFERED BY YOU AS A RESULT OF USING, MODIFYING - * OR DISTRIBUTING THIS SOFTWARE OR ITS DERIVATIVES. - * - * @author Asmus Freytag - * - * @version 001 Dec 25 1996 - * @version 002 Jun 25 1997 - * @version 003 Jul 25 1997 - * @version 004 Aug 25 1997 - * - * Unicode and the Unicode logo are trademarks of Unicode, Inc., - * and are registered in some jurisdictions. - **/ -/** - * The input string or input byte array ended prematurely - * - */ -public class EndOfInputException - extends java.lang.Exception -{ - - private static final long serialVersionUID = 1L; - - public EndOfInputException(){ - super("The input string or input byte array ended prematurely"); - } - - public EndOfInputException(String s) { - super(s); - } -} diff --git a/src/main/java/com/healthmarketscience/jackcess/impl/scsu/EndOfOutputException.java b/src/main/java/com/healthmarketscience/jackcess/impl/scsu/EndOfOutputException.java deleted file mode 100644 index 94f5be6..0000000 --- a/src/main/java/com/healthmarketscience/jackcess/impl/scsu/EndOfOutputException.java +++ /dev/null @@ -1,48 +0,0 @@ -package com.healthmarketscience.jackcess.impl.scsu; - -/** - * This sample software accompanies Unicode Technical Report #6 and - * distributed as is by Unicode, Inc., subject to the following: - * - * Copyright 1996-1997 Unicode, Inc.. All Rights Reserved. - * - * Permission to use, copy, modify, and distribute this software - * without fee is hereby granted provided that this copyright notice - * appears in all copies. - * - * UNICODE, INC. MAKES NO REPRESENTATIONS OR WARRANTIES ABOUT THE - * SUITABILITY OF THE SOFTWARE, EITHER EXPRESS OR IMPLIED, INCLUDING - * BUT NOT LIMITED TO THE IMPLIED WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT. - * UNICODE, INC., SHALL NOT BE LIABLE FOR ANY ERRORS OR OMISSIONS, AND - * SHALL NOT BE LIABLE FOR ANY DAMAGES, INCLUDING CONSEQUENTIAL AND - * INCIDENTAL DAMAGES, SUFFERED BY YOU AS A RESULT OF USING, MODIFYING - * OR DISTRIBUTING THIS SOFTWARE OR ITS DERIVATIVES. - * - * @author Asmus Freytag - * - * @version 001 Dec 25 1996 - * @version 002 Jun 25 1997 - * @version 003 Jul 25 1997 - * - * Unicode and the Unicode logo are trademarks of Unicode, Inc., - * and are registered in some jurisdictions. - **/ -/** - * The input string or input byte array ended prematurely - */ -public class EndOfOutputException - extends java.lang.Exception - -{ - - private static final long serialVersionUID = 1L; - - public EndOfOutputException(){ - super("The input string or input byte array ended prematurely"); - } - - public EndOfOutputException(String s) { - super(s); - } -} diff --git a/src/main/java/com/healthmarketscience/jackcess/impl/scsu/Expand.java b/src/main/java/com/healthmarketscience/jackcess/impl/scsu/Expand.java deleted file mode 100644 index 378ca2f..0000000 --- a/src/main/java/com/healthmarketscience/jackcess/impl/scsu/Expand.java +++ /dev/null @@ -1,431 +0,0 @@ -package com.healthmarketscience.jackcess.impl.scsu; - -/* - * This sample software accompanies Unicode Technical Report #6 and - * distributed as is by Unicode, Inc., subject to the following: - * - * Copyright 1996-1998 Unicode, Inc.. All Rights Reserved. - * - * Permission to use, copy, modify, and distribute this software - * without fee is hereby granted provided that this copyright notice - * appears in all copies. - * - * UNICODE, INC. MAKES NO REPRESENTATIONS OR WARRANTIES ABOUT THE - * SUITABILITY OF THE SOFTWARE, EITHER EXPRESS OR IMPLIED, INCLUDING - * BUT NOT LIMITED TO THE IMPLIED WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT. - * UNICODE, INC., SHALL NOT BE LIABLE FOR ANY ERRORS OR OMISSIONS, AND - * SHALL NOT BE LIABLE FOR ANY DAMAGES, INCLUDING CONSEQUENTIAL AND - * INCIDENTAL DAMAGES, SUFFERED BY YOU AS A RESULT OF USING, MODIFYING - * OR DISTRIBUTING THIS SOFTWARE OR ITS DERIVATIVES. - * - * @author Asmus Freytag - * - * @version 001 Dec 25 1996 - * @version 002 Jun 25 1997 - * @version 003 Jul 25 1997 - * @version 004 Aug 25 1997 - * @version 005 Sep 30 1998 - * - * Unicode and the Unicode logo are trademarks of Unicode, Inc., - * and are registered in some jurisdictions. - **/ - - /** - Reference decoder for the Standard Compression Scheme for Unicode (SCSU) - -

Notes on the Java implementation

- - A limitation of Java is the exclusive use of a signed byte data type. - The following work arounds are required: - - Copying a byte to an integer variable and adding 256 for 'negative' - bytes gives an integer in the range 0-255. - - Values of char are between 0x0000 and 0xFFFF in Java. Arithmetic on - char values is unsigned. - - Extended characters require an int to store them. The sign is not an - issue because only 1024*1024 + 65536 extended characters exist. - -**/ -public class Expand extends SCSU -{ - /** (re-)define (and select) a dynamic window - A sliding window position cannot start at any Unicode value, - so rather than providing an absolute offset, this function takes - an index value which selects among the possible starting values. - - Most scripts in Unicode start on or near a half-block boundary - so the default behaviour is to multiply the index by 0x80. Han, - Hangul, Surrogates and other scripts between 0x3400 and 0xDFFF - show very poor locality--therefore no sliding window can be set - there. A jumpOffset is added to the index value to skip that region, - and only 167 index values total are required to select all eligible - half-blocks. - - Finally, a few scripts straddle half block boundaries. For them, a - table of fixed offsets is used, and the index values from 0xF9 to - 0xFF are used to select these special offsets. - - After (re-)defining a windows location it is selected so it is ready - for use. - - Recall that all Windows are of the same length (128 code positions). - - @param iWindow - index of the window to be (re-)defined - @param bOffset - index for the new offset value - **/ - // @005 protected <-- private here and elsewhere - protected void defineWindow(int iWindow, byte bOffset) - throws IllegalInputException - { - int iOffset = (bOffset < 0 ? bOffset + 256 : bOffset); - - // 0 is a reserved value - if (iOffset == 0) - { - throw new IllegalInputException(); - } - else if (iOffset < gapThreshold) - { - dynamicOffset[iWindow] = iOffset << 7; - } - else if (iOffset < reservedStart) - { - dynamicOffset[iWindow] = (iOffset << 7) + gapOffset; - } - else if (iOffset < fixedThreshold) - { - // more reserved values - throw new IllegalInputException("iOffset == "+iOffset); - } - else - { - dynamicOffset[iWindow] = fixedOffset[iOffset - fixedThreshold]; - } - - // make the redefined window the active one - selectWindow(iWindow); - } - - /** (re-)define (and select) a window as an extended dynamic window - The surrogate area in Unicode allows access to 2**20 codes beyond the - first 64K codes by combining one of 1024 characters from the High - Surrogate Area with one of 1024 characters from the Low Surrogate - Area (see Unicode 2.0 for the details). - - The tags SDX and UDX set the window such that each subsequent byte in - the range 80 to FF represents a surrogate pair. The following diagram - shows how the bits in the two bytes following the SDX or UDX, and a - subsequent data byte, map onto the bits in the resulting surrogate pair. - - hbyte lbyte data - nnnwwwww zzzzzyyy 1xxxxxxx - - high-surrogate low-surrogate - 110110wwwwwzzzzz 110111yyyxxxxxxx - - @param chOffset - Since the three top bits of chOffset are not needed to - set the location of the extended Window, they are used instead - to select the window, thereby reducing the number of needed command codes. - The bottom 13 bits of chOffset are used to calculate the offset relative to - a 7 bit input data byte to yield the 20 bits expressed by each surrogate pair. - **/ - protected void defineExtendedWindow(char chOffset) - { - // The top 3 bits of iOffsetHi are the window index - int iWindow = chOffset >>> 13; - - // Calculate the new offset - dynamicOffset[iWindow] = ((chOffset & 0x1FFF) << 7) + (1 << 16); - - // make the redefined window the active one - selectWindow(iWindow); - } - - /** string buffer length used by the following functions */ - protected int iOut = 0; - - /** input cursor used by the following functions */ - protected int iIn = 0; - - /** expand input that is in Unicode mode - @param in input byte array to be expanded - @param iCur starting index - @param sb string buffer to which to append expanded input - @return the index for the lastc byte processed - **/ - protected int expandUnicode(byte []in, int iCur, StringBuilder sb) - throws IllegalInputException, EndOfInputException - { - for( ; iCur < in.length-1; iCur+=2 ) // step by 2: - { - byte b = in[iCur]; - - if (b >= UC0 && b <= UC7) - { - Debug.out("SelectWindow: ", b); - selectWindow(b - UC0); - return iCur; - } - else if (b >= UD0 && b <= UD7) - { - defineWindow( b - UD0, in[iCur+1]); - return iCur + 1; - } - else if (b == UDX) - { - if( iCur >= in.length - 2) - { - break; // buffer error - } - defineExtendedWindow(charFromTwoBytes(in[iCur+1], in[iCur+2])); - return iCur + 2; - } - else if (b == UQU) - { - if( iCur >= in.length - 2) - { - break; // error - } - // Skip command byte and output Unicode character - iCur++; - } - - // output a Unicode character - char ch = charFromTwoBytes(in[iCur], in[iCur+1]); - sb.append(ch); - iOut++; - } - - if( iCur == in.length) - { - return iCur; - } - - // Error condition - throw new EndOfInputException(); - } - - /** assemble a char from two bytes - In Java bytes are signed quantities, while chars are unsigned - @return the character - @param hi most significant byte - @param lo least significant byte - */ - public static char charFromTwoBytes(byte hi, byte lo) - { - char ch = (char)(lo >= 0 ? lo : 256 + lo); - return (char)(ch + (char)((hi >= 0 ? hi : 256 + hi)<<8)); - } - - /** expand portion of the input that is in single byte mode **/ - @SuppressWarnings("fallthrough") - protected String expandSingleByte(byte []in) - throws IllegalInputException, EndOfInputException - { - - /* Allocate the output buffer. Because of control codes, generally - each byte of input results in fewer than one character of - output. Using in.length as an intial allocation length should avoid - the need to reallocate in mid-stream. The exception to this rule are - surrogates. */ - StringBuilder sb = new StringBuilder(in.length); - iOut = 0; - - // Loop until all input is exhausted or an error occurred - int iCur; - Loop: - for( iCur = 0; iCur < in.length; iCur++ ) - { - // DEBUG Debug.out("Expanding: ", iCur); - - // Default behaviour is that ASCII characters are passed through - // (staticOffset[0] == 0) and characters with the high bit on are - // offset by the current dynamic (or sliding) window (this.iWindow) - int iStaticWindow = 0; - int iDynamicWindow = getCurrentWindow(); - - switch(in[iCur]) - { - // Quote from a static Window - case SQ0: - case SQ1: - case SQ2: - case SQ3: - case SQ4: - case SQ5: - case SQ6: - case SQ7: - Debug.out("SQn:", iStaticWindow); - // skip the command byte and check for length - if( iCur >= in.length - 1) - { - Debug.out("SQn missing argument: ", in, iCur); - break Loop; // buffer length error - } - // Select window pair to quote from - iDynamicWindow = iStaticWindow = in[iCur] - SQ0; - iCur ++; - - // FALL THROUGH - - default: - // output as character - if(in[iCur] >= 0) - { - // use static window - int ch = in[iCur] + staticOffset[iStaticWindow]; - sb.append((char)ch); - iOut++; - } - else - { - // use dynamic window - int ch = (in[iCur] + 256); // adjust for signed bytes - ch -= 0x80; // reduce to range 00..7F - ch += dynamicOffset[iDynamicWindow]; - - //DEBUG - Debug.out("Dynamic: ", (char) ch); - - if (ch < 1<<16) - { - // in Unicode range, output directly - sb.append((char)ch); - iOut++; - } - else - { - // this is an extension character - Debug.out("Extension character: ", ch); - - // compute and append the two surrogates: - // translate from 10000..10FFFF to 0..FFFFF - ch -= 0x10000; - - // high surrogate = top 10 bits added to D800 - sb.append((char)(0xD800 + (ch>>10))); - iOut++; - - // low surrogate = bottom 10 bits added to DC00 - sb.append((char)(0xDC00 + (ch & ~0xFC00))); - iOut++; - } - } - break; - - // define a dynamic window as extended - case SDX: - iCur += 2; - if( iCur >= in.length) - { - Debug.out("SDn missing argument: ", in, iCur -1); - break Loop; // buffer length error - } - defineExtendedWindow(charFromTwoBytes(in[iCur-1], in[iCur])); - break; - - // Position a dynamic Window - case SD0: - case SD1: - case SD2: - case SD3: - case SD4: - case SD5: - case SD6: - case SD7: - iCur ++; - if( iCur >= in.length) - { - Debug.out("SDn missing argument: ", in, iCur -1); - break Loop; // buffer length error - } - defineWindow(in[iCur-1] - SD0, in[iCur]); - break; - - // Select a new dynamic Window - case SC0: - case SC1: - case SC2: - case SC3: - case SC4: - case SC5: - case SC6: - case SC7: - selectWindow(in[iCur] - SC0); - break; - case SCU: - // switch to Unicode mode and continue parsing - iCur = expandUnicode(in, iCur+1, sb); - // DEBUG Debug.out("Expanded Unicode range until: ", iCur); - break; - - case SQU: - // directly extract one Unicode character - iCur += 2; - if( iCur >= in.length) - { - Debug.out("SQU missing argument: ", in, iCur - 2); - break Loop; // buffer length error - } - else - { - char ch = charFromTwoBytes(in[iCur-1], in[iCur]); - - Debug.out("Quoted: ", ch); - sb.append(ch); - iOut++; - } - break; - - case Srs: - throw new IllegalInputException(); - // break; - } - } - - if( iCur >= in.length) - { - //SUCCESS: all input used up - sb.setLength(iOut); - iIn = iCur; - return sb.toString(); - } - - Debug.out("Length ==" + in.length+" iCur =", iCur); - //ERROR: premature end of input - throw new EndOfInputException(); - } - - /** expand a byte array containing compressed Unicode */ - public String expand (byte []in) - throws IllegalInputException, EndOfInputException - { - String str = expandSingleByte(in); - Debug.out("expand output: ", str.toCharArray()); - return str; - } - - - /** reset is called to start with new input, w/o creating a new - instance */ - @Override - public void reset() - { - iOut = 0; - iIn = 0; - super.reset(); - } - - public int charsWritten() - { - return iOut; - } - - public int bytesRead() - { - return iIn; - } -} diff --git a/src/main/java/com/healthmarketscience/jackcess/impl/scsu/IllegalInputException.java b/src/main/java/com/healthmarketscience/jackcess/impl/scsu/IllegalInputException.java deleted file mode 100644 index b191f56..0000000 --- a/src/main/java/com/healthmarketscience/jackcess/impl/scsu/IllegalInputException.java +++ /dev/null @@ -1,48 +0,0 @@ -package com.healthmarketscience.jackcess.impl.scsu; - -/** - * This sample software accompanies Unicode Technical Report #6 and - * distributed as is by Unicode, Inc., subject to the following: - * - * Copyright 1996-1997 Unicode, Inc.. All Rights Reserved. - * - * Permission to use, copy, modify, and distribute this software - * without fee is hereby granted provided that this copyright notice - * appears in all copies. - * - * UNICODE, INC. MAKES NO REPRESENTATIONS OR WARRANTIES ABOUT THE - * SUITABILITY OF THE SOFTWARE, EITHER EXPRESS OR IMPLIED, INCLUDING - * BUT NOT LIMITED TO THE IMPLIED WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT. - * UNICODE, INC., SHALL NOT BE LIABLE FOR ANY ERRORS OR OMISSIONS, AND - * SHALL NOT BE LIABLE FOR ANY DAMAGES, INCLUDING CONSEQUENTIAL AND - * INCIDENTAL DAMAGES, SUFFERED BY YOU AS A RESULT OF USING, MODIFYING - * OR DISTRIBUTING THIS SOFTWARE OR ITS DERIVATIVES. - * - * @author Asmus Freytag - * - * @version 001 Dec 25 1996 - * @version 002 Jun 25 1997 - * @version 003 Jul 25 1997 - * @version 004 Aug 25 1997 - * - * Unicode and the Unicode logo are trademarks of Unicode, Inc., - * and are registered in some jurisdictions. - **/ -/** - * The input character array or input byte array contained - * illegal sequences of bytes or characters - */ -public class IllegalInputException extends java.lang.Exception -{ - - private static final long serialVersionUID = 1L; - - public IllegalInputException(){ - super("The input character array or input byte array contained illegal sequences of bytes or characters"); - } - - public IllegalInputException(String s) { - super(s); - } -} diff --git a/src/main/java/com/healthmarketscience/jackcess/impl/scsu/SCSU.java b/src/main/java/com/healthmarketscience/jackcess/impl/scsu/SCSU.java deleted file mode 100644 index 7859780..0000000 --- a/src/main/java/com/healthmarketscience/jackcess/impl/scsu/SCSU.java +++ /dev/null @@ -1,252 +0,0 @@ -package com.healthmarketscience.jackcess.impl.scsu; - -/* - * This sample software accompanies Unicode Technical Report #6 and - * distributed as is by Unicode, Inc., subject to the following: - * - * Copyright 1996-1998 Unicode, Inc.. All Rights Reserved. - * - * Permission to use, copy, modify, and distribute this software - * without fee is hereby granted provided that this copyright notice - * appears in all copies. - * - * UNICODE, INC. MAKES NO REPRESENTATIONS OR WARRANTIES ABOUT THE - * SUITABILITY OF THE SOFTWARE, EITHER EXPRESS OR IMPLIED, INCLUDING - * BUT NOT LIMITED TO THE IMPLIED WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT. - * UNICODE, INC., SHALL NOT BE LIABLE FOR ANY ERRORS OR OMISSIONS, AND - * SHALL NOT BE LIABLE FOR ANY DAMAGES, INCLUDING CONSEQUENTIAL AND - * INCIDENTAL DAMAGES, SUFFERED BY YOU AS A RESULT OF USING, MODIFYING - * OR DISTRIBUTING THIS SOFTWARE OR ITS DERIVATIVES. - * - * @author Asmus Freytag - * - * @version 001 Dec 25 1996 - * @version 002 Jun 25 1997 - * @version 003 Jul 25 1997 - * @version 004 Aug 25 1997 - * @version 005 Sep 30 1998 - * - * Unicode and the Unicode logo are trademarks of Unicode, Inc., - * and are registered in some jurisdictions. - **/ - - /** - Encoding text data in Unicode often requires more storage than using - an existing 8-bit character set and limited to the subset of characters - actually found in the text. The Unicode Compression Algorithm reduces - the necessary storage while retaining the universality of Unicode. - A full description of the algorithm can be found in document - http://www.unicode.org/unicode/reports/tr6.html - - Summary - - The goal of the Unicode Compression Algorithm is the abilty to - * Express all code points in Unicode - * Approximate storage size for traditional character sets - * Work well for short strings - * Provide transparency for Latin-1 data - * Support very simple decoders - * Support simple as well as sophisticated encoders - - If needed, further compression can be achieved by layering standard - file or disk-block based compression algorithms on top. - -

Features

- - Languages using small alphabets would contain runs of characters that - are coded close together in Unicode. These runs are interrupted only - by punctuation characters, which are themselves coded in proximity to - each other in Unicode (usually in the ASCII range). - - Two basic mechanisms in the compression algorithm account for these two - cases, sliding windows and static windows. A window is an area of 128 - consecutive characters in Unicode. In the compressed data stream, each - character from a sliding window would be represented as a byte between - 0x80 and 0xFF, while a byte from 0x20 to 0x7F (as well as CR, LF, and - TAB) would always mean an ASCII character (or control). - -

Notes on the Java implementation

- If the following byte is less than 0x80, quote from - static window n, else quote from dynamic window n. - */ - - static final byte SQ0 = 0x01; // Quote from window pair 0 - static final byte SQ1 = 0x02; // Quote from window pair 1 - static final byte SQ2 = 0x03; // Quote from window pair 2 - static final byte SQ3 = 0x04; // Quote from window pair 3 - static final byte SQ4 = 0x05; // Quote from window pair 4 - static final byte SQ5 = 0x06; // Quote from window pair 5 - static final byte SQ6 = 0x07; // Quote from window pair 6 - static final byte SQ7 = 0x08; // Quote from window pair 7 - - static final byte SDX = 0x0B; // Define a window as extended - static final byte Srs = 0x0C; // reserved - - static final byte SQU = 0x0E; // Quote a single Unicode character - static final byte SCU = 0x0F; // Change to Unicode mode - - /** SCn Change to Window n.

- If the following bytes are less than 0x80, interpret them - as command bytes or pass them through, else add the offset - for dynamic window n. */ - static final byte SC0 = 0x10; // Select window 0 - static final byte SC1 = 0x11; // Select window 1 - static final byte SC2 = 0x12; // Select window 2 - static final byte SC3 = 0x13; // Select window 3 - static final byte SC4 = 0x14; // Select window 4 - static final byte SC5 = 0x15; // Select window 5 - static final byte SC6 = 0x16; // Select window 6 - static final byte SC7 = 0x17; // Select window 7 - static final byte SD0 = 0x18; // Define and select window 0 - static final byte SD1 = 0x19; // Define and select window 1 - static final byte SD2 = 0x1A; // Define and select window 2 - static final byte SD3 = 0x1B; // Define and select window 3 - static final byte SD4 = 0x1C; // Define and select window 4 - static final byte SD5 = 0x1D; // Define and select window 5 - static final byte SD6 = 0x1E; // Define and select window 6 - static final byte SD7 = 0x1F; // Define and select window 7 - - static final byte UC0 = (byte) 0xE0; // Select window 0 - static final byte UC1 = (byte) 0xE1; // Select window 1 - static final byte UC2 = (byte) 0xE2; // Select window 2 - static final byte UC3 = (byte) 0xE3; // Select window 3 - static final byte UC4 = (byte) 0xE4; // Select window 4 - static final byte UC5 = (byte) 0xE5; // Select window 5 - static final byte UC6 = (byte) 0xE6; // Select window 6 - static final byte UC7 = (byte) 0xE7; // Select window 7 - static final byte UD0 = (byte) 0xE8; // Define and select window 0 - static final byte UD1 = (byte) 0xE9; // Define and select window 1 - static final byte UD2 = (byte) 0xEA; // Define and select window 2 - static final byte UD3 = (byte) 0xEB; // Define and select window 3 - static final byte UD4 = (byte) 0xEC; // Define and select window 4 - static final byte UD5 = (byte) 0xED; // Define and select window 5 - static final byte UD6 = (byte) 0xEE; // Define and select window 6 - static final byte UD7 = (byte) 0xEF; // Define and select window 7 - - static final byte UQU = (byte) 0xF0; // Quote a single Unicode character - static final byte UDX = (byte) 0xF1; // Define a Window as extended - static final byte Urs = (byte) 0xF2; // reserved - - /** constant offsets for the 8 static windows */ - static final int staticOffset[] = - { - 0x0000, // ASCII for quoted tags - 0x0080, // Latin - 1 Supplement (for access to punctuation) - 0x0100, // Latin Extended-A - 0x0300, // Combining Diacritical Marks - 0x2000, // General Punctuation - 0x2080, // Currency Symbols - 0x2100, // Letterlike Symbols and Number Forms - 0x3000 // CJK Symbols and punctuation - }; - - /** initial offsets for the 8 dynamic (sliding) windows */ - static final int initialDynamicOffset[] = - { - 0x0080, // Latin-1 - 0x00C0, // Latin Extended A //@005 fixed from 0x0100 - 0x0400, // Cyrillic - 0x0600, // Arabic - 0x0900, // Devanagari - 0x3040, // Hiragana - 0x30A0, // Katakana - 0xFF00 // Fullwidth ASCII - }; - - /** dynamic window offsets, intitialize to default values. */ - int dynamicOffset[] = - { - initialDynamicOffset[0], - initialDynamicOffset[1], - initialDynamicOffset[2], - initialDynamicOffset[3], - initialDynamicOffset[4], - initialDynamicOffset[5], - initialDynamicOffset[6], - initialDynamicOffset[7] - }; - - // The following method is common to encoder and decoder - - private int iWindow = 0; // current active window - - /** select the active dynamic window **/ - protected void selectWindow(int iWindow) - { - this.iWindow = iWindow; - } - - /** select the active dynamic window **/ - protected int getCurrentWindow() - { - return this.iWindow; - } - - /** - These values are used in defineWindow - **/ - - /** - * Unicode code points from 3400 to E000 are not adressible by - * dynamic window, since in these areas no short run alphabets are - * found. Therefore add gapOffset to all values from gapThreshold */ - static final int gapThreshold = 0x68; - static final int gapOffset = 0xAC00; - - /* values between reservedStart and fixedThreshold are reserved */ - static final int reservedStart = 0xA8; - - /* use table of predefined fixed offsets for values from fixedThreshold */ - static final int fixedThreshold = 0xF9; - - /** Table of fixed predefined Offsets, and byte values that index into **/ - static final int fixedOffset[] = - { - /* 0xF9 */ 0x00C0, // Latin-1 Letters + half of Latin Extended A - /* 0xFA */ 0x0250, // IPA extensions - /* 0xFB */ 0x0370, // Greek - /* 0xFC */ 0x0530, // Armenian - /* 0xFD */ 0x3040, // Hiragana - /* 0xFE */ 0x30A0, // Katakana - /* 0xFF */ 0xFF60 // Halfwidth Katakana - }; - - /** whether a character is compressible */ - public static boolean isCompressible(char ch) - { - return (ch < 0x3400 || ch >= 0xE000); - } - - /** reset is only needed to bail out after an exception and - restart with new input */ - public void reset() - { - - // reset the dynamic windows - for (int i = 0; i < dynamicOffset.length; i++) - { - dynamicOffset[i] = initialDynamicOffset[i]; - } - this.iWindow = 0; - } -} diff --git a/src/test/data/V2003/testUnicodeCompV2003.mdb b/src/test/data/V2003/testUnicodeCompV2003.mdb new file mode 100644 index 0000000000000000000000000000000000000000..ee7e12a1322d907d48feaad339b9b92f365edefd GIT binary patch literal 294912 zcmeI52S8Qj`^VpN?}f{}7f^BFxNZYcmY_HS87eYF9FdEy$P^HzG;lR68$Zpmer07g zw6wCaGONEW%XX$^rIwbJwzSlS-v9S`&pme_j?$lHJqO-NyPn7HPay&PU{`$emRdE-K|6Z~DM(cvx!e6d?=G(AmcfD~({zrRX zT%P>f!e`SbUUDF2!M^|g+WuINFSoQlUKJSHv>?v155?sv^sw_p%mL&kH|gLZ<8XEGpb!Sx_kbLl zN#9<=Lr4d=Mh*dRFi;4iJt{Rs8^qy74qgyk6F;uV5UL)IG$>VX3gL^$Ttf)e!4Hu& z>0m|{n{+Tiy~YkwE6L6U$#KP#5_h*}FajZIv#G0|2#2}`OSUvK*_bKvAhhVqN8N zR5`?>7@CHHT(T_OO+kUJN))32M+H_YF!~`2d15LgD$Y`2&|PF!M^w}w9c0*jHDygAkplfRDu|t!5*M26tXNMs#Yl&O&=T8ZgOPI-%2mB0CPi z!nWxsyra;rwlPzcfo$`4YoarNY?XFL#LhJaZb;A8wD0O*gh$zk?r6u>`V|~kq=UiU z8NlIi>N=O^wu*vXQwOjoZbtLD|FJ+}*e#+soci9gZ3W zt!Xx*iP{Nj$gV;y1=VBNnvqY@`fLDm9~d;RHJ&J`QD-Ik%$XW~TET6PrfOMdF7o3a#)E!?sI7<0iZ*^PopDGB{5ROv5t@y7+ z)yRQr4w?+%-l$SysoqwTK>9m}P|)|lgdPR?DEF75PM{3k$-U5_yb3i0)ce4MUuQQz z)Dc`JyVh&IRU7BN308Fv~A7Opw6uExH&YJ}$zyz286JP>NfC(@GCcp$P zL;{NP6ZLqPTY#MZFWh{}yuQ$-14qy;WwV^2ksI^{wa!M~U zD)P(Q`5;h#15&BaNGVH6NlVKtDJe=BomoPs!{5WZ zRZBvUjZ&W#qEC$-G-yy(Vr*L9#I!*}(*~r*W)6+b%*;qki_ILInKd9pPkEEk_^E+% zN@`2gyq1&|6eGLR(cjt8mJv6HU0zB~W==|x^cvu7J*_AuFLQKpK}AtoN?GRUg5qpj z;ATzxQnTH>X~(jHg4A)DX*LioPVl6aVJnke=JVp{Q7Q^3<1Qg@^{YkJu2q2*g zgk~h5!qvYy384^LkPrbOkc2)EWNDR@m-PaqysW37M0NEy3flbTs1zyk4-;SlOn?b6 z0Vco%m;e)C0!)AjG(kX7+L<+Ly{-ShcFcPwzyz286JP>NfC(@GCcp%k025#W|8fFo z<~If?2|M`BR2y-(J3+ElauDS8B)xzo8q$MKw&05#WRixrxnXo5}?^dz8lJ=RSV z+%>_d39|bD8s1~I?gO#1`vBRAhH0(vOK4%37bO&CrB`YgnzYC@VOq*E6I zQhuN&4AO*)NuXTx)PztHNclKTh}VR{B#^2JnvkdoNhF{XL~J)r=&lJpNIu)wL8CiK>XFcP}JDO?jGG$E1%bf$=n(u6*m5KRKQSH#9>LSIejM*=!r#Kvku ze@z%b0=i(tCTqeFO&Cgj`RTH0rwQ#fp@Sw^NFcm&&;r4fV?9Wa0}Dt{hY~>WvvuW& z?WhTzG@-NLjR`OTCcp%k025#WOn?b6fwL!|C^yi&?{afLU=Km>8;A`&%@bJD5d zPe)=gIG$}&e66lZ6aoTMT^wdY!&3-*+= z;!J=EFaajO1egF5U;<2l2{3^s2`I{c3|a}ljd;RbVD>gYZklBBGi~s_)c2Ur<35vp z+W5TbJ=eRV_X}Q;UhjF%_I%yrB9EEIcim^X-|x1SCl73rD_b%FCh*q~@TbKd8?d&Z z*+X2nBIffom+g(y_M}mo$P#s;TBM6yQ7cB^w_Fs8A)-oDi0PsdE~oP`3O+MLo|uYo z`Jx6P!X`%{O)bA^BO^o`95i6z$Kmi(h*(n*CLd{)ia_ZNt5;(45{ehqv5pc?%D-A@ z6T!Bjj z{f%<@8|hFO>xwo~*#8eGd@io0`GWouUH%3;6zp<>g5`_OoM=U&kXi@X_IRK*6p2YMwTqKDw7d8zauo+_O2kUu$@9H?HY8xRp@hpZW7WmPMr8R$dP3lR}D?^cGK=d zZBNN+-x%tdrHj27dY2>ODUj$ZB)sr6_aV;q86=Piu5=5iZ~gE>*9%=ea(a5;te-)o z@wknXq>d4|i5vfgIUwM?pFwoa?_bLJX>>%Keu&ttvy;{TxcF9jcf!R^HW6PBVUr&< zh`7Lku*ViF|6K1N+ucKVI(;(27j8uDL}`22Hk|524M3Ob#(%Lc5Ka-XcA4ej(xRso z0Z<>FuD=nE2ilMWHv4cTj3x(A+v9z;kQ z(G1?m;Ft_;CzA~nuVd56xdcY+F$ixpK?>Inm$J@2iGd8Eqeq5x(Ayjg!iz%arGuwa zh~tU}IT)mB6qOpn0EJ};H|gLS!iPe*BMg>?w?|ckBxT}8TFhN;?&L%0Dv`L;TgeY^ zsbcxG(oSB0-6k3K^o*z$4qsH%pf`9MyRT6QPd4=GCAUf^-=pv*sf3%!?t8Qti2_6y z``0U7i`95sOvin#zQi4G%z1bt;%r#0l~$|HYAv!_gB!{2ICE482`gbXNW?CepJ>93E(bDBZt~J|J}+JH1};Ep zBN@f65%QU*y_vIYg)e$fJ2)nQA+e|T&P;?DF5ln<@< z1egF5U;<2l2`~XBzyz4U1x7$os(m$;J;erdtvTF$z;wSU&(zNJq3>e5o92i2On?b6 z0Vco%m;e)C0!)AjFaaiT&IxGks%H~%!`TGZ!p3RBKVxriWkotHay-W-b5zm;W9*7KcQFN~S$FMl^^YbcS^w7DgFI zc%UcJx}%;iyl0sI*p|FkKU75gw&Jqgd)hS|F+H;3vX5JSU$LUqfiJR`?O3!T_)t)I z=Ee7XnQ{A#Q-6A@aA~Lhi(|d7ez>B;@Py(HQ&ZN?o}Bm9*q^s_x%*D96~S+`efXo> zLj%VDyft;?gRiaY{p-Q^wmdRC`sKxK?prZB@JIi=PW##|?6_>w?GyWSj>>y+(Vlg0 zT$>O$YeIOR9m6h)={j=ZSL1Fw@afb!Km71*W##vw>vknA-ILX4aHS)>2o7_Jbd*HIVsS_R@I%v@I+gE-2&3#`T zN*edIr-ZsDb@y6<#n-AO=_wMW0Ov?2cR~LOl_Yb=Ge^7RL<=Pnw z4s4&^(rwp~`Po-J8vEFWEsw69^zw@`Na# zsO^R}?+5hpo0@$8!p(-@7WcR6(c((oeg3)g7l*l*F3a8F`FYpsAuS({SU&apQQgba zFaEuU=bORC%*j4A8$92ivt!94LoZFue9&k8^hdt`bdz85#tDbMn6v57#-W#luZr(C zK?KB1-BCL6;qM2;?YaN)3(hE)ms<_iqjVe*4{Dc5ZX{gVzcl|Mb#T zRTcb;ewOFai7!93Yf!TmDY=FD59C)gA8>#F z1ud7(O`0ahKB9y;I9OSj&U zH|XkVscquAc3&BvurR&+l3kvUjt?0YmwPlMZQt&l9UuI1xc3ClN$nD!+3@z3AAb6z zzV(q#?;RZ4=f(+h%Lc4lRrTTQ?Dp9K9YQLv%U=?H?f8BE)%9Vyb4tGZ@5so=t7g5} zcFgqK9~^Z&r()2i+0k#h@2ubSviCvXqw^YKuWzulm>)hpH2e1TUpyQddhv#+mj>Sa zV_ffw(1KCZ6Xq0ee_-rA-km?aw3}!2ipQ=Ed*SAnl0Tl@dHI#5Pn88%r97~0`+%CP zU2`AaQ~AUTx9xo?wrb<>4}S9bb-%~fkABS+Tr2M+ClFxsd=&VmwU{U?|%NRn&tOQ zxoXz32g7_nKKA|%i>C}M?6xXve(HqzcYd%h*4*a#eU-Fh{9cnigZ1 ztUlQ1=T-q$1Dqx7NP ziATDBvm#Ho?CNnLX}1iXv}NdKW0#f5Hi6?KW{j$cteBKX7j~K4K-K+cPyI#0r-~>y`?@z3bs6PBdhsmolJdMj2KR0&D z>anAKx%T<#M?;pq_u95C)2sHZjh!&Di&xj(uX{H8_TKA8g?2jjT<_O5%^BY3x&8VA z(*wOnwYqe|pcV%&?`@4fx_!)Tb?+2R+t8u9c#78>-S2$ng)zG?9r@1E@J-F4n%#R~ z@BIH8uxQG!hd!(7HsFo_^lTHkXZn**=Y6up7YOdZ{2F$|K{dR->#dHw(2prR?0{39C|X}|Fy@`+fGfd z?z-y3SElsLyXm&c`xkBreJ?e1P}Om>kB4DQ^Iu2Qb$qPT(>;|M!z~?t?AgV8!HB!} z1-)K$^w(7*-`~IW`gtEcbA3r+#n=5_fAz@?-K*W)^E-C9xLeZB;WuY3D!kGCvE9!F z_G-U$ve&^@TV8L`{+HPUNAJ8cI9dOGVByV|Wc+Z?v6i#@_%BGgbJ}y8+kW?Y(GRaa zGBNy-^rBVm^JjVcFL`QP(boCTJaEx3eV;DxlzD&K+NsBr(k6Zq)}r~!{9E^2())*x zJ0G3#&iAt;lO}I&r<>DzS+j4y`uK$zp)bUbzoz*!*CsdfX!Xg_l`|gbvUKRwDe?bl zpV4{R+FRc2)4u4Hqif!Lt^bG_?st6faLui~4lW(|;pXa?d)6$z*)L?C|Jcl*GiOfL z2fNqbGy1Muwk5UJZ`(1bpk@0@e~McB(|cLRKg(_3t32{xPUxX=zr5}>aLs4KrmVfD zbj>HjZd_LJQ&L91N2*p#c<7URb{DoEe$9UtUppjd_%8G6Nk1OmfBA^CtyMqz4j8j~ zN72t!L!-Am(XCtmgWrt%&2#ZGkMDnK^Wm&r>kNC|O!)HUS7U2OubF-9=GpUibw?yg&tLg#uYDi=--NY;Lc(W$ z@c1`bVI57a)-8%ynCh8y*JtCNY8Lo?bn|!3KfK%O#h(uUbpNJ~SLU?;utVOW58jBo z+TYrtYxY}5w_P-T$ahED&bm0y@7n(ThebSkNpa@B9ogUPpTFAjiW-mtp0r4v~GP5u0CWgocUI=A#rQcF#n+;b+al< z$1Z*5)y2=;-FxG>n=ajR)9L}~?H_+`+35Q4h`YZ}wxqxN{(sq1vX;crbZe&XiVT`yl?YTjj=*VUh7Uw7BEyRP`Q{fJh(Vr$+> z-4itQwX&S>RwbX`u_NaFt2~Zhve7nZ1Hh1Z;C}Y!_2d5YUdzD>3BC%+vVR^)`&Odh_|EbT~Z9P+;ytRM!B~_n= zKHcHvXItF1e(9UZvpRqJ`Q{E02EW-g{Yuu?j?3z~bM!FX)OGo_|5qP%WwXq8-i_>f z^Mvb_+U%GNfC-#`0&={(ZMuW+KcS|l*S$oX zXs&9Kf%!&gViOt#Zo$+&Q?RFT<-IVjd&BLRd_fVYZ;%r+w78&@W@VWG6JP>NfC(@GCcp%k025#WOyHaoFev(ymj4AR z6nML^3CQw)eL+@Mc3P&fY;0yuX4|r{In${wi0cF{>{H9yGXW;R1egF5U;<2l2`~XB zzy$sf0dD{QL%1$D&)oih!JRTzmCr#wg$P}oB_isYLOppkv~7VrZ|Ny$KVTEkj?`GGt?#hXrVnF zEtxu4*! z2`~XBzyz286JP>NfC(@GCcp&F0Rcr(Oj2vdiZ?dJ=c4nx4XCdmJ|P0zCPl0-`AggZQ5%+UfZ?#fz5@R zyKVk-$dVxwhO`;-aq?N_kApJ-Ccp%k02BB_1n}ay?H1GJd*fcPsC(+zHbRUP86rg_ zp?lXT;SU??Mb=aF5M4wd{sM#-Y>emxmn6ikGWexWsr%X!g3rJNyIK@sG7{G)5`y2VZ57($l5geCu14W~5 zu8}!I9wcH#w7lrGOTLK+OAyP~(9PYZn=5ejl0S`3QxU~zPZ4LdMR7X#g9T}Jnn;Tg zDb`zbl(;zl(J zE!b|K+hs8FtOgr_d%QwSMJLdFbONP&iz1<0G}!efHHxz7$G1@0+MbFQflisjp)mNR zi7fOXtw!(EJhDJ1*hKjt(QaRF*wLeGgq-BNuj_@+~=kf#@c? zL^nXtP3!VRJ~8*6h%N#63XuC;AarQc#Dmlk+^gmedBM?zrCxBvfJ~FusfJA3qjxaQ zdp^=D#tExA(^R$K)N48oC#(SHztknqr&L-CPOHq1M6gOkQT4}Drw-@IGH98T;`9Qy zoQ$@1pHhGI#^{kw@)bb-q5bqWXm;hW88W6O$O|<@IwWCWr=;+t$fTI0zUp6i46J><#tp5guCXM|tSKxn zD6Fy!D=(^;;Oo_GC=#VDeBkAgeIp|hqocw?6O#Ldg@%VGB!~7*ii!*k3yTj+=ocOy z7S<=Gme%V(2BpvF`_yfG^hZcy`kRcZhtd zUx8G{!#)EK>NNbOi!0!JJmg{Wxu1dOziOkty~o?$;xX_;Hc27|@h9S2E*0^Q!E3r2 z@7i3vZ7p!64;NbU#bSF8TJRpJ#9yhjr4dWTct4iIm-E(M(KtlGe=`0OpnW`Y8!GzX zs2~19p=B7%U`XVLIH?@srOz-C3EMDfVkC!t&@&7=lF>NOr|)FE+H>(?Lpm1Qz7i}b zX3xY|QM9;7vY(`3-QZ>9trsC1=m#Q*%@{x#^e0vwIDxq zw!&omdG?QT3&I!Uvy1es!nd{a_H2{aOeErPToumJnO!sV`R~eVEHo&`cT@!`9IBB& z^&5XI@-`NfD97}DP44uaMd528M1sH+SKM4`CmjB23uRnpXjko5KWR;G;I|kj4boP4gOK!sx)W+Z5O2<!rc&`pI=yAtv8Gr9Dn3|?{JKLm~HIO_F_7AycseV|Tt-7Y5qO@>v z4#J7V_>4Q}A}%pz@UXwmJiTPkvcb zg^GGhC|^9h3{VBVKA5y@LVt{0byZL`1a}?(JQQT7(O;)#{+i(-kjCrv+AnHX84Vb zjvB8ojOc||WM1ct)FcbujWrcz`0A}lE*CjM2^9JUU7|;P#vok^y&0xvi1<5&*OCUs zExu2Wq_FUy_&#p&Bk)Nwxhl8Jl8on7VYQ{l%)avWjv=?>RoLIKQd_-z-Hn zmaHkcRUU-};~NHzFUZ9!DLSlIL1}4c{Z(e+V~Ok)ZNUp?YGHnjCABywuPV2yP7H?{ z7|Q*!JduM8Xm0SJ$MLWcc)KNuNgErC>k^>Pq@?gZG3_QrCM1kl+Zz|l_!mSp#>yu} zCtWhCSB4>Vut)fq;_B)6BD4(7tuD^Dh@y%rOMGQzsi8Q3Q%*H~c#FdF>f*fO(&86t z>MVVFMaG3Th)(9g%Y|sRNtw4s=Qr2Qpk(@L?f-ODJ8wFJho2>`0VB^F3(me+{yP(} z&Hoo9NWl9nNFf1LIYBxSP+khslYlC@AOi`g+6!_c0ab)S?j)d!F~~>)sw#s#NI(^6 zkS7VKP7U%R0Z)z~ZxT?M8{|U*s)vJoNkFYqkck9TYX_N0KqYyQ9|@>r5Ar7gPlBLs zB%q-ys5=R0C=2RA0vg?dE+PSqcR|5|HzvRYm;e)C0!)AjFaajO1egF5_;Umd%1)n? zmH*>VWWxk#8(o>~duS^*%B?6m;$$X*p7kL>jU^2lB(Adl=d z1M~#b3$X-DpYb5M-6tbqlUR5BE?9~Las=;38Adl>|3$pgX zUJW6u7wq*6^2lDJAdh6V0#OQ}AA9|SJd%|V=iwxA5=?*zFaajO1egF5U;<2l2{3{4 zK)|5%IcfQyT7K~#N53du>F9?#!g!Dsx-j4f0{+n%g>0q02em2tLFVo6KtL`3r{!d% zrWBXZ8B<&EmHM=blCpxF(ZvN7MXJ4QtXJwYQp!?N($X?ZN{UiOXR1!JtzUIYO)0Ti z%65OHKC7TONA)H95h(T9IhoT-iZatwOWE?T)aMjrR7|u5l)VdJRZ?4`CIm%_kyePl zxS*g+>L`0FDD_3f1x2Kfw3D?1N_}x=Q9*H;8oBJSpwyS76=xUOtYnV`rG8qf8c6nA z&^4770;QCtjg?W$Y6y?|6v{P1+LM%B5h~K)=T(Pp6oFN)7i0GRp&^5va^G~JqKz9Mtz7LT7>AG zL(6^=0hF{1*8_?2#sruE6JP>NfC(@GCcp%k024Sl0iE(YP2?edhtbw}qv8t{DQL@S zJyqwI5M)TFHT1~AwsnLT7_(!|IL9h_H!eZH0W~s;h8$?Z2+Thagy9b4prZ-{Ia-(Y zw9OUXG&Y{vKAA zgW9z5%5o|P{$&D8fC(@GCcp%k025#WOn?b6fqw%5MQKNKfyzy9TmMfx<~>cE3Xyc1UF!k>!F-ueYC(bzans&jau)9M-2 z=UtAxmthkYwN3jx)GPkk1wFz(Up4->rb1#mHo1+)S}e564z1ltsVmA_vqg1+cF%M9 z-^PM>On?b60Vco%m;e)C0!)AjFaajO1pY<@+)k(>plS){>V;QG+fG8L4g5kbA>2s> zV(m2@#`Aa3DQ)EV|D71AuMWVMtHarADyYVR#^|RTu^^mLiZrHQbVbzx3>K1;hOFes zeiB?q_8(Uo%>|$h`Nt4`$6Nr8NdjoBzl=(q3xJmN)@F_9{mGZY(2aw$fTcs$xhW7TE7)Vwq_cXbi zA|>fY_Hv`OKS?mCi=MkMpr`}}k6nE(@D0!)AjFaajO1egF5U;<3wbOahs=iyA;>HM5cB#M%2?qW+o zFFrQkXP(Yu08R&--I)LrU;<2l2`~XBzyz286JP>NfC-!l0UDsM_M1--)6ucK2Hnfc zWIuZ9Zr@(C!CxS{q`RZzxDLJUCmNl)^jFKk)G3^dHuqmA87xqC>^I3kGmOFy9;mmz zW&(r>e_l>8k1;52CawGrpDvrupvCTD%?B3{0a{@mr@xyZ0T)a+xz;^~ZFHlnhc_m` z1egF5U;<2l2`~XBzyz286Zp3f&?zQg%Ih~M4$=^MnkSHq9OO?9%^<_r#{X0Oge*8- z1Zx=bN^=0vrUfNgj_fBv2-i{f=V&efgE(h%0la6xd;?CS|J#cp7(-uyikV80fEfX3 zeE(z|SBW}7Bm32{^&TP}Ze?)m z|F1Krnmd>enqD@|GG&-Lab184hkuy>6JP>NfC(@GCcp%k025#W|9ArGV0G%$OC7wa z<8L4A>6iwoYeULusJc;L%sG`ssYkdcjMht-j+^uhd+C{q(#}KSo%>E(M7TX7>aH9u zm#_<#1KHJ7>s1xWktD5Lj^6AiPnA5A2Q?@y9d{bfol$cM@=M1{*a&yUQ*F2&^L$U;<2l2`~XBzyz28 z6JP>NfC&gL|NpZu0xGVf>Zeh$pGxww*pG^xU|H-(ZB?)=_M?s~SQh(HGZrk1{V3_H z#eP)j1$$70s0a+E`aRy5025#WOn?b60Vco%m;e)C0!-k7A)r&r<@|pO(GpCvxcNYH zGFsOGYWb&r2DbVCsDA<-ieB&nhY!d^z$qpDzOg`u?qi{IXAO`38|4dGE&CyJK9+>qnAM^i}!{0DbfMaduzdtmq zANsIN#UylAYmNN%lfTvY)d6D|@<4O>HDIqwdYTEa(S}L>5T5?^2orAe>6q0|2Bsga z-wkFqq>VVy`IP6>;U*7VX6~c2OPxV%Gxs$nq2_x0*^!gG~OWy|RaZ>W6=s z025#WOn?b60Vco%m;e)C0!-lVOF*6DPAjcaH|+=*Zy0sOM?EgHP+3JavJgdBjfZnLN>~gQVPPJ(DL34D}R0t!MJo8pbiD-zjL2 zr+_o{Ont8@0ClQ9ts3Ns0#LogX+4uC3P5M{w4TY6R@>2pXM^&%z5EZKE}PT*e;(&# z#RQkw`QUsIu#NxkEvNm%wa{Bm`$vKt|4)J(|4)J(|4)J(|4)J(|4)LP_KyTP{+|Rn zp&$ufS`zfE;Ef3|0Vco%m;e)C0!)AjFaajO1pcN3xcvV&Ju&AB*wQ6Z+5|51&mmj6-trk4LvWv7<^QPHQC|54;u%m1iU?9Ju>b9Bx) z8YaL5m;e)C0!)AjFaajO1en18f`CpLX{LDZgVE;yqhqw}%B@CNL_FqbOQ#w5sMCRM zi-Z@LCg=I9pc(blN#`gUr^)#&bJY+!*(Jd-2cJ5RpX=;;9%7WJ6P1`D4inz-#sruE z6JP>NfC(@GCcp%k025#WOn?cT1p!4VkmY}hLMJwuYwbs85h({{0!)AjFaajO1egF5 zU;<2l2`~XBzyul*h--BGrw;#gB#QE|ndU#BO%WU4Rwe#r0!)AjFaajO1egF5U;<2l z2`~XBzy$s_1dJkK$|p-qm+y^x?v>qgUkTm5Rz#d;IqAdaBMZA;^%fgO%ECU(O~N4KFY>*CKhR z%~@_VE`Xe9vyTg)1mw5?TRO)TT3kL66H$&9<;F6%Qn6YMa^ZVV`AY)= ztk&c5w^Ro$`cm9OMAbF4mthovF!|Dl8q!0`W%5_(5vCt`#rpFNw3k_z0{vLA(cWN? z+NmQMY)Bk#%?&ATz5h))aYaY{$T=4fC(@GCcp%k025#WOn?b6 z0VeQQ5KxqDviwhJ8)X0gaPtAX&0nDrr^y7E025#WOn?b60Vco%m;e)C0!-lC5zv;z zJF6JAM)I?Xq~UA=YhnLNfC(@GCcp%kz`uilPWJG&9#?dFgPXh2!_&*#$Jb=`^ABj&yhUKkR;}9vwQbkF zgQa7q&Rx28>)zv{;E>?PTkjvGH=;-sA1y!?W~qRCT=r{_Lr;m3oAe){?Fk)yx-`rGfvj$4h2 zJ36iFsnSYAzZsu(-& zyL8`C_oiPGwlM6f!a&Uti?E3w^pEOmJ6Oa^`nnD;>AMSy@awQnKMOXu=`Zi_gMNel zHT?_l(@P6PbMdz|^fkm8d^9Vn)36@jySJp|aCPxsKh2tIDXhmU%1i4kxz+i_#g_aj zxmDFR)eOpqzFPSo{#-Vl!NboI*I*wD;Nr*!7X$&@`hPLn`hPJB+2G=d(UE{lFGf!S zF3=bQ3Am7B+(^J>9^*~|#&^VM12_b4On?b60Vco%m;e)C0!)AjFaaiTVG=MX4|$!e z{Qp165mi#e3Dj>0KPr7<^N!HYY}2-V zbl=zkgW?7!CJ(iA>J=8zrCUmBT1IBp=&{-3#!t-2D=3;$TvApst$KRx%of7F5^-$EqAgiTgfx#@ z8}&%kqaiK&7DO6j0%8JV+JpqAs9R=QCbn#B8F>Y?MS2!gV5=<~TT*gTKVF)jT19sr zH@$0G>Y>>b(7IV0U0dv%k(tt3+UPpcfk~&6M<;i0rYK5-(OnN?fN_IyhcUulV4B0UfC+?Y3DXLuHB1|rAegodG42fw!`wot zp?)eQW&HSplI#?vSu>rE{>fUYFUXr%YfLFg%g#2YjZG;op{h?*eO2U4OsObSq$l_K zx)yal#=7=(?TvLNBMw^B1=e-03##)`>?c%Di3YhhP;h;HT|!;|x}h>IUHuDBKJ?IC zzAJ|;f3Ch+eZrH6)hiAGthxrP44^*M5R^A2zyz286JP>NfC(@GCh&J7peUD9Yp~oh z#J5=gFW)@EJkT6$UT3 zq&%v`D&6S={IA*!!c&L_tJN&RC;MRTL`4`KRuLOV^t?Fg+tOVntK2=i#`%3XWXIBw z%>~m}cG&vXL+_0_+%jTzhPASU&KWoef?{(h)dvIdX!MWM{N;h8Lb<(16zu%WV zy749wVHRNyP0&Q& zSUJt=JS#`ZHd2nNm#oKK$Z1yRSvf6PCvwy=r#Xz%X;rP<#d%hak`O9{NTCz?z1ij<` z0!-?sh0Co;s|_`Jk!n`w8IzW*6Q%%C&gorcHpt17$?AM6=aj6b9IR0H+4wDw-g=F# zkOM#JmU=$*#mA7EtjiNu(?rVxeBm3;+=#Dcbw2e4Of6YWIhz%IyM*VdpLnpsOIhW6 zqLouX&DVL(f|jiIl7qfD4~_NJteo`Ktj@E(TCz_1;`z{6U(L!%U(M<~>#HT}s;}Ak z;RWRIC9TfCa%RQ#qlYwUbp7Jfs4-v7%89RLb)K`JCF{f&cV%OJH7h55HLLTiua>Nn zzQ}^P(PsdjdX2N7Svh4vvpUaN(2})hK~V-6Q^&-o!a?Bvd}omPm}w;U_9h&>{T&FT zjf@a)(3=DOM228>2wEkgtr&y#7D}=1LIL(Q*jHfHh>>DC>~rD6fUkrvSZh1|xL7C_!3k>H&5eRW}ON7OqpUr`#qZ zJz_TuzvMO(IgGcHD@SakyCYUDTmr(C!c;kh>~zYIsy&MUis+6(X;sUjZ~sl)C1EE$LVYdn_U%InlL6eyg4G-IV@xZYjUcaS$ImpAKS8 zg$xv`?Q}3Lhpu#XYM_U!JzcBBxC-ap*}r-=$)((O-BlxBqZ&C*&@@*}MqcT9Q|}TwpT)2%$5j=N{ba;bh+p;Gjso3E95)w9IQx~- zts3FgbeqdFQEj^mG{2<(boiur%Aso=uG<88HwVb;p%Sh&@=mJ8wb%+=UGF8j->1XQ zPsTnCGOcVw=dr!KKIqDtAufeY6{J+`%b_n3ZEL#%wEK@Z()COJO2K6qQcZ$USDa~Y zJI<0Z(3L{k(mk%acWJaj4aHXG%zz(QbUW|^i!PliTs*FiLw;cK1M6@0z`_Hp91q+P zc%BD0zuHZa8)@kW0FZ`-YbhQ!++QR(Aq3>QDc<_jK!U|pZQ<2$Qzwp7Z z(w6W;D`b(}3NlJe)NrP>CJBPBJqd6%bg=pK6pF5+X4wgrv^@@^$i{`n>w#&BCoF24 z_IapR{Id&sgnhnh{BK&~-=IQFOB`i0=foAItEz>fbc3Lv5f9_6b$ED_4n1scx{Dy_ zVWR|-r$3-G|+B7Y<&Ybi7t@~FDxpPNZ#}~W3yqOZxs|g7?nq0gigtt*_7G7_d zMpMYfA;|8s?EW9ExomG7$+Xeinod-qdU(Q#pwe68c<8HH-5Qf?%D3wpY_@=ncA!kP zV~sZAMg6&TBp|y&fj^*D&nqkyZ!P~mnyatP6+YcfQrv-dD$U6^d2Rtq)Qd$@GtAf4|Xze-mkxeVRRnQV~ zu&EI(WKPxdODWL5j37)#;DysdAMJEneDFa+A2U-BoQe|KCqgdnvpR9TQ$%e9-B=W?WYQdg9S!wY+ zx_>P))pE#cVT9Z!!7t^8g-(Dc@WR}C(jT6FaajO1egF5U;<2l z2`~XBzyz3pGXW1F{AF2KTBf0VT89!RHH+1vK8eb^g-#{TYLsSA?SpEaGEljp6}taS z2uBH*Dm_k@Vl}G!3s3>`lDJ1KLCPjnYFPhRwy0f&Dtjl(;y$&jonGxT@!`mbYYKHx z`Sclc4WnvBS^Y$ODDD+kLXufklxE&q`+rNc|DX9^hPU&Tz%mnH0!)AjFoAPNfGX(k zq;8n9kv|<@55rJzor(HuYFn*?tToH`K>fSc(WC(OTGt1y9$ahgrw$#X@q#XtVX3m8 z>cgvHitrM*pyoawfAk8c`t}GBhQAm{`S3x_{Pe0Xwl%+yEp?rtbbFw_o$CInww}_| z>@5gaE^VlT0;L-$yWrG9a`j0aEgWf&k?vZT3QMDJDm4YiM@Jpn0jOn}x_C3iSQBBB5AM_fh&oTGjZoD{cLaD) z0||BRAgXG!33U&tf-Rk+EbOTzggTj!5AEELc5Q5}jMUwOa!XxKwA6cnBe7LGCsiQ@ z>WZO$Y!!eAY2)a~R4uwWT?1-^4xLxhi_WOZh4eSsbexK=SfpipbdI8SmkeA3MY0Kq zzFSjeOG*vw+crU8JDb$)%JqBK;?$WZ_yjlC&oZJzp-HRuC-~JmkkL7+Yr;KCQl+bw z+@q0uVobCgvC8shlQ? z8fI)4OFsGaMn;th^V~R6*nl||^@GbI{&pG~K0!)AjFaajO1egF5U;<2l z2{3`v640YY%emt|)mk;j7z%e(O6i4BkBVm09ie=DV)MCkb2zp8R?8}{2vm~U1Jf)% zO$T-~A)nZ?PC5+0&`oLNU`N%^x$<0Gf{ItWX4IEn(`+O%Kt1)f0h!b z+hBC~nuIQ0Bhj5L1(gE{sGXpC52_IuEGt|(p`suFeWG-jP_|xCmOGXI*K+w^CCR@` zfC(@GCcp%k025#WOn?b60Vco%{z?Ky(el$zW!XQht6B#z4eFq5-{8>7wd_9wL%J|W zgDn43cZ+Dz#kKsu(y9Fa1egCE99d!lOn?b60Vco%m;e)C0!)AjFaaj;*Ag&_Z-3n> zYXO$;*d_N>P^h;e;=FA=0bJ_oH`R=0!)Aj zFaajO1egF5U;<2l3H&<=7)8$P9kPVKd)#MoUk(tMiL^oEL6aPip^c_OP^T_d#{p=w z6kt6A+5Vr75dq1VIl`s<|FBc}f8z}@9{ynhOn?b60Vco%m;e)C0!)AjFaajO1pYn* zjNQYYXE}g|MNX(^Z#-A|KE9SajuyF z6JP>NfC(@GCcp%k025#WOyKWDz$h+#>Q>p(f3LS%2T%(I&)xigT>k%iop08L2`~XB zzyz286JP>NfC(@GCcp&#%><0%1M-+Wzh-kAUs rU;<2l2`~XBzyz286JP>N;IATJ6wTHzktO_Vmfs=w)llUe&j0uSrAe>( literal 0 HcmV?d00001 diff --git a/src/test/java/com/healthmarketscience/jackcess/DatabaseTest.java b/src/test/java/com/healthmarketscience/jackcess/DatabaseTest.java index a25c6e8..6f101fa 100644 --- a/src/test/java/com/healthmarketscience/jackcess/DatabaseTest.java +++ b/src/test/java/com/healthmarketscience/jackcess/DatabaseTest.java @@ -1492,6 +1492,35 @@ public class DatabaseTest extends TestCase assertEquals("Row[1:1][{id=37,data=}]", row.toString()); } + public void testUnicodeCompression() throws Exception + { + File dbFile = new File("src/test/data/V2003/testUnicodeCompV2003.mdb"); + Database db = open(Database.FileFormat.V2003, new File("src/test/data/V2003/testUnicodeCompV2003.mdb")); + + StringBuilder sb = new StringBuilder(127); + for(int i = 1; i <= 0xFF; ++i) { + sb.append((char)i); + } + + String[] expectedStrs = { + "only ascii chars", + "\u00E4\u00E4kk\u00F6si\u00E4", + "\u041C\u0438\u0440", + "\u03F0\u03B1\u1F76 \u03C4\u1F79\u03C4' \u1F10\u03B3\u1F7C \u039A\u1F7B\u03F0\u03BB\u03C9\u03C0\u03B1", + "\u6F22\u5B57\u4EEE\u540D\u4EA4\u3058\u308A\u6587", + "3L9\u001D52\u0002_AB(\u00A5\u0005!!V", + "\u00FCmlaut", + sb.toString()}; + + for(Row row : db.getTable("Table")) { + int id = (Integer)row.get("ID"); + String str = (String)row.get("Unicode"); + assertEquals(expectedStrs[id-1], str); + } + + db.close(); + } + private void checkRawValue(String expected, Object val) { if(expected != null) { @@ -1536,7 +1565,7 @@ public class DatabaseTest extends TestCase } static String createNonAsciiString(int len) { - return createString(len, '\u00C0'); + return createString(len, '\u0CC0'); } private static String createString(int len, char firstChar) { diff --git a/src/test/java/com/healthmarketscience/jackcess/impl/scsu/CompressMain.java b/src/test/java/com/healthmarketscience/jackcess/impl/scsu/CompressMain.java deleted file mode 100644 index 52b9e86..0000000 --- a/src/test/java/com/healthmarketscience/jackcess/impl/scsu/CompressMain.java +++ /dev/null @@ -1,574 +0,0 @@ -package com.healthmarketscience.jackcess.impl.scsu; - -import java.io.*; -import java.util.*; - -/** - * This sample software accompanies Unicode Technical Report #6 and - * distributed as is by Unicode, Inc., subject to the following: - * - * Copyright 1996-1998 Unicode, Inc.. All Rights Reserved. - * - * Permission to use, copy, modify, and distribute this software - * without fee is hereby granted provided that this copyright notice - * appears in all copies. - * - * UNICODE, INC. MAKES NO REPRESENTATIONS OR WARRANTIES ABOUT THE - * SUITABILITY OF THE SOFTWARE, EITHER EXPRESS OR IMPLIED, INCLUDING - * BUT NOT LIMITED TO THE IMPLIED WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT. - * UNICODE, INC., SHALL NOT BE LIABLE FOR ANY ERRORS OR OMISSIONS, AND - * SHALL NOT BE LIABLE FOR ANY DAMAGES, INCLUDING CONSEQUENTIAL AND - * INCIDENTAL DAMAGES, SUFFERED BY YOU AS A RESULT OF USING, MODIFYING - * OR DISTRIBUTING THIS SOFTWARE OR ITS DERIVATIVES. - * - * @author Asmus Freytag - * - * @version 001 Dec 25 1996 - * @version 002 Jun 25 1997 - * @version 003 Jul 25 1997 - * @version 004 Aug 25 1997 - * @version 005 Sep 30 1998 - * - * Unicode and the Unicode logo are trademarks of Unicode, Inc., - * and are registered in some jurisdictions. - **/ - -/** - Class CompressMain - - A small commandline driver interface for the compression routines - Use the /? to get usage -*/ -public class CompressMain -{ - static void usage() - { - System.err.println("java CompressMain /? : this usage information\n"); - System.err.println("java CompressMain /random : random test\n"); - System.err.println("java CompressMain /suite : suite test\n"); - System.err.println("java CompressMain /suite : file test (file data may include \\uXXXX)\n"); - System.err.println("java CompressMain : string test (string may include \\uXXXX)\n"); - System.err.println("java CompressMain /roundtrip : check Unicode file for roundtrip\n"); - System.err.println("java CompressMain /compress : compresses Unicode files (no \\uXXXX)\n"); - System.err.println("java CompressMain /expand : expands into Unicode files\n"); - System.err.println("java CompressMain /byteswap : swaps byte order of Unicode files\n"); - System.err.println("java CompressMain /display : like expand, but creates a dump instead\n"); - System.err.println("java CompressMain /parse : parses \\uXXXX into binary Unicode\n"); - } - - static void analyze(String text, int inlength, String result, int outlength) - { - boolean fSuccess = text.equals(result); - Debug.out(fSuccess ? "Round trip OK" : "Round trip FAILED"); - if (!fSuccess && result != null) - { - int iLim = Math.min(text.length(), result.length()); - for (int i = 0; i < iLim; i++) - { - if (text.charAt(i) != result.charAt(i)) - { - Debug.out("First Mismatch at "+ i +"=", result.charAt(i) ); - Debug.out("Original character "+ i +"=", text.charAt(i) ); - break; - } - } - } - else - { - Debug.out("Compressed: "+inlength+" chars to "+outlength+" bytes."); - Debug.out(" Ratio: "+(outlength == 0 ? 0 :(outlength * 50 / inlength))+"%."); - } - } - - static void test2(String text) - { - byte bytes[] = null; - String result = null; - Debug.out("SCSU:\n"); - Compress compressor = new Compress(); - try - { - bytes = compressor.compress(text); - Expand display = new Expand(); - result = display.expand(bytes); - Debug.out("Input: ", text.toCharArray()); - Debug.out("Result: ", result.toCharArray()); - Debug.out(""); - Expand expander = new Expand(); - result = expander.expand(bytes); - } - catch (Exception e) - { - System.out.println(e); - } - int inlength = compressor.charsRead(); - int outlength = compressor.bytesWritten(); - analyze(text, inlength, result, outlength); - } - - static void test(String text) throws Exception - { - test(text, false); - } - - static void test(String text, boolean shouldFail) - throws Exception - { - // Create an instance of the compressor - Compress compressor = new Compress(); - - byte [] bytes = null; - String result = null; - Exception failure = null; - try { - // perform compression - bytes = compressor.compress(text); - } - catch(Exception e) - { - failure = e; - } - - if(shouldFail) { - if(failure == null) { - throw new RuntimeException("Did not fail"); - } - return; - } - - if(failure != null) { - throw failure; - } - - Expand expander = new Expand(); - // perform expansion - result = expander.expand(bytes); - - // analyze the results - int inlength = compressor.charsRead(); - int outlength = compressor.bytesWritten(); - analyze(text, inlength, result, outlength); - - } - - public static void display(byte [] input) - { - try - { - Expand expand = new Expand(); - String text = expand.expand(input); - Debug.out(text.toCharArray()); - } - catch (Exception e) - { - System.out.println(e); - } - } - - public static String parse(String input) - { - StringTokenizer st = new StringTokenizer(input, "\\", true); - Debug.out("Input: ", input); - - StringBuffer sb = new StringBuffer(); - - while(st.hasMoreTokens()) - { - String token = st.nextToken(); - Debug.out("Token: ", token); - if (token.charAt(0) == '\\' && token.length() == 1) - { - if(st.hasMoreTokens()) - { - token = st.nextToken(); - } - if(token.charAt(0) == 'u') - { - Debug.out("Token: "+ token+ " ", sb.toString()); - String hexnum; - if (token.length() > 5) - { - hexnum = token.substring(1,5); - token = token.substring(5); - } - else - { - hexnum = token.substring(1); - token = ""; - } - sb.append((char)Integer.parseInt(hexnum, 16)); - } - } - sb.append(token); - } - return sb.toString(); - } - - public static void randomTest(int nTest) - throws Exception - { - Random random = new Random(); - - for(int n=0; n < nTest; n++) - { - int iLen = (int) (20 * random.nextFloat()); - StringBuffer sb = new StringBuffer(iLen); - - for(int i = 0; i < iLen; i++) - { - sb.append((char) (0xFFFF * random.nextFloat())); - } - - test(sb.toString()); - } - } - - @SuppressWarnings("deprecation") - public static void fileTest(String name) - throws Exception - { - DataInputStream dis = new DataInputStream(new FileInputStream(name)); - - int iLine = 0; - - while(dis.available() != 0) - { - String line = dis.readLine(); - Debug.out("Line "+ iLine++ +" "+line); - test(parse(line), false ); //false);// initially no debug info - } - } - - public static void displayFile(String name) - throws IOException - { - DataInputStream dis = new DataInputStream(new FileInputStream(name)); - - byte bytes[] = new byte[dis.available()]; - dis.read(bytes); - display(bytes); - } - - public static void decodeTest(String name) - throws IOException - { - DataInputStream dis = new DataInputStream(new FileInputStream(name)); - - byte bytes[] = new byte[dis.available()]; - dis.read(bytes); - - Expand expand = new Expand(); - - char [] chars = null; - try - { - String text = expand.expand(bytes); - chars = text.toCharArray(); - } - catch (Exception e) - { - System.out.println(e); - } - int inlength = expand.bytesRead(); - int iDot = name.lastIndexOf('.'); - StringBuffer sb = new StringBuffer(name); - sb.setLength(iDot + 1); - sb.append("txt"); - String outName = sb.toString(); - - int outlength = expand.charsWritten(); - - Debug.out("Expanded "+name+": "+inlength+" bytes to "+outName+" " +outlength+" chars." + " Ratio: "+(outlength == 0 ? 0 :(outlength * 200 / inlength))+"%."); - - if (chars == null) - return; - - writeUnicodeFile(outName, chars); - } - - /** most of the next 3 functions should not be needed by JDK11 and later */ - private static int iMSB = 1; - - public static String readUnicodeFile(String name) - { - try - { - FileInputStream dis = new FileInputStream(name); - - byte b[] = new byte[2]; - StringBuffer sb = new StringBuffer(); - char ch = 0; - - iMSB = 1; - int i = 0; - for(i = 0; (dis.available() != 0); i++) - { - b[i%2] = (byte) dis.read(); - - if ((i & 1) == 1) - { - ch = Expand.charFromTwoBytes(b[(i + iMSB)%2], b[(i + iMSB + 1) % 2]); - } - else - { - continue; - } - if (i == 1 && ch == '\uFEFF') - continue; // throw away byte order mark - - if (i == 1 && ch == '\uFFFE') - { - iMSB ++; // flip byte order - continue; // throw away byte order mark - } - sb.append(ch); - } - - return sb.toString(); - } - catch (IOException e) - { - System.err.println(e); - return ""; - } - } - - public static void writeUnicodeFile(String outName, char [] chars) - throws IOException - { - DataOutputStream dos = new DataOutputStream(new FileOutputStream(outName)); - if ((iMSB & 1) == 1) - { - dos.writeByte(0xFF); - dos.writeByte(0xFE); - } - else - { - dos.writeByte(0xFE); - dos.writeByte(0xFF); - } - byte b[] = new byte[2]; - for (int ich = 0; ich < chars.length; ich++) - { - b[(iMSB + 0)%2] = (byte) (chars[ich] >>> 8); - b[(iMSB + 1)%2] = (byte) (chars[ich] & 0xFF); - dos.write(b, 0, 2); - } - } - - static void byteswap(String name) - throws IOException - { - String text = readUnicodeFile(name); - char chars[] = text.toCharArray(); - writeUnicodeFile(name, chars); - } - - @SuppressWarnings("deprecation") - public static void parseFile(String name) - throws IOException - { - DataInputStream dis = new DataInputStream(new FileInputStream(name)); - - byte bytes[] = new byte[dis.available()]; - dis.read(bytes); - - // simplistic test - int bom = (char) bytes[0] + (char) bytes[1]; - if (bom == 131069) - { - // FEFF or FFFE detected (either one sums to 131069) - Debug.out(name + " is already in Unicode!"); - return; - } - - // definitely assumes an ASCII file at this point - String text = new String(bytes, 0); - - char chars[] = parse(text).toCharArray(); - writeUnicodeFile(name, chars); - return; - } - - public static void encodeTest(String name) - throws Exception - { - String text = readUnicodeFile(name); - - // Create an instance of the compressor - Compress compressor = new Compress(); - - byte [] bytes = null; - - // perform compression - bytes = compressor.compress(text); - - int inlength = compressor.charsRead(); - int iDot = name.lastIndexOf('.'); - StringBuffer sb = new StringBuffer(name); - sb.setLength(iDot + 1); - sb.append("csu"); - String outName = sb.toString(); - - DataOutputStream dos = new DataOutputStream(new FileOutputStream(outName)); - dos.write(bytes, 0, bytes.length); - - int outlength = compressor.bytesWritten(); - - Debug.out("Compressed "+name+": "+inlength+" chars to "+outName+" " +outlength+" bytes." + " Ratio: "+(outlength == 0 ? 0 :(outlength * 50 / inlength))+"%."); - } - - public static void roundtripTest(String name) - throws Exception - { - test(readUnicodeFile(name), false);// no debug info - } - - /** The Main function */ - public static void main(String args[]) - throws Exception - { - int iArg = args.length; - - try - { - if (iArg != 0) - { - if (args[0].equalsIgnoreCase("/compress")) - { - while (--iArg > 0) - { - encodeTest(args[args.length - iArg]); - } - } - else if (args[0].equalsIgnoreCase("/parse")) - { - while (--iArg > 0) - { - parseFile(args[args.length - iArg]); - } - } - else if (args[0].equalsIgnoreCase("/expand")) - { - while (--iArg > 0) - { - decodeTest(args[args.length - iArg]); - } - } - else if (args[0].equalsIgnoreCase("/display")) - { - while (--iArg > 0) - { - displayFile(args[args.length - iArg]); - } - } - else if (args[0].equalsIgnoreCase("/roundtrip")) - { - while (--iArg > 0) - { - roundtripTest(args[args.length - iArg]); - } - } - else if (args[0].equalsIgnoreCase("/byteswap")) - { - while (--iArg > 0) - { - byteswap(args[args.length - iArg]); - } - }else if (args[0].equalsIgnoreCase("/random")) - { - randomTest(8); - } - else if (args[0].equalsIgnoreCase("/suite")) - { - if (iArg == 1) - { - suiteTest(); - } - else - { - while (--iArg > 0) - { - fileTest(args[args.length - iArg]); - } - } - } - else if (args[0].equalsIgnoreCase("/?")) - { - usage(); - } - else - { - while (iArg > 0) - { - test2(parse(args[--iArg])); - } - } - } - else - { - usage(); - } - } - catch (IOException e) - { - System.err.println(e); - } - try - { - System.err.println("Done. Press enter to exit"); - System.in.read(); - } - catch (IOException e) - { - - } - } - - static void suiteTest() - throws Exception - { - Debug.out("Standard Compression test suite:"); - test("Hello \u9292 \u9192 World!"); - test("Hell\u0429o \u9292 \u9192 W\u00e4rld!"); - test("Hell\u0429o \u9292 \u9292W\u00e4rld!"); - - test("\u0648\u06c8"); // catch missing reset - test("\u0648\u06c8"); - - test("\u4444\uE001"); // lowest quotable - test("\u4444\uf2FF"); // highest quotable - test("\u4444\uf188\u4444"); - test("\u4444\uf188\uf288"); - test("\u4444\uf188abc\0429\uf288"); - test("\u9292\u2222"); - test("Hell\u0429\u04230o \u9292 \u9292W\u00e4\u0192rld!"); - test("Hell\u0429o \u9292 \u9292W\u00e4rld!"); - test("Hello World!123456"); - test("Hello W\u0081\u011f\u0082!"); // Latin 1 run - - test("abc\u0301\u0302"); // uses SQn for u301 u302 - test("abc\u4411d"); // uses SQU - test("abc\u4411\u4412d");// uses SCU - test("abc\u0401\u0402\u047f\u00a5\u0405"); // uses SQn for ua5 - test("\u9191\u9191\u3041\u9191\u3041\u3041\u3000"); // SJIS like data - test("\u9292\u2222"); - test("\u9191\u9191\u3041\u9191\u3041\u3041\u3000"); - test("\u9999\u3051\u300c\u9999\u9999\u3060\u9999\u3065\u3065\u3065\u300c"); - test("\u3000\u266a\u30ea\u30f3\u30b4\u53ef\u611b\u3044\u3084\u53ef\u611b\u3044\u3084\u30ea\u30f3\u30b4\u3002"); - - test(""); // empty input - test("\u0000"); // smallest BMP character - test("\uFFFF"); // largest BMP character - - test("\ud800\udc00"); // smallest surrogate - test("\ud8ff\udcff"); // largest surrogate pair - - - Debug.out("\nTHESE TESTS ARE SUPPOSED TO FAIL:"); - test("\ud800 \udc00", true); // unpaired surrogate (1) - test("\udc00", true); // unpaired surrogate (2) - test("\ud800", true); // unpaired surrogate (3) - } -} diff --git a/src/test/java/com/healthmarketscience/jackcess/impl/scsu/CompressTest.java b/src/test/java/com/healthmarketscience/jackcess/impl/scsu/CompressTest.java deleted file mode 100644 index b9dc13a..0000000 --- a/src/test/java/com/healthmarketscience/jackcess/impl/scsu/CompressTest.java +++ /dev/null @@ -1,47 +0,0 @@ -/* -Copyright (c) 2007 Health Market Science, Inc. - -This library is free software; you can redistribute it and/or -modify it under the terms of the GNU Lesser General Public -License as published by the Free Software Foundation; either -version 2.1 of the License, or (at your option) any later version. - -This library is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -Lesser General Public License for more details. - -You should have received a copy of the GNU Lesser General Public -License along with this library; if not, write to the Free Software -Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 -USA - -You can contact Health Market Science at info@healthmarketscience.com -or at the following address: - -Health Market Science -2700 Horizon Drive -Suite 200 -King of Prussia, PA 19406 -*/ - -package com.healthmarketscience.jackcess.impl.scsu; - -import junit.framework.TestCase; - -/** - * @author James Ahlborn - */ -public class CompressTest extends TestCase -{ - - public CompressTest(String name) throws Exception { - super(name); - } - - public void testCompression() throws Exception - { - CompressMain.suiteTest(); - } - -} -- 2.39.5