aboutsummaryrefslogtreecommitdiffstats
path: root/src/main
diff options
context:
space:
mode:
authorJames Ahlborn <jtahlborn@yahoo.com>2014-11-15 04:06:17 +0000
committerJames Ahlborn <jtahlborn@yahoo.com>2014-11-15 04:06:17 +0000
commita0c7c8dc6840d10fc1aa3d27881ed76434b15ecb (patch)
tree4011e3fadadf8e414d7b8ca2258d99880fbfd3b3 /src/main
parentdccc4c16210d2da624f88ae9bbdbb55ca3f3de14 (diff)
downloadjackcess-a0c7c8dc6840d10fc1aa3d27881ed76434b15ecb.tar.gz
jackcess-a0c7c8dc6840d10fc1aa3d27881ed76434b15ecb.zip
rework unicode compression support, fixes issue 111
git-svn-id: https://svn.code.sf.net/p/jackcess/code/jackcess/trunk@892 f203690c-595d-4dc9-a70b-905162fa7fd2
Diffstat (limited to 'src/main')
-rw-r--r--src/main/java/com/healthmarketscience/jackcess/impl/ColumnImpl.java141
-rw-r--r--src/main/java/com/healthmarketscience/jackcess/impl/scsu/Compress.java628
-rw-r--r--src/main/java/com/healthmarketscience/jackcess/impl/scsu/Debug.java151
-rw-r--r--src/main/java/com/healthmarketscience/jackcess/impl/scsu/EndOfInputException.java49
-rw-r--r--src/main/java/com/healthmarketscience/jackcess/impl/scsu/EndOfOutputException.java48
-rw-r--r--src/main/java/com/healthmarketscience/jackcess/impl/scsu/Expand.java431
-rw-r--r--src/main/java/com/healthmarketscience/jackcess/impl/scsu/IllegalInputException.java48
-rw-r--r--src/main/java/com/healthmarketscience/jackcess/impl/scsu/SCSU.java252
8 files changed, 63 insertions, 1685 deletions
diff --git a/src/main/java/com/healthmarketscience/jackcess/impl/ColumnImpl.java b/src/main/java/com/healthmarketscience/jackcess/impl/ColumnImpl.java
index 5e3fe88..224348a 100644
--- a/src/main/java/com/healthmarketscience/jackcess/impl/ColumnImpl.java
+++ b/src/main/java/com/healthmarketscience/jackcess/impl/ColumnImpl.java
@@ -60,10 +60,6 @@ import com.healthmarketscience.jackcess.complex.ComplexColumnInfo;
import com.healthmarketscience.jackcess.complex.ComplexValue;
import com.healthmarketscience.jackcess.complex.ComplexValueForeignKey;
import com.healthmarketscience.jackcess.impl.complex.ComplexValueForeignKeyImpl;
-import com.healthmarketscience.jackcess.impl.scsu.Compress;
-import com.healthmarketscience.jackcess.impl.scsu.EndOfInputException;
-import com.healthmarketscience.jackcess.impl.scsu.Expand;
-import com.healthmarketscience.jackcess.impl.scsu.IllegalInputException;
import com.healthmarketscience.jackcess.util.ColumnValidator;
import com.healthmarketscience.jackcess.util.SimpleColumnValidator;
import org.apache.commons.lang.builder.ToStringBuilder;
@@ -163,6 +159,8 @@ public class ColumnImpl implements Column, Comparable<ColumnImpl> {
/** header used to indicate unicode text compression */
private static final byte[] TEXT_COMPRESSION_HEADER =
{ (byte)0xFF, (byte)0XFE };
+ private static final char MIN_COMPRESS_CHAR = 1;
+ private static final char MAX_COMPRESS_CHAR = 0xFF;
/** owning table */
@@ -1110,57 +1108,44 @@ public class ColumnImpl implements Column, Comparable<ColumnImpl> {
String decodeTextValue(byte[] data)
throws IOException
{
- try {
-
- // see if data is compressed. the 0xFF, 0xFE sequence indicates that
- // compression is used (sort of, see algorithm below)
- boolean isCompressed = ((data.length > 1) &&
- (data[0] == TEXT_COMPRESSION_HEADER[0]) &&
- (data[1] == TEXT_COMPRESSION_HEADER[1]));
-
- if(isCompressed) {
+ // see if data is compressed. the 0xFF, 0xFE sequence indicates that
+ // compression is used (sort of, see algorithm below)
+ boolean isCompressed = ((data.length > 1) &&
+ (data[0] == TEXT_COMPRESSION_HEADER[0]) &&
+ (data[1] == TEXT_COMPRESSION_HEADER[1]));
- Expand expander = new Expand();
+ if(isCompressed) {
- // this is a whacky compression combo that switches back and forth
- // between compressed/uncompressed using a 0x00 byte (starting in
- // compressed mode)
- StringBuilder textBuf = new StringBuilder(data.length);
- // start after two bytes indicating compression use
- int dataStart = TEXT_COMPRESSION_HEADER.length;
- int dataEnd = dataStart;
- boolean inCompressedMode = true;
- while(dataEnd < data.length) {
- if(data[dataEnd] == (byte)0x00) {
-
- // handle current segment
- decodeTextSegment(data, dataStart, dataEnd, inCompressedMode,
- expander, textBuf);
- inCompressedMode = !inCompressedMode;
- ++dataEnd;
- dataStart = dataEnd;
+ // this is a whacky compression combo that switches back and forth
+ // between compressed/uncompressed using a 0x00 byte (starting in
+ // compressed mode)
+ StringBuilder textBuf = new StringBuilder(data.length);
+ // start after two bytes indicating compression use
+ int dataStart = TEXT_COMPRESSION_HEADER.length;
+ int dataEnd = dataStart;
+ boolean inCompressedMode = true;
+ while(dataEnd < data.length) {
+ if(data[dataEnd] == (byte)0x00) {
+
+ // handle current segment
+ decodeTextSegment(data, dataStart, dataEnd, inCompressedMode,
+ textBuf);
+ inCompressedMode = !inCompressedMode;
+ ++dataEnd;
+ dataStart = dataEnd;
- } else {
- ++dataEnd;
- }
+ } else {
+ ++dataEnd;
}
- // handle last segment
- decodeTextSegment(data, dataStart, dataEnd, inCompressedMode,
- expander, textBuf);
+ }
+ // handle last segment
+ decodeTextSegment(data, dataStart, dataEnd, inCompressedMode, textBuf);
- return textBuf.toString();
+ return textBuf.toString();
- }
-
- return decodeUncompressedText(data, getCharset());
-
- } catch (IllegalInputException e) {
- throw (IOException)
- new IOException("Can't expand text column").initCause(e);
- } catch (EndOfInputException e) {
- throw (IOException)
- new IOException("Can't expand text column").initCause(e);
}
+
+ return decodeUncompressedText(data, getCharset());
}
/**
@@ -1168,25 +1153,29 @@ public class ColumnImpl implements Column, Comparable<ColumnImpl> {
* given status of the segment (compressed/uncompressed).
*/
private void decodeTextSegment(byte[] data, int dataStart, int dataEnd,
- boolean inCompressedMode, Expand expander,
+ boolean inCompressedMode,
StringBuilder textBuf)
- throws IllegalInputException, EndOfInputException
{
if(dataEnd <= dataStart) {
// no data
return;
}
int dataLength = dataEnd - dataStart;
+
if(inCompressedMode) {
- // handle compressed data
- byte[] tmpData = ByteUtil.copyOf(data, dataStart, dataLength);
- expander.reset();
- textBuf.append(expander.expand(tmpData));
- } else {
- // handle uncompressed data
- textBuf.append(decodeUncompressedText(data, dataStart, dataLength,
- getCharset()));
+ byte[] tmpData = new byte[dataLength * 2];
+ int tmpIdx = 0;
+ for(int i = dataStart; i < dataEnd; ++i) {
+ tmpData[tmpIdx] = data[i];
+ tmpIdx += 2;
+ }
+ data = tmpData;
+ dataStart = 0;
+ dataLength = data.length;
}
+
+ textBuf.append(decodeUncompressedText(data, dataStart, dataLength,
+ getCharset()));
}
/**
@@ -1215,41 +1204,37 @@ public class ColumnImpl implements Column, Comparable<ColumnImpl> {
// may only compress if column type allows it
if(!forceUncompressed && isCompressedUnicode() &&
- (text.length() <= getFormat().MAX_COMPRESSED_UNICODE_SIZE)) {
-
- // for now, only do very simple compression (only compress text which is
- // all ascii text)
- if(isAsciiCompressible(text)) {
-
- byte[] encodedChars = new byte[TEXT_COMPRESSION_HEADER.length +
- text.length()];
- encodedChars[0] = TEXT_COMPRESSION_HEADER[0];
- encodedChars[1] = TEXT_COMPRESSION_HEADER[1];
- for(int i = 0; i < text.length(); ++i) {
- encodedChars[i + TEXT_COMPRESSION_HEADER.length] =
- (byte)text.charAt(i);
- }
- return ByteBuffer.wrap(encodedChars);
+ (text.length() <= getFormat().MAX_COMPRESSED_UNICODE_SIZE) &&
+ isUnicodeCompressible(text)) {
+
+ byte[] encodedChars = new byte[TEXT_COMPRESSION_HEADER.length +
+ text.length()];
+ encodedChars[0] = TEXT_COMPRESSION_HEADER[0];
+ encodedChars[1] = TEXT_COMPRESSION_HEADER[1];
+ for(int i = 0; i < text.length(); ++i) {
+ encodedChars[i + TEXT_COMPRESSION_HEADER.length] =
+ (byte)text.charAt(i);
}
+ return ByteBuffer.wrap(encodedChars);
}
return encodeUncompressedText(text, getCharset());
}
/**
- * Returns {@code true} if the given text can be compressed using simple
- * ASCII encoding, {@code false} otherwise.
+ * Returns {@code true} if the given text can be compressed using compressed
+ * unicode, {@code false} otherwise.
*/
- private static boolean isAsciiCompressible(CharSequence text) {
+ private static boolean isUnicodeCompressible(CharSequence text) {
// only attempt to compress > 2 chars (compressing less than 3 chars would
// not result in a space savings due to the 2 byte compression header)
if(text.length() <= TEXT_COMPRESSION_HEADER.length) {
return false;
}
- // now, see if it is all printable ASCII
+ // now, see if it is all compressible characters
for(int i = 0; i < text.length(); ++i) {
char c = text.charAt(i);
- if(!Compress.isAsciiCrLfOrTab(c)) {
+ if((c < MIN_COMPRESS_CHAR) || (c > MAX_COMPRESS_CHAR)) {
return false;
}
}
diff --git a/src/main/java/com/healthmarketscience/jackcess/impl/scsu/Compress.java b/src/main/java/com/healthmarketscience/jackcess/impl/scsu/Compress.java
deleted file mode 100644
index 9428075..0000000
--- a/src/main/java/com/healthmarketscience/jackcess/impl/scsu/Compress.java
+++ /dev/null
@@ -1,628 +0,0 @@
-package com.healthmarketscience.jackcess.impl.scsu;
-
-/**
- * This sample software accompanies Unicode Technical Report #6 and
- * distributed as is by Unicode, Inc., subject to the following:
- *
- * Copyright 1996-1997 Unicode, Inc.. All Rights Reserved.
- *
- * Permission to use, copy, modify, and distribute this software
- * without fee is hereby granted provided that this copyright notice
- * appears in all copies.
- *
- * UNICODE, INC. MAKES NO REPRESENTATIONS OR WARRANTIES ABOUT THE
- * SUITABILITY OF THE SOFTWARE, EITHER EXPRESS OR IMPLIED, INCLUDING
- * BUT NOT LIMITED TO THE IMPLIED WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT.
- * UNICODE, INC., SHALL NOT BE LIABLE FOR ANY ERRORS OR OMISSIONS, AND
- * SHALL NOT BE LIABLE FOR ANY DAMAGES, INCLUDING CONSEQUENTIAL AND
- * INCIDENTAL DAMAGES, SUFFERED BY YOU AS A RESULT OF USING, MODIFYING
- * OR DISTRIBUTING THIS SOFTWARE OR ITS DERIVATIVES.
- *
- * @author Asmus Freytag
- *
- * @version 001 Dec 25 1996
- * @version 002 Jun 25 1997
- * @version 003 Jul 25 1997
- * @version 004 Aug 25 1997
- *
- * Unicode and the Unicode logo are trademarks of Unicode, Inc.,
- * and are registered in some jurisdictions.
- **/
-
-/**
- This class implements a simple compression algorithm
- **/
-/*
- Note on exception handling
- This compressor is designed so that it can be restarted after
- an exception. All operations advancing input and/or output cursor
- (iIn and iOut) either complete an action, or set a state (fUnicodeMode)
- before updating the cursors.
-*/
-public class Compress extends SCSU
-{
-
- /** next input character to be read **/
- private int iIn;
-
- /** next output byte to be written **/
- private int iOut;
-
- /** start index of Unicode mode in output array, or -1 if in single byte mode **/
- private int iSCU = -1;
-
- /** true if the next command byte is of the Uxx family */
- private boolean fUnicodeMode = false;
-
- /** locate a window for a character given a table of offsets
- @param ch - character
- @param offsetTable - table of window offsets
- @return true if the character fits a window from the table of windows */
- private boolean locateWindow(int ch, int[] offsetTable)
- {
- // always try the current window first
- int iWin = getCurrentWindow();
-
- // if the character fits the current window
- // just use the current window
- if (iWin != - 1 && ch >= offsetTable[iWin] && ch < offsetTable[iWin] + 0x80)
- {
- return true;
- }
-
- // try all windows in order
- for (iWin = 0; iWin < offsetTable.length; iWin++)
- {
- if (ch >= offsetTable[iWin] && ch < offsetTable[iWin] + 0x80)
- {
- selectWindow(iWin);
- return true;
- }
- }
- // none found
- return false;
- }
-
- /** returns true if the character is ASCII, but not a control other than CR, LF and TAB */
- public static boolean isAsciiCrLfOrTab(int ch)
- {
- return (ch >= 0x20 && ch <= 0x7F) // ASCII
- || ch == 0x09 || ch == 0x0A || ch == 0x0D; // CR/LF or TAB
-
- }
-
- /** output a run of characters in single byte mode
- In single byte mode pass through characters in the ASCII range, but
- quote characters overlapping with compression command codes. Runs
- of characters fitting the current window are output as runs of bytes
- in the range 0x80-0xFF. Checks for and validates Surrogate Pairs.
- Uses and updates the current input and output cursors store in
- the instance variables <i>iIn</i> and <i>iOut</i>.
- @param in - input character array
- @param out - output byte array
- @return the next chaacter to be processed. This may be an extended character.
- **/
- @SuppressWarnings("fallthrough")
- public int outputSingleByteRun(char [] in, byte [] out)
- throws EndOfOutputException, EndOfInputException, IllegalInputException
- {
- int iWin = getCurrentWindow();
- while(iIn < in.length)
- {
- int outlen = 0;
- byte byte1 = 0;
- byte byte2 = 0;
-
- // get the input character
- int ch = in[iIn];
-
- int inlen = 1;
-
- // Check input for Surrogate pair
- if ( (ch & 0xF800) == 0xD800 )
- {
- if ( (ch & 0xFC00) == 0xDC00 )
- {
- // low surrogate out of order
- throw new IllegalInputException("Unpaired low surrogate: "+iIn);
- }
- else
- {
- // have high surrogate now get low surrogate
- if ( iIn >= in.length-1)
- {
- // premature end of input
- throw new EndOfInputException();
- }
- // get the char
- int ch2 = in[iIn+1];
-
- // make sure it's a low surrogate
- if ( (ch2 & 0xFC00) != 0xDC00 )
- {
- // a low surrogate was required
- throw new IllegalInputException("Unpaired high surrogate: "+(iIn+1));
- }
-
- // combine the two values
- ch = ((ch - 0xD800)<<10 | (ch2-0xDC00))+0x10000;
- // ch = ch<<10 + ch2 - 0x36F0000;
-
- inlen = 2;
- }
- }
-
- // ASCII Letter, NUL, CR, LF and TAB are always passed through
- if (isAsciiCrLfOrTab(ch) || ch == 0)
- {
- // pass through directcly
- byte2 = (byte)(ch & 0x7F);
- outlen = 1;
- }
-
- // All other control codes must be quoted
- else if (ch < 0x20)
- {
- byte1 = SQ0;
- byte2 = (byte)(ch);
- outlen = 2;
- }
-
- // Letters that fit the current dynamic window
- else if (ch >= dynamicOffset[iWin] && ch < dynamicOffset[iWin] + 0x80)
- {
- ch -= dynamicOffset[iWin];
- byte2 = (byte)(ch | 0x80);
- outlen = 1;
- }
-
- // check for room in the output array
- if (iOut + outlen >= out.length)
- {
- throw new EndOfOutputException();
- }
-
- switch(outlen)
- {
- default:
- // need to use some other compression mode for this
- // character so we terminate this loop
-
- return ch; // input not finished
-
- // output the characters
- case 2:
- out[iOut++] = byte1;
- // fall through
- case 1:
- out[iOut++] = byte2;
- break;
- }
- // advance input pointer
- iIn += inlen;
- }
- return 0; // input all used up
- }
-
- /** quote a single character in single byte mode
- Quoting a character (aka 'non-locking shift') gives efficient access
- to characters that occur in isolation--usually punctuation characters.
- When quoting a character from a dynamic window use 0x80 - 0xFF, when
- quoting a character from a static window use 0x00-0x7f.
- @param ch - character to be quoted
- @param out - output byte array
- **/
-
- private void quoteSingleByte(int ch, byte [] out)
- throws EndOfOutputException
- {
- Debug.out("Quoting SingleByte ", ch);
- int iWin = getCurrentWindow();
-
- // check for room in the output array
- if (iOut >= out.length -2)
- {
- throw new EndOfOutputException();
- }
-
- // Output command byte followed by
- out[iOut++] = (byte)(SQ0 + iWin);
-
- // Letter that fits the current dynamic window
- if (ch >= dynamicOffset[iWin] && ch < dynamicOffset[iWin] + 0x80)
- {
- ch -= dynamicOffset[iWin];
- out[iOut++] = (byte)(ch | 0x80);
- }
-
- // Letter that fits the current static window
- else if (ch >= staticOffset[iWin] && ch < staticOffset[iWin] + 0x80)
- {
- ch -= staticOffset[iWin];
- out[iOut++] = (byte)ch;
- }
- else
- {
- throw new IllegalStateException("ch = "+ch+" not valid in quoteSingleByte. Internal Compressor Error");
- }
- // advance input pointer
- iIn ++;
- Debug.out("New input: ", iIn);
- }
-
- /** output a run of characters in Unicode mode
- A run of Unicode mode consists of characters which are all in the
- range of non-compressible characters or isolated occurrence
- of any other characters. Characters in the range 0xE00-0xF2FF must
- be quoted to avoid overlap with the Unicode mode compression command codes.
- Uses and updates the current input and output cursors store in
- the instance variables <i>iIn</i> and <i>iOut</i>.
- NOTE: Characters from surrogate pairs are passed through and unlike single
- byte mode no checks are made for unpaired surrogate characters.
- @param in - input character array
- @param out - output byte array
- @return the next input character to be processed
- **/
- public char outputUnicodeRun(char [] in, byte [] out)
- throws EndOfOutputException
- {
- // current character
- char ch = 0;
-
- while(iIn < in.length)
- {
- // get current input and set default output length
- ch = in[iIn];
- int outlen = 2;
-
- // Characters in these ranges could potentially be compressed.
- // We require 2 or more compressible characters to break the run
- if (isCompressible(ch))
- {
- // check whether we can look ahead
- if( iIn < in.length - 1)
- {
- // DEBUG
- Debug.out("is-comp: ",ch);
- char ch2 = in[iIn + 1];
- if (isCompressible(ch2))
- {
- // at least 2 characters are compressible
- // break the run
- break;
- }
- //DEBUG
- Debug.out("no-comp: ",ch2);
- }
- // If we get here, the current character is only character
- // left in the input or it is followed by a non-compressible
- // character. In neither case do we gain by breaking the
- // run, so we proceed to output the character.
- if (ch >= 0xE000 && ch <= 0xF2FF)
- {
- // Characters in this range need to be escaped
- outlen = 3;
- }
-
- }
- // check that there is enough room to output the character
- if(iOut >= out.length - outlen)
- {
- // DEBUG
- Debug.out("End of Output @", iOut);
- // if we got here, we ran out of space in the output array
- throw new EndOfOutputException();
- }
-
- // output any characters that cannot be compressed,
- if (outlen == 3)
- {
- // output the quote character
- out[iOut++] = UQU;
- }
- // pass the Unicode character in MSB,LSB order
- out[iOut++] = (byte)(ch >>> 8);
- out[iOut++] = (byte)(ch & 0xFF);
-
- // advance input cursor
- iIn++;
- }
-
- // return the last character
- return ch;
- }
-
- static int iNextWindow = 3;
-
- /** redefine a window so it surrounds a given character value
- For now, this function uses window 3 exclusively (window 4
- for extended windows);
- @return true if a window was successfully defined
- @param ch - character around which window is positioned
- @param out - output byte array
- @param fCurUnicodeMode - type of window
- **/
- private boolean positionWindow(int ch, byte [] out, boolean fCurUnicodeMode)
- throws IllegalInputException, EndOfOutputException
- {
- int iWin = iNextWindow % 8; // simple LRU
- int iPosition = 0;
-
- // iPosition 0 is a reserved value
- if (ch < 0x80)
- {
- throw new IllegalStateException("ch < 0x80");
- //return false;
- }
-
- // Check the fixed offsets
- for (int i = 0; i < fixedOffset.length; i++)
- {
- if (ch >= fixedOffset[i] && ch < fixedOffset[i] + 0x80)
- {
- iPosition = i;
- break;
- }
- }
-
- if (iPosition != 0)
- {
- // DEBUG
- Debug.out("FIXED position is ", iPosition + 0xF9);
-
- // ch fits in a fixed offset window position
- dynamicOffset[iWin] = fixedOffset[iPosition];
- iPosition += 0xF9;
- }
- else if (ch < 0x3400)
- {
- // calculate a window position command and set the offset
- iPosition = ch >>> 7;
- dynamicOffset[iWin] = ch & 0xFF80;
-
- Debug.out("Offset="+dynamicOffset[iWin]+", iPosition="+iPosition+" for char", ch);
- }
- else if (ch < 0xE000)
- {
- // attempt to place a window where none can go
- return false;
- }
- else if (ch <= 0xFFFF)
- {
- // calculate a window position command, accounting
- // for the gap in position values, and set the offset
- iPosition = ((ch - gapOffset)>>> 7);
-
- dynamicOffset[iWin] = ch & 0xFF80;
-
- Debug.out("Offset="+dynamicOffset[iWin]+", iPosition="+iPosition+" for char", ch);
- }
- else
- {
- // if we get here, the character is in the extended range.
- // Always use Window 4 to define an extended window
-
- iPosition = (ch - 0x10000) >>> 7;
- // DEBUG
- Debug.out("Try position Window at ", iPosition);
-
- iPosition |= iWin << 13;
- dynamicOffset[iWin] = ch & 0x1FFF80;
- }
-
- // Outputting window defintion command for the general cases
- if ( iPosition < 0x100 && iOut < out.length-1)
- {
- out[iOut++] = (byte) ((fCurUnicodeMode ? UD0 : SD0) + iWin);
- out[iOut++] = (byte) (iPosition & 0xFF);
- }
- // Output an extended window definiton command
- else if ( iPosition >= 0x100 && iOut < out.length - 2)
- {
-
- Debug.out("Setting extended window at ", iPosition);
- out[iOut++] = (fCurUnicodeMode ? UDX : SDX);
- out[iOut++] = (byte) ((iPosition >>> 8) & 0xFF);
- out[iOut++] = (byte) (iPosition & 0xFF);
- }
- else
- {
- throw new EndOfOutputException();
- }
- selectWindow(iWin);
- iNextWindow++;
- return true;
- }
-
- /**
- compress a Unicode character array with some simplifying assumptions
- **/
- public int simpleCompress(char [] in, int iStartIn, byte[] out, int iStartOut)
- throws IllegalInputException, EndOfInputException, EndOfOutputException
- {
- iIn = iStartIn;
- iOut = iStartOut;
-
-
- while (iIn < in.length)
- {
- int ch;
-
- // previously we switched to a Unicode run
- if (iSCU != -1)
- {
-
- Debug.out("Remaining", in, iIn);
- Debug.out("Output until ["+iOut+"]: ", out);
-
- // output characters as Unicode
- ch = outputUnicodeRun(in, out);
-
- // for single character Unicode runs (3 bytes) use quote
- if (iOut - iSCU == 3 )
- {
- // go back and fix up the SCU to an SQU instead
- out[iSCU] = SQU;
- iSCU = -1;
- continue;
- }
- else
- {
- iSCU = -1;
- fUnicodeMode = true;
- }
- }
- // next, try to output characters as single byte run
- else
- {
- ch = outputSingleByteRun(in, out);
- }
-
- // check whether we still have input
- if (iIn == in.length)
- {
- break; // no more input
- }
-
- // if we get here, we have a consistent value for ch, whether or
- // not it is an regular or extended character. Locate or define a
- // Window for the current character
-
- Debug.out("Output so far: ", out);
- Debug.out("Routing ch="+ch+" for Input", in, iIn);
-
- // Check that we have enough room to output the command byte
- if (iOut >= out.length - 1)
- {
- throw new EndOfOutputException();
- }
-
- // In order to switch away from Unicode mode, it is necessary
- // to select (or define) a window. If the characters that follow
- // the Unicode range are ASCII characters, we can't use them
- // to decide which window to select, since ASCII characters don't
- // influence window settings. This loop looks ahead until it finds
- // one compressible character that isn't in the ASCII range.
- for (int ich = iIn; ch < 0x80; ich++)
- {
- if (ich == in.length || !isCompressible(in[ich]))
- {
- // if there are only ASCII characters left,
- ch = in[iIn];
- break;
- }
- ch = in[ich]; // lookahead for next non-ASCII char
- }
- // The character value contained in ch here will only be used to select
- // output modes. Actual output of characters starts with in[iIn] and
- // only takes place near the top of the loop.
-
- int iprevWindow = getCurrentWindow();
-
- // try to locate a dynamic window
- if (ch < 0x80 || locateWindow(ch, dynamicOffset))
- {
- Debug.out("located dynamic window "+getCurrentWindow()+" at ", iOut+1);
- // lookahead to use SQn instead of SCn for single
- // character interruptions of runs in current window
- if(!fUnicodeMode && iIn < in.length -1)
- {
- char ch2 = in[iIn+1];
- if (ch2 >= dynamicOffset[iprevWindow] &&
- ch2 < dynamicOffset[iprevWindow] + 0x80)
- {
- quoteSingleByte(ch, out);
- selectWindow(iprevWindow);
- continue;
- }
- }
-
- out[iOut++] = (byte)((fUnicodeMode ? UC0 : SC0) + getCurrentWindow());
- fUnicodeMode = false;
- }
- // try to locate a static window
- else if (!fUnicodeMode && locateWindow(ch, staticOffset))
- {
- // static windows are not accessible from Unicode mode
- Debug.out("located a static window", getCurrentWindow());
- quoteSingleByte(ch, out);
- selectWindow(iprevWindow); // restore current Window settings
- continue;
- }
- // try to define a window around ch
- else if (positionWindow(ch, out, fUnicodeMode) )
- {
- fUnicodeMode = false;
- }
- // If all else fails, start a Unicode run
- else
- {
- iSCU = iOut;
- out[iOut++] = SCU;
- continue;
- }
- }
-
- return iOut - iStartOut;
- }
-
- public byte[] compress(String inStr)
- throws IllegalInputException, EndOfInputException
- {
- // Running out of room for output can cause non-optimal
- // compression. In order to not slow down compression too
- // much, not all intermediate state is constantly saved.
-
- byte [] out = new byte[inStr.length() * 2];
- char [] in = inStr.toCharArray();
- //DEBUG
- Debug.out("compress input: ",in);
- reset();
- while(true)
- {
- try
- {
- simpleCompress(in, charsRead(), out, bytesWritten());
- // if we get here things went fine.
- break;
- }
- catch (EndOfOutputException e)
- {
- // create a larger output buffer and continue
- byte [] largerOut = new byte[out.length * 2];
- System.arraycopy(out, 0, largerOut, 0, out.length);
- out = largerOut;
- }
- }
- byte [] trimmedOut = new byte[bytesWritten()];
- System.arraycopy(out, 0, trimmedOut, 0, trimmedOut.length);
- out = trimmedOut;
-
- Debug.out("compress output: ", out);
- return out;
- }
-
- /** reset is only needed to bail out after an exception and
- restart with new input */
- @Override
- public void reset()
- {
- super.reset();
- fUnicodeMode = false;
- iSCU = - 1;
- }
-
- /** returns the number of bytes written **/
- public int bytesWritten()
- {
- return iOut;
- }
-
- /** returns the number of bytes written **/
- public int charsRead()
- {
- return iIn;
- }
-
-}
diff --git a/src/main/java/com/healthmarketscience/jackcess/impl/scsu/Debug.java b/src/main/java/com/healthmarketscience/jackcess/impl/scsu/Debug.java
deleted file mode 100644
index c973765..0000000
--- a/src/main/java/com/healthmarketscience/jackcess/impl/scsu/Debug.java
+++ /dev/null
@@ -1,151 +0,0 @@
-package com.healthmarketscience.jackcess.impl.scsu;
-
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
-
-/*
- * This sample software accompanies Unicode Technical Report #6 and
- * distributed as is by Unicode, Inc., subject to the following:
- *
- * Copyright 1996-1997 Unicode, Inc.. All Rights Reserved.
- *
- * Permission to use, copy, modify, and distribute this software
- * without fee is hereby granted provided that this copyright notice
- * appears in all copies.
- *
- * UNICODE, INC. MAKES NO REPRESENTATIONS OR WARRANTIES ABOUT THE
- * SUITABILITY OF THE SOFTWARE, EITHER EXPRESS OR IMPLIED, INCLUDING
- * BUT NOT LIMITED TO THE IMPLIED WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT.
- * UNICODE, INC., SHALL NOT BE LIABLE FOR ANY ERRORS OR OMISSIONS, AND
- * SHALL NOT BE LIABLE FOR ANY DAMAGES, INCLUDING CONSEQUENTIAL AND
- * INCIDENTAL DAMAGES, SUFFERED BY YOU AS A RESULT OF USING, MODIFYING
- * OR DISTRIBUTING THIS SOFTWARE OR ITS DERIVATIVES.
- *
- * @author Asmus Freytag
- *
- * @version 001 Dec 25 1996
- * @version 002 Jun 25 1997
- * @version 003 Jul 25 1997
- * @version 004 Aug 25 1997
- *
- * Unicode and the Unicode logo are trademarks of Unicode, Inc.,
- * and are registered in some jurisdictions.
- **/
-
-/**
- * A number of helpful output routines for debugging. Output can be
- * centrally enabled or disabled by calling Debug.set(true/false);
- * All methods are statics;
- */
-
-public class Debug
-{
-
- private static final Log LOG = LogFactory.getLog(Debug.class);
-
- // debugging helper
- public static void out(char [] chars)
- {
- out(chars, 0);
- }
-
- public static void out(char [] chars, int iStart)
- {
- if (!LOG.isDebugEnabled()) return;
- StringBuilder msg = new StringBuilder();
-
- for (int i = iStart; i < chars.length; i++)
- {
- if (chars[i] >= 0 && chars[i] <= 26)
- {
- msg.append("^"+(char)(chars[i]+0x40));
- }
- else if (chars[i] <= 255)
- {
- msg.append(chars[i]);
- }
- else
- {
- msg.append("\\u"+Integer.toString(chars[i],16));
- }
- }
- LOG.debug(msg.toString());
- }
-
- public static void out(byte [] bytes)
- {
- out(bytes, 0);
- }
- public static void out(byte [] bytes, int iStart)
- {
- if (!LOG.isDebugEnabled()) return;
- StringBuilder msg = new StringBuilder();
-
- for (int i = iStart; i < bytes.length; i++)
- {
- msg.append(bytes[i]+",");
- }
- LOG.debug(msg.toString());
- }
-
- public static void out(String str)
- {
- if (!LOG.isDebugEnabled()) return;
-
- LOG.debug(str);
- }
-
- public static void out(String msg, int iData)
- {
- if (!LOG.isDebugEnabled()) return;
-
- LOG.debug(msg + iData);
- }
- public static void out(String msg, char ch)
- {
- if (!LOG.isDebugEnabled()) return;
-
- LOG.debug(msg + "[U+"+Integer.toString(ch,16)+"]" + ch);
- }
- public static void out(String msg, byte bData)
- {
- if (!LOG.isDebugEnabled()) return;
-
- LOG.debug(msg + bData);
- }
- public static void out(String msg, String str)
- {
- if (!LOG.isDebugEnabled()) return;
-
- LOG.debug(msg + str);
- }
- public static void out(String msg, char [] data)
- {
- if (!LOG.isDebugEnabled()) return;
-
- LOG.debug(msg);
- out(data);
- }
- public static void out(String msg, byte [] data)
- {
- if (!LOG.isDebugEnabled()) return;
-
- LOG.debug(msg);
- out(data);
- }
- public static void out(String msg, char [] data, int iStart)
- {
- if (!LOG.isDebugEnabled()) return;
-
- LOG.debug(msg +"("+iStart+"): ");
- out(data, iStart);
- }
- public static void out(String msg, byte [] data, int iStart)
- {
- if (!LOG.isDebugEnabled()) return;
-
- LOG.debug(msg+"("+iStart+"): ");
- out(data, iStart);
- }
-}
diff --git a/src/main/java/com/healthmarketscience/jackcess/impl/scsu/EndOfInputException.java b/src/main/java/com/healthmarketscience/jackcess/impl/scsu/EndOfInputException.java
deleted file mode 100644
index b3148a7..0000000
--- a/src/main/java/com/healthmarketscience/jackcess/impl/scsu/EndOfInputException.java
+++ /dev/null
@@ -1,49 +0,0 @@
-package com.healthmarketscience.jackcess.impl.scsu;
-
-/**
- * This sample software accompanies Unicode Technical Report #6 and
- * distributed as is by Unicode, Inc., subject to the following:
- *
- * Copyright 1996-1997 Unicode, Inc.. All Rights Reserved.
- *
- * Permission to use, copy, modify, and distribute this software
- * without fee is hereby granted provided that this copyright notice
- * appears in all copies.
- *
- * UNICODE, INC. MAKES NO REPRESENTATIONS OR WARRANTIES ABOUT THE
- * SUITABILITY OF THE SOFTWARE, EITHER EXPRESS OR IMPLIED, INCLUDING
- * BUT NOT LIMITED TO THE IMPLIED WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT.
- * UNICODE, INC., SHALL NOT BE LIABLE FOR ANY ERRORS OR OMISSIONS, AND
- * SHALL NOT BE LIABLE FOR ANY DAMAGES, INCLUDING CONSEQUENTIAL AND
- * INCIDENTAL DAMAGES, SUFFERED BY YOU AS A RESULT OF USING, MODIFYING
- * OR DISTRIBUTING THIS SOFTWARE OR ITS DERIVATIVES.
- *
- * @author Asmus Freytag
- *
- * @version 001 Dec 25 1996
- * @version 002 Jun 25 1997
- * @version 003 Jul 25 1997
- * @version 004 Aug 25 1997
- *
- * Unicode and the Unicode logo are trademarks of Unicode, Inc.,
- * and are registered in some jurisdictions.
- **/
-/**
- * The input string or input byte array ended prematurely
- *
- */
-public class EndOfInputException
- extends java.lang.Exception
-{
-
- private static final long serialVersionUID = 1L;
-
- public EndOfInputException(){
- super("The input string or input byte array ended prematurely");
- }
-
- public EndOfInputException(String s) {
- super(s);
- }
-}
diff --git a/src/main/java/com/healthmarketscience/jackcess/impl/scsu/EndOfOutputException.java b/src/main/java/com/healthmarketscience/jackcess/impl/scsu/EndOfOutputException.java
deleted file mode 100644
index 94f5be6..0000000
--- a/src/main/java/com/healthmarketscience/jackcess/impl/scsu/EndOfOutputException.java
+++ /dev/null
@@ -1,48 +0,0 @@
-package com.healthmarketscience.jackcess.impl.scsu;
-
-/**
- * This sample software accompanies Unicode Technical Report #6 and
- * distributed as is by Unicode, Inc., subject to the following:
- *
- * Copyright 1996-1997 Unicode, Inc.. All Rights Reserved.
- *
- * Permission to use, copy, modify, and distribute this software
- * without fee is hereby granted provided that this copyright notice
- * appears in all copies.
- *
- * UNICODE, INC. MAKES NO REPRESENTATIONS OR WARRANTIES ABOUT THE
- * SUITABILITY OF THE SOFTWARE, EITHER EXPRESS OR IMPLIED, INCLUDING
- * BUT NOT LIMITED TO THE IMPLIED WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT.
- * UNICODE, INC., SHALL NOT BE LIABLE FOR ANY ERRORS OR OMISSIONS, AND
- * SHALL NOT BE LIABLE FOR ANY DAMAGES, INCLUDING CONSEQUENTIAL AND
- * INCIDENTAL DAMAGES, SUFFERED BY YOU AS A RESULT OF USING, MODIFYING
- * OR DISTRIBUTING THIS SOFTWARE OR ITS DERIVATIVES.
- *
- * @author Asmus Freytag
- *
- * @version 001 Dec 25 1996
- * @version 002 Jun 25 1997
- * @version 003 Jul 25 1997
- *
- * Unicode and the Unicode logo are trademarks of Unicode, Inc.,
- * and are registered in some jurisdictions.
- **/
-/**
- * The output string or output byte array ended prematurely.
- */
-public class EndOfOutputException
-  extends java.lang.Exception
-{
-    private static final long serialVersionUID = 1L;
-
-    /** Creates the exception with the standard end-of-output message. */
-    public EndOfOutputException(){
-        super("The output string or output byte array ended prematurely");
-    }
-
-    /** @param s detail message to use instead of the standard one */
-    public EndOfOutputException(String s) {
-        super(s);
-    }
-}
diff --git a/src/main/java/com/healthmarketscience/jackcess/impl/scsu/Expand.java b/src/main/java/com/healthmarketscience/jackcess/impl/scsu/Expand.java
deleted file mode 100644
index 378ca2f..0000000
--- a/src/main/java/com/healthmarketscience/jackcess/impl/scsu/Expand.java
+++ /dev/null
@@ -1,431 +0,0 @@
-package com.healthmarketscience.jackcess.impl.scsu;
-
-/*
- * This sample software accompanies Unicode Technical Report #6 and
- * distributed as is by Unicode, Inc., subject to the following:
- *
- * Copyright 1996-1998 Unicode, Inc.. All Rights Reserved.
- *
- * Permission to use, copy, modify, and distribute this software
- * without fee is hereby granted provided that this copyright notice
- * appears in all copies.
- *
- * UNICODE, INC. MAKES NO REPRESENTATIONS OR WARRANTIES ABOUT THE
- * SUITABILITY OF THE SOFTWARE, EITHER EXPRESS OR IMPLIED, INCLUDING
- * BUT NOT LIMITED TO THE IMPLIED WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT.
- * UNICODE, INC., SHALL NOT BE LIABLE FOR ANY ERRORS OR OMISSIONS, AND
- * SHALL NOT BE LIABLE FOR ANY DAMAGES, INCLUDING CONSEQUENTIAL AND
- * INCIDENTAL DAMAGES, SUFFERED BY YOU AS A RESULT OF USING, MODIFYING
- * OR DISTRIBUTING THIS SOFTWARE OR ITS DERIVATIVES.
- *
- * @author Asmus Freytag
- *
- * @version 001 Dec 25 1996
- * @version 002 Jun 25 1997
- * @version 003 Jul 25 1997
- * @version 004 Aug 25 1997
- * @version 005 Sep 30 1998
- *
- * Unicode and the Unicode logo are trademarks of Unicode, Inc.,
- * and are registered in some jurisdictions.
- **/
-
- /**
- Reference decoder for the Standard Compression Scheme for Unicode (SCSU)
-
- <H2>Notes on the Java implementation</H2>
-
- A limitation of Java is the exclusive use of a signed byte data type.
- The following work arounds are required:
-
- Copying a byte to an integer variable and adding 256 for 'negative'
- bytes gives an integer in the range 0-255.
-
- Values of char are between 0x0000 and 0xFFFF in Java. Arithmetic on
- char values is unsigned.
-
- Extended characters require an int to store them. The sign is not an
- issue because only 1024*1024 + 65536 extended characters exist.
-
-**/
-public class Expand extends SCSU
-{
-    /** (Re-)define a dynamic window and make it the active one.
-        Dynamic windows cannot be placed at arbitrary Unicode values, so
-        the caller passes an index that picks one of the permitted
-        starting positions instead of an absolute offset.
-
-        Indices below gapThreshold map to index * 0x80, i.e. half-block
-        boundaries, where most scripts start. Han, Hangul, Surrogates and
-        the other scripts between 0x3400 and 0xDFFF show very poor
-        locality, so no window can be positioned there: indices from
-        gapThreshold up to reservedStart get gapOffset added in order to
-        skip that region.
-
-        Indices from fixedThreshold upwards select entries of the
-        fixedOffset table, which serves the few scripts that straddle
-        half-block boundaries. Index 0 and the indices from reservedStart
-        up to fixedThreshold are reserved.
-
-        @param iWindow - index of the window to be (re-)defined
-        @param bOffset - index for the new offset value
-        @throws IllegalInputException if bOffset is a reserved index
-     **/
-    // @005 protected <-- private here and elsewhere
-    protected void defineWindow(int iWindow, byte bOffset)
-        throws IllegalInputException
-    {
-        // widen the signed byte to its unsigned value 0..255
-        final int index = bOffset & 0xFF;
-
-        if (index == 0)
-        {
-            // 0 is a reserved value
-            throw new IllegalInputException();
-        }
-        if (index >= reservedStart && index < fixedThreshold)
-        {
-            // more reserved values
-            throw new IllegalInputException("iOffset == "+index);
-        }
-
-        final int newOffset;
-        if (index < gapThreshold)
-        {
-            newOffset = index << 7;
-        }
-        else if (index < reservedStart)
-        {
-            newOffset = (index << 7) + gapOffset;
-        }
-        else
-        {
-            newOffset = fixedOffset[index - fixedThreshold];
-        }
-        dynamicOffset[iWindow] = newOffset;
-
-        // make the redefined window the active one
-        selectWindow(iWindow);
-    }
-
-    /** (Re-)define a window as an extended dynamic window and select it.
-        The surrogate area in Unicode gives access to 2**20 codes beyond
-        the first 64K codes by pairing one of 1024 characters from the
-        High Surrogate Area with one of 1024 characters from the Low
-        Surrogate Area (see Unicode 2.0 for the details).
-
-        After an SDX or UDX tag, every subsequent byte in the range 80 to
-        FF stands for a surrogate pair. The bits of the two bytes that
-        follow the tag, plus those of a later data byte, are distributed
-        over the pair as follows:
-
-         hbyte         lbyte          data
-        nnnwwwww      zzzzzyyy      1xxxxxxx
-
-         high-surrogate     low-surrogate
-        110110wwwwwzzzzz   110111yyyxxxxxxx
-
-        @param chOffset - the top three bits (nnn) pick the window, which
-        saves command codes; the bottom 13 bits give the window offset
-        that a 7 bit data byte is later added to.
-     **/
-    protected void defineExtendedWindow(char chOffset)
-    {
-        // top 3 bits: index of the window being redefined
-        final int target = chOffset >>> 13;
-        // bottom 13 bits: window position, counted in 128-code units
-        final int position = chOffset & 0x1FFF;
-
-        // extended windows lie above the first 64K codes
-        dynamicOffset[target] = (1 << 16) + (position << 7);
-
-        // make the redefined window the active one
-        selectWindow(target);
-    }
-
- /** number of chars appended to the output by the current expansion */
- protected int iOut = 0;
-
- /** input cursor value recorded when the last expansion completed */
- protected int iIn = 0;
-
-    /** Expand input that is in Unicode mode. Decoding returns at the first
-        UCn, UDn or UDX tag, or when the input is used up.
-        @param in input byte array to be expanded
-        @param iCur starting index
-        @param sb string buffer to which to append expanded input
-        @return index of the last byte consumed (in.length at end of input)
-        @throws EndOfInputException if the input ends in the middle of a
-                character or of a tag's argument
-     **/
-    protected int expandUnicode(byte []in, int iCur, StringBuilder sb)
-        throws IllegalInputException, EndOfInputException
-    {
-        while( iCur < in.length )
-        {
-            byte b = in[iCur];
-            if (b >= UC0 && b <= UC7)
-            {
-                // UCn is a one-byte tag, so it is complete even as the
-                // very last byte of the input
-                Debug.out("SelectWindow: ", b);
-                selectWindow(b - UC0);
-                return iCur;
-            }
-            if( iCur >= in.length - 1)
-            {
-                break; // everything else needs at least one more byte
-            }
-            if (b >= UD0 && b <= UD7)
-            {
-                defineWindow( b - UD0, in[iCur+1]);
-                return iCur + 1;
-            }
-            if( (b == UDX || b == UQU) && iCur >= in.length - 2)
-            {
-                break; // the tag's two argument bytes are cut short
-            }
-            if (b == UDX)
-            {
-                defineExtendedWindow(charFromTwoBytes(in[iCur+1], in[iCur+2]));
-                return iCur + 2;
-            }
-            if (b == UQU)
-            {
-                iCur++; // skip the tag, the quoted character follows
-            }
-
-            // output a Unicode character
-            sb.append(charFromTwoBytes(in[iCur], in[iCur+1]));
-            iOut++;
-            iCur += 2;
-        }
-        if( iCur != in.length)
-        {
-            throw new EndOfInputException(); // dangling partial unit
-        }
-        return iCur;
-    }
-
-    /** Combine two signed Java bytes into one unsigned 16 bit char.
-        @param hi most significant byte
-        @param lo least significant byte
-        @return the character (hi << 8) | lo, with both bytes taken as
-                unsigned values
-     */
-    public static char charFromTwoBytes(byte hi, byte lo)
-    {
-        final int high = hi & 0xFF;
-        return (char)((high << 8) | (lo & 0xFF));
-    }
-
- /** expand the input, starting in single byte mode, and return the result **/
- @SuppressWarnings("fallthrough")
- protected String expandSingleByte(byte []in)
- throws IllegalInputException, EndOfInputException
- {
-
- /* Allocate the output buffer. Because of control codes, generally
- each byte of input results in fewer than one character of
- output. Using in.length as an initial allocation length should avoid
- the need to reallocate in mid-stream. The exception to this rule are
- surrogates. */
- StringBuilder sb = new StringBuilder(in.length);
- iOut = 0;
-
- // Loop until all input is exhausted or an error occurred
- int iCur;
- Loop:
- for( iCur = 0; iCur < in.length; iCur++ )
- {
- // DEBUG Debug.out("Expanding: ", iCur);
-
- // Default behaviour is that ASCII characters are passed through
- // (staticOffset[0] == 0) and characters with the high bit on are
- // offset by the current dynamic (or sliding) window (this.iWindow)
- int iStaticWindow = 0;
- int iDynamicWindow = getCurrentWindow();
-
- switch(in[iCur])
- {
- // Quote one character from window pair n (static or dynamic)
- case SQ0:
- case SQ1:
- case SQ2:
- case SQ3:
- case SQ4:
- case SQ5:
- case SQ6:
- case SQ7:
- Debug.out("SQn:", iStaticWindow);
- // skip the command byte and check for length
- if( iCur >= in.length - 1)
- {
- Debug.out("SQn missing argument: ", in, iCur);
- break Loop; // buffer length error
- }
- // Select window pair to quote from
- iDynamicWindow = iStaticWindow = in[iCur] - SQ0;
- iCur ++;
-
- // FALL THROUGH (the quoted byte is decoded by the default case)
-
- default:
- // output as character
- if(in[iCur] >= 0)
- {
- // use static window
- int ch = in[iCur] + staticOffset[iStaticWindow];
- sb.append((char)ch);
- iOut++;
- }
- else
- {
- // use dynamic window
- int ch = (in[iCur] + 256); // adjust for signed bytes
- ch -= 0x80; // reduce to range 00..7F
- ch += dynamicOffset[iDynamicWindow];
-
- //DEBUG
- Debug.out("Dynamic: ", (char) ch);
-
- if (ch < 1<<16)
- {
- // in Unicode range, output directly
- sb.append((char)ch);
- iOut++;
- }
- else
- {
- // this is an extension character
- Debug.out("Extension character: ", ch);
-
- // compute and append the two surrogates:
- // translate from 10000..10FFFF to 0..FFFFF
- ch -= 0x10000;
-
- // high surrogate = top 10 bits added to D800
- sb.append((char)(0xD800 + (ch>>10)));
- iOut++;
-
- // low surrogate = bottom 10 bits added to DC00 (the char cast drops bits 16+ that ~0xFC00 keeps)
- sb.append((char)(0xDC00 + (ch & ~0xFC00)));
- iOut++;
- }
- }
- break;
-
- // define a dynamic window as extended (two argument bytes follow)
- case SDX:
- iCur += 2;
- if( iCur >= in.length)
- {
- Debug.out("SDn missing argument: ", in, iCur -1);
- break Loop; // buffer length error
- }
- defineExtendedWindow(charFromTwoBytes(in[iCur-1], in[iCur]));
- break;
-
- // Position a dynamic Window (one argument byte follows)
- case SD0:
- case SD1:
- case SD2:
- case SD3:
- case SD4:
- case SD5:
- case SD6:
- case SD7:
- iCur ++;
- if( iCur >= in.length)
- {
- Debug.out("SDn missing argument: ", in, iCur -1);
- break Loop; // buffer length error
- }
- defineWindow(in[iCur-1] - SD0, in[iCur]);
- break;
-
- // Select a new dynamic Window
- case SC0:
- case SC1:
- case SC2:
- case SC3:
- case SC4:
- case SC5:
- case SC6:
- case SC7:
- selectWindow(in[iCur] - SC0);
- break;
- case SCU:
- // switch to Unicode mode and continue parsing
- iCur = expandUnicode(in, iCur+1, sb);
- // DEBUG Debug.out("Expanded Unicode range until: ", iCur);
- break;
-
- case SQU:
- // directly extract one Unicode character
- iCur += 2;
- if( iCur >= in.length)
- {
- Debug.out("SQU missing argument: ", in, iCur - 2);
- break Loop; // buffer length error
- }
- else
- {
- char ch = charFromTwoBytes(in[iCur-1], in[iCur]);
-
- Debug.out("Quoted: ", ch);
- sb.append(ch);
- iOut++;
- }
- break;
-
- case Srs:
- throw new IllegalInputException();
- // break;
- }
- }
-
- if( iCur >= in.length)
- {
- //SUCCESS: all input used up
- sb.setLength(iOut);
- iIn = iCur;
- return sb.toString();
- }
-
- Debug.out("Length ==" + in.length+" iCur =", iCur);
- //ERROR: premature end of input (reached only via "break Loop")
- throw new EndOfInputException();
- }
-
-    /** Decode an SCSU-compressed byte array into the string it encodes. */
-    public String expand (byte []in)
-        throws IllegalInputException, EndOfInputException
-    {
-        final String expanded = expandSingleByte(in);
-        Debug.out("expand output: ", expanded.toCharArray());
-        return expanded;
-    }
-
-
-    /** Prepare this instance for new input without creating a new one:
-        restores the inherited window state and zeroes both counters. */
-    @Override
-    public void reset()
-    {
-        super.reset();
-        iIn = 0;
-        iOut = 0;
-    }
-
-    /** @return the number of chars appended by the current expansion */
-    public int charsWritten() {
-        return this.iOut;
-    }
-
-    /** @return the input cursor recorded when the last expansion finished */
-    public int bytesRead() {
-        return this.iIn;
-    }
-}
diff --git a/src/main/java/com/healthmarketscience/jackcess/impl/scsu/IllegalInputException.java b/src/main/java/com/healthmarketscience/jackcess/impl/scsu/IllegalInputException.java
deleted file mode 100644
index b191f56..0000000
--- a/src/main/java/com/healthmarketscience/jackcess/impl/scsu/IllegalInputException.java
+++ /dev/null
@@ -1,48 +0,0 @@
-package com.healthmarketscience.jackcess.impl.scsu;
-
-/**
- * This sample software accompanies Unicode Technical Report #6 and
- * distributed as is by Unicode, Inc., subject to the following:
- *
- * Copyright 1996-1997 Unicode, Inc.. All Rights Reserved.
- *
- * Permission to use, copy, modify, and distribute this software
- * without fee is hereby granted provided that this copyright notice
- * appears in all copies.
- *
- * UNICODE, INC. MAKES NO REPRESENTATIONS OR WARRANTIES ABOUT THE
- * SUITABILITY OF THE SOFTWARE, EITHER EXPRESS OR IMPLIED, INCLUDING
- * BUT NOT LIMITED TO THE IMPLIED WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT.
- * UNICODE, INC., SHALL NOT BE LIABLE FOR ANY ERRORS OR OMISSIONS, AND
- * SHALL NOT BE LIABLE FOR ANY DAMAGES, INCLUDING CONSEQUENTIAL AND
- * INCIDENTAL DAMAGES, SUFFERED BY YOU AS A RESULT OF USING, MODIFYING
- * OR DISTRIBUTING THIS SOFTWARE OR ITS DERIVATIVES.
- *
- * @author Asmus Freytag
- *
- * @version 001 Dec 25 1996
- * @version 002 Jun 25 1997
- * @version 003 Jul 25 1997
- * @version 004 Aug 25 1997
- *
- * Unicode and the Unicode logo are trademarks of Unicode, Inc.,
- * and are registered in some jurisdictions.
- **/
-/**
- * Thrown when input (chars or bytes) contains illegal sequences.
- */
-public class IllegalInputException extends Exception
-{
-    private static final long serialVersionUID = 1L;
-
-    /** Creates the exception with the standard message. */
-    public IllegalInputException() {
-        this("The input character array or input byte array contained illegal sequences of bytes or characters");
-    }
-
-    /** @param s detail message to report instead of the standard one */
-    public IllegalInputException(String s) {
-        super(s);
-    }
-}
diff --git a/src/main/java/com/healthmarketscience/jackcess/impl/scsu/SCSU.java b/src/main/java/com/healthmarketscience/jackcess/impl/scsu/SCSU.java
deleted file mode 100644
index 7859780..0000000
--- a/src/main/java/com/healthmarketscience/jackcess/impl/scsu/SCSU.java
+++ /dev/null
@@ -1,252 +0,0 @@
-package com.healthmarketscience.jackcess.impl.scsu;
-
-/*
- * This sample software accompanies Unicode Technical Report #6 and
- * distributed as is by Unicode, Inc., subject to the following:
- *
- * Copyright 1996-1998 Unicode, Inc.. All Rights Reserved.
- *
- * Permission to use, copy, modify, and distribute this software
- * without fee is hereby granted provided that this copyright notice
- * appears in all copies.
- *
- * UNICODE, INC. MAKES NO REPRESENTATIONS OR WARRANTIES ABOUT THE
- * SUITABILITY OF THE SOFTWARE, EITHER EXPRESS OR IMPLIED, INCLUDING
- * BUT NOT LIMITED TO THE IMPLIED WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT.
- * UNICODE, INC., SHALL NOT BE LIABLE FOR ANY ERRORS OR OMISSIONS, AND
- * SHALL NOT BE LIABLE FOR ANY DAMAGES, INCLUDING CONSEQUENTIAL AND
- * INCIDENTAL DAMAGES, SUFFERED BY YOU AS A RESULT OF USING, MODIFYING
- * OR DISTRIBUTING THIS SOFTWARE OR ITS DERIVATIVES.
- *
- * @author Asmus Freytag
- *
- * @version 001 Dec 25 1996
- * @version 002 Jun 25 1997
- * @version 003 Jul 25 1997
- * @version 004 Aug 25 1997
- * @version 005 Sep 30 1998
- *
- * Unicode and the Unicode logo are trademarks of Unicode, Inc.,
- * and are registered in some jurisdictions.
- **/
-
- /**
- Encoding text data in Unicode often requires more storage than using
- an existing 8-bit character set and limited to the subset of characters
- actually found in the text. The Unicode Compression Algorithm reduces
- the necessary storage while retaining the universality of Unicode.
- A full description of the algorithm can be found in document
- http://www.unicode.org/unicode/reports/tr6.html
-
- Summary
-
- The goal of the Unicode Compression Algorithm is the ability to
- * Express all code points in Unicode
- * Approximate storage size for traditional character sets
- * Work well for short strings
- * Provide transparency for Latin-1 data
- * Support very simple decoders
- * Support simple as well as sophisticated encoders
-
- If needed, further compression can be achieved by layering standard
- file or disk-block based compression algorithms on top.
-
- <H2>Features</H2>
-
- Languages using small alphabets would contain runs of characters that
- are coded close together in Unicode. These runs are interrupted only
- by punctuation characters, which are themselves coded in proximity to
- each other in Unicode (usually in the ASCII range).
-
- Two basic mechanisms in the compression algorithm account for these two
- cases, sliding windows and static windows. A window is an area of 128
- consecutive characters in Unicode. In the compressed data stream, each
- character from a sliding window would be represented as a byte between
- 0x80 and 0xFF, while a byte from 0x20 to 0x7F (as well as CR, LF, and
- TAB) would always mean an ASCII character (or control).
-
- <H2>Notes on the Java implementation</H2>
-
- A limitation of Java is the exclusive use of a signed byte data type.
- The following work arounds are required:
-
- Copying a byte to an integer variable and adding 256 for 'negative'
- bytes gives an integer in the range 0-255.
-
- Values of char are between 0x0000 and 0xFFFF in Java. Arithmetic on
- char values is unsigned.
-
- Extended characters require an int to store them. The sign is not an
- issue because only 1024*1024 + 65536 extended characters exist.
-
-**/
-public abstract class SCSU
-{
- /** Single Byte mode command values */
-
- /** SQ<i>n</i> Quote from Window . <p>
- If the following byte is less than 0x80, quote from
- static window <i>n</i>, else quote from dynamic window <i>n</i>.
- */
-
- static final byte SQ0 = 0x01; // Quote from window pair 0
- static final byte SQ1 = 0x02; // Quote from window pair 1
- static final byte SQ2 = 0x03; // Quote from window pair 2
- static final byte SQ3 = 0x04; // Quote from window pair 3
- static final byte SQ4 = 0x05; // Quote from window pair 4
- static final byte SQ5 = 0x06; // Quote from window pair 5
- static final byte SQ6 = 0x07; // Quote from window pair 6
- static final byte SQ7 = 0x08; // Quote from window pair 7
-
- static final byte SDX = 0x0B; // Define a window as extended
- static final byte Srs = 0x0C; // reserved
-
- static final byte SQU = 0x0E; // Quote a single Unicode character
- static final byte SCU = 0x0F; // Change to Unicode mode
-
- /** SC<i>n</i> Change to Window <i>n</i>. <p>
- If the following bytes are less than 0x80, interpret them
- as command bytes or pass them through, else add the offset
- for dynamic window <i>n</i>. */
- static final byte SC0 = 0x10; // Select window 0
- static final byte SC1 = 0x11; // Select window 1
- static final byte SC2 = 0x12; // Select window 2
- static final byte SC3 = 0x13; // Select window 3
- static final byte SC4 = 0x14; // Select window 4
- static final byte SC5 = 0x15; // Select window 5
- static final byte SC6 = 0x16; // Select window 6
- static final byte SC7 = 0x17; // Select window 7
- static final byte SD0 = 0x18; // Define and select window 0
- static final byte SD1 = 0x19; // Define and select window 1
- static final byte SD2 = 0x1A; // Define and select window 2
- static final byte SD3 = 0x1B; // Define and select window 3
- static final byte SD4 = 0x1C; // Define and select window 4
- static final byte SD5 = 0x1D; // Define and select window 5
- static final byte SD6 = 0x1E; // Define and select window 6
- static final byte SD7 = 0x1F; // Define and select window 7
- /** Unicode mode command values (negative as Java bytes, hence the casts) */
- static final byte UC0 = (byte) 0xE0; // Select window 0
- static final byte UC1 = (byte) 0xE1; // Select window 1
- static final byte UC2 = (byte) 0xE2; // Select window 2
- static final byte UC3 = (byte) 0xE3; // Select window 3
- static final byte UC4 = (byte) 0xE4; // Select window 4
- static final byte UC5 = (byte) 0xE5; // Select window 5
- static final byte UC6 = (byte) 0xE6; // Select window 6
- static final byte UC7 = (byte) 0xE7; // Select window 7
- static final byte UD0 = (byte) 0xE8; // Define and select window 0
- static final byte UD1 = (byte) 0xE9; // Define and select window 1
- static final byte UD2 = (byte) 0xEA; // Define and select window 2
- static final byte UD3 = (byte) 0xEB; // Define and select window 3
- static final byte UD4 = (byte) 0xEC; // Define and select window 4
- static final byte UD5 = (byte) 0xED; // Define and select window 5
- static final byte UD6 = (byte) 0xEE; // Define and select window 6
- static final byte UD7 = (byte) 0xEF; // Define and select window 7
-
- static final byte UQU = (byte) 0xF0; // Quote a single Unicode character
- static final byte UDX = (byte) 0xF1; // Define a Window as extended
- static final byte Urs = (byte) 0xF2; // reserved
-
- /** constant start offsets of the 8 static windows, indexed by window number */
- static final int staticOffset[] =
- {
- 0x0000, // ASCII for quoted tags
- 0x0080, // Latin - 1 Supplement (for access to punctuation)
- 0x0100, // Latin Extended-A
- 0x0300, // Combining Diacritical Marks
- 0x2000, // General Punctuation
- 0x2080, // Currency Symbols
- 0x2100, // Letterlike Symbols and Number Forms
- 0x3000 // CJK Symbols and punctuation
- };
-
- /** offsets the 8 dynamic (sliding) windows have initially and after reset() */
- static final int initialDynamicOffset[] =
- {
- 0x0080, // Latin-1
- 0x00C0, // Latin-1 Letters + half of Latin Extended A //@005 fixed from 0x0100
- 0x0400, // Cyrillic
- 0x0600, // Arabic
- 0x0900, // Devanagari
- 0x3040, // Hiragana
- 0x30A0, // Katakana
- 0xFF00 // Fullwidth ASCII
- };
-
- /** dynamic window offsets, initialized to the default values. */
- int dynamicOffset[] =
- {
- initialDynamicOffset[0],
- initialDynamicOffset[1],
- initialDynamicOffset[2],
- initialDynamicOffset[3],
- initialDynamicOffset[4],
- initialDynamicOffset[5],
- initialDynamicOffset[6],
- initialDynamicOffset[7]
- };
-
- // The following method is common to encoder and decoder
-
-    private int iWindow = 0; // index of the currently active dynamic window
-
-    /** Make the given dynamic window the active one.
-        @param iWindow index of the window to activate **/
-    protected void selectWindow(int iWindow) {
-        this.iWindow = iWindow;
-    }
-
-    /** @return the index of the active dynamic window, as last set by
-        selectWindow (0 initially and after reset) **/
-    protected int getCurrentWindow() {
-        return iWindow;
-    }
-
- /**
- The following values are used in defineWindow
- **/
-
- /**
- * Unicode code points from 3400 up to (but excluding) E000 are not
- * addressable by dynamic window, since no short run alphabets are found
- * there: 0x68 << 7 is 0x3400, and adding gapOffset to it gives 0xE000 */
- static final int gapThreshold = 0x68;
- static final int gapOffset = 0xAC00;
-
- /* values from reservedStart up to (excluding) fixedThreshold are reserved */
- static final int reservedStart = 0xA8;
-
- /* use table of predefined fixed offsets for values from fixedThreshold */
- static final int fixedThreshold = 0xF9;
-
- /** Table of fixed predefined offsets, with the byte value that selects each entry **/
- static final int fixedOffset[] =
- {
- /* 0xF9 */ 0x00C0, // Latin-1 Letters + half of Latin Extended A
- /* 0xFA */ 0x0250, // IPA extensions
- /* 0xFB */ 0x0370, // Greek
- /* 0xFC */ 0x0530, // Armenian
- /* 0xFD */ 0x3040, // Hiragana
- /* 0xFE */ 0x30A0, // Katakana
- /* 0xFF */ 0xFF60 // Halfwidth Katakana
- };
-
-    /** @return true unless ch lies in the range 0x3400..0xDFFF **/
-    public static boolean isCompressible(char ch)
-    {
-        return !(ch >= 0x3400 && ch < 0xE000);
-    }
-
-    /** Restore the initial state: every dynamic window returns to its
-        default offset and window 0 becomes the active one again.
-        Only needed to bail out after an exception and restart with new
-        input. */
-    public void reset()
-    {
-        this.iWindow = 0;
-
-        // reset the dynamic windows
-        System.arraycopy(initialDynamicOffset, 0,
-                         dynamicOffset, 0, dynamicOffset.length);
-    }
-}