From 4af4fe44512c2c885498794c7a5a6e878318b2ef Mon Sep 17 00:00:00 2001 From: James Ahlborn Date: Tue, 22 Jul 2008 19:00:25 +0000 Subject: [PATCH] Add compression code for possible future use; add compression unit tests. git-svn-id: https://svn.code.sf.net/p/jackcess/code/jackcess/trunk@365 f203690c-595d-4dc9-a70b-905162fa7fd2 --- src/changes/changes.xml | 4 + .../healthmarketscience/jackcess/Column.java | 13 +- .../jackcess/scsu/Compress.java | 628 ++++++++++++++++++ .../jackcess/scsu/EndOfOutputException.java | 48 ++ .../jackcess/scsu/Expand.java | 1 + .../jackcess/scsu/CompressMain.java | 574 ++++++++++++++++ .../jackcess/scsu/CompressTest.java | 47 ++ 7 files changed, 1304 insertions(+), 11 deletions(-) create mode 100644 src/java/com/healthmarketscience/jackcess/scsu/Compress.java create mode 100644 src/java/com/healthmarketscience/jackcess/scsu/EndOfOutputException.java create mode 100644 test/src/java/com/healthmarketscience/jackcess/scsu/CompressMain.java create mode 100644 test/src/java/com/healthmarketscience/jackcess/scsu/CompressTest.java diff --git a/src/changes/changes.xml b/src/changes/changes.xml index cef026e..be4bedf 100644 --- a/src/changes/changes.xml +++ b/src/changes/changes.xml @@ -25,6 +25,10 @@ Add primitive support for writing unicode compressed text columns. + + Add compression code for possible future use; add compression unit + tests. + diff --git a/src/java/com/healthmarketscience/jackcess/Column.java b/src/java/com/healthmarketscience/jackcess/Column.java index 8d27ae4..570d104 100644 --- a/src/java/com/healthmarketscience/jackcess/Column.java +++ b/src/java/com/healthmarketscience/jackcess/Column.java @@ -41,6 +41,7 @@ import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; +import com.healthmarketscience.jackcess.scsu.Compress; import com.healthmarketscience.jackcess.scsu.EndOfInputException; import com.healthmarketscience.jackcess.scsu.Expand; import com.healthmarketscience.jackcess.scsu.IllegalInputException; @@ -1196,23 +1197,13 @@ public class Column implements Comparable { // now, see if it is all printable ASCII for(int i = 0; i < text.length(); ++i) { char c = text.charAt(i); - if(!isAsciiCrLfOrTab(c)) { + if(!Compress.isAsciiCrLfOrTab(c)) { return false; } } return true; } - /** - * Returns true if the character is ASCII, but not a control other than - * CR, LF and TAB - */ - private static boolean isAsciiCrLfOrTab(int ch) - { - return ((ch >= 0x20 && ch <= 0x7F) // ASCII (non control) - || ch == 0x09 || ch == 0x0A || ch == 0x0D); // CR/LF or TAB - } - @Override public String toString() { StringBuilder rtn = new StringBuilder(); diff --git a/src/java/com/healthmarketscience/jackcess/scsu/Compress.java b/src/java/com/healthmarketscience/jackcess/scsu/Compress.java new file mode 100644 index 0000000..c5f7360 --- /dev/null +++ b/src/java/com/healthmarketscience/jackcess/scsu/Compress.java @@ -0,0 +1,628 @@ +package com.healthmarketscience.jackcess.scsu; + +/** + * This sample software accompanies Unicode Technical Report #6 and + * distributed as is by Unicode, Inc., subject to the following: + * + * Copyright 1996-1997 Unicode, Inc.. All Rights Reserved. + * + * Permission to use, copy, modify, and distribute this software + * without fee is hereby granted provided that this copyright notice + * appears in all copies. + * + * UNICODE, INC. MAKES NO REPRESENTATIONS OR WARRANTIES ABOUT THE + * SUITABILITY OF THE SOFTWARE, EITHER EXPRESS OR IMPLIED, INCLUDING + * BUT NOT LIMITED TO THE IMPLIED WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT. + * UNICODE, INC., SHALL NOT BE LIABLE FOR ANY ERRORS OR OMISSIONS, AND + * SHALL NOT BE LIABLE FOR ANY DAMAGES, INCLUDING CONSEQUENTIAL AND + * INCIDENTAL DAMAGES, SUFFERED BY YOU AS A RESULT OF USING, MODIFYING + * OR DISTRIBUTING THIS SOFTWARE OR ITS DERIVATIVES. + * + * @author Asmus Freytag + * + * @version 001 Dec 25 1996 + * @version 002 Jun 25 1997 + * @version 003 Jul 25 1997 + * @version 004 Aug 25 1997 + * + * Unicode and the Unicode logo are trademarks of Unicode, Inc., + * and are registered in some jurisdictions. + **/ + +/** + This class implements a simple compression algorithm + **/ +/* + Note on exception handling + This compressor is designed so that it can be restarted after + an exception. All operations advancing input and/or output cursor + (iIn and iOut) either complete an action, or set a state (fUnicodeMode) + before updating the cursors. +*/ +public class Compress extends SCSU +{ + + /** next input character to be read **/ + private int iIn; + + /** next output byte to be written **/ + private int iOut; + + /** start index of Unicode mode in output array, or -1 if in single byte mode **/ + private int iSCU = -1; + + /** true if the next command byte is of the Uxx family */ + private boolean fUnicodeMode = false; + + /** locate a window for a character given a table of offsets + @param ch - character + @param offsetTable - table of window offsets + @return true if the character fits a window from the table of windows */ + private boolean locateWindow(int ch, int[] offsetTable) + { + // always try the current window first + int iWin = getCurrentWindow(); + + // if the character fits the current window + // just use the current window + if (iWin != - 1 && ch >= offsetTable[iWin] && ch < offsetTable[iWin] + 0x80) + { + return true; + } + + // try all windows in order + for (iWin = 0; iWin < offsetTable.length; iWin++) + { + if (ch >= offsetTable[iWin] && ch < offsetTable[iWin] + 0x80) + { + selectWindow(iWin); + return true; + } + } + // none found + return false; + } + + /** returns true if the character is ASCII, but not a control other than CR, LF and TAB */ + public static boolean isAsciiCrLfOrTab(int ch) + { + return (ch >= 0x20 && ch <= 0x7F) // ASCII + || ch == 0x09 || ch == 0x0A || ch == 0x0D; // CR/LF or TAB + + } + + /** output a run of characters in single byte mode + In single byte mode pass through characters in the ASCII range, but + quote characters overlapping with compression command codes. Runs + of characters fitting the current window are output as runs of bytes + in the range 0x80-0xFF. Checks for and validates Surrogate Pairs. + Uses and updates the current input and output cursors store in + the instance variables iIn and iOut. + @param in - input character array + @param out - output byte array + @return the next chaacter to be processed. This may be an extended character. + **/ + @SuppressWarnings("fallthrough") + public int outputSingleByteRun(char [] in, byte [] out) + throws EndOfOutputException, EndOfInputException, IllegalInputException + { + int iWin = getCurrentWindow(); + while(iIn < in.length) + { + int outlen = 0; + byte byte1 = 0; + byte byte2 = 0; + + // get the input character + int ch = in[iIn]; + + int inlen = 1; + + // Check input for Surrogate pair + if ( (ch & 0xF800) == 0xD800 ) + { + if ( (ch & 0xFC00) == 0xDC00 ) + { + // low surrogate out of order + throw new IllegalInputException("Unpaired low surrogate: "+iIn); + } + else + { + // have high surrogate now get low surrogate + if ( iIn >= in.length-1) + { + // premature end of input + throw new EndOfInputException(); + } + // get the char + int ch2 = in[iIn+1]; + + // make sure it's a low surrogate + if ( (ch2 & 0xFC00) != 0xDC00 ) + { + // a low surrogate was required + throw new IllegalInputException("Unpaired high surrogate: "+(iIn+1)); + } + + // combine the two values + ch = ((ch - 0xD800)<<10 | (ch2-0xDC00))+0x10000; + // ch = ch<<10 + ch2 - 0x36F0000; + + inlen = 2; + } + } + + // ASCII Letter, NUL, CR, LF and TAB are always passed through + if (isAsciiCrLfOrTab(ch) || ch == 0) + { + // pass through directcly + byte2 = (byte)(ch & 0x7F); + outlen = 1; + } + + // All other control codes must be quoted + else if (ch < 0x20) + { + byte1 = SQ0; + byte2 = (byte)(ch); + outlen = 2; + } + + // Letters that fit the current dynamic window + else if (ch >= dynamicOffset[iWin] && ch < dynamicOffset[iWin] + 0x80) + { + ch -= dynamicOffset[iWin]; + byte2 = (byte)(ch | 0x80); + outlen = 1; + } + + // check for room in the output array + if (iOut + outlen >= out.length) + { + throw new EndOfOutputException(); + } + + switch(outlen) + { + default: + // need to use some other compression mode for this + // character so we terminate this loop + + return ch; // input not finished + + // output the characters + case 2: + out[iOut++] = byte1; + // fall through + case 1: + out[iOut++] = byte2; + break; + } + // advance input pointer + iIn += inlen; + } + return 0; // input all used up + } + + /** quote a single character in single byte mode + Quoting a character (aka 'non-locking shift') gives efficient access + to characters that occur in isolation--usually punctuation characters. + When quoting a character from a dynamic window use 0x80 - 0xFF, when + quoting a character from a static window use 0x00-0x7f. + @param ch - character to be quoted + @param out - output byte array + **/ + + private void quoteSingleByte(int ch, byte [] out) + throws EndOfOutputException + { + Debug.out("Quoting SingleByte ", ch); + int iWin = getCurrentWindow(); + + // check for room in the output array + if (iOut >= out.length -2) + { + throw new EndOfOutputException(); + } + + // Output command byte followed by + out[iOut++] = (byte)(SQ0 + iWin); + + // Letter that fits the current dynamic window + if (ch >= dynamicOffset[iWin] && ch < dynamicOffset[iWin] + 0x80) + { + ch -= dynamicOffset[iWin]; + out[iOut++] = (byte)(ch | 0x80); + } + + // Letter that fits the current static window + else if (ch >= staticOffset[iWin] && ch < staticOffset[iWin] + 0x80) + { + ch -= staticOffset[iWin]; + out[iOut++] = (byte)ch; + } + else + { + throw new IllegalStateException("ch = "+ch+" not valid in quoteSingleByte. Internal Compressor Error"); + } + // advance input pointer + iIn ++; + Debug.out("New input: ", iIn); + } + + /** output a run of characters in Unicode mode + A run of Unicode mode consists of characters which are all in the + range of non-compressible characters or isolated occurrence + of any other characters. Characters in the range 0xE00-0xF2FF must + be quoted to avoid overlap with the Unicode mode compression command codes. + Uses and updates the current input and output cursors store in + the instance variables iIn and iOut. + NOTE: Characters from surrogate pairs are passed through and unlike single + byte mode no checks are made for unpaired surrogate characters. + @param in - input character array + @param out - output byte array + @return the next input character to be processed + **/ + public char outputUnicodeRun(char [] in, byte [] out) + throws EndOfOutputException + { + // current character + char ch = 0; + + while(iIn < in.length) + { + // get current input and set default output length + ch = in[iIn]; + int outlen = 2; + + // Characters in these ranges could potentially be compressed. + // We require 2 or more compressible characters to break the run + if (isCompressible(ch)) + { + // check whether we can look ahead + if( iIn < in.length - 1) + { + // DEBUG + Debug.out("is-comp: ",ch); + char ch2 = in[iIn + 1]; + if (isCompressible(ch2)) + { + // at least 2 characters are compressible + // break the run + break; + } + //DEBUG + Debug.out("no-comp: ",ch2); + } + // If we get here, the current character is only character + // left in the input or it is followed by a non-compressible + // character. In neither case do we gain by breaking the + // run, so we proceed to output the character. + if (ch >= 0xE000 && ch <= 0xF2FF) + { + // Characters in this range need to be escaped + outlen = 3; + } + + } + // check that there is enough room to output the character + if(iOut >= out.length - outlen) + { + // DEBUG + Debug.out("End of Output @", iOut); + // if we got here, we ran out of space in the output array + throw new EndOfOutputException(); + } + + // output any characters that cannot be compressed, + if (outlen == 3) + { + // output the quote character + out[iOut++] = UQU; + } + // pass the Unicode character in MSB,LSB order + out[iOut++] = (byte)(ch >>> 8); + out[iOut++] = (byte)(ch & 0xFF); + + // advance input cursor + iIn++; + } + + // return the last character + return ch; + } + + static int iNextWindow = 3; + + /** redefine a window so it surrounds a given character value + For now, this function uses window 3 exclusively (window 4 + for extended windows); + @return true if a window was successfully defined + @param ch - character around which window is positioned + @param out - output byte array + @param fCurUnicodeMode - type of window + **/ + private boolean positionWindow(int ch, byte [] out, boolean fCurUnicodeMode) + throws IllegalInputException, EndOfOutputException + { + int iWin = iNextWindow % 8; // simple LRU + int iPosition = 0; + + // iPosition 0 is a reserved value + if (ch < 0x80) + { + throw new IllegalStateException("ch < 0x80"); + //return false; + } + + // Check the fixed offsets + for (int i = 0; i < fixedOffset.length; i++) + { + if (ch >= fixedOffset[i] && ch < fixedOffset[i] + 0x80) + { + iPosition = i; + break; + } + } + + if (iPosition != 0) + { + // DEBUG + Debug.out("FIXED position is ", iPosition + 0xF9); + + // ch fits in a fixed offset window position + dynamicOffset[iWin] = fixedOffset[iPosition]; + iPosition += 0xF9; + } + else if (ch < 0x3400) + { + // calculate a window position command and set the offset + iPosition = ch >>> 7; + dynamicOffset[iWin] = ch & 0xFF80; + + Debug.out("Offset="+dynamicOffset[iWin]+", iPosition="+iPosition+" for char", ch); + } + else if (ch < 0xE000) + { + // attempt to place a window where none can go + return false; + } + else if (ch <= 0xFFFF) + { + // calculate a window position command, accounting + // for the gap in position values, and set the offset + iPosition = ((ch - gapOffset)>>> 7); + + dynamicOffset[iWin] = ch & 0xFF80; + + Debug.out("Offset="+dynamicOffset[iWin]+", iPosition="+iPosition+" for char", ch); + } + else + { + // if we get here, the character is in the extended range. + // Always use Window 4 to define an extended window + + iPosition = (ch - 0x10000) >>> 7; + // DEBUG + Debug.out("Try position Window at ", iPosition); + + iPosition |= iWin << 13; + dynamicOffset[iWin] = ch & 0x1FFF80; + } + + // Outputting window defintion command for the general cases + if ( iPosition < 0x100 && iOut < out.length-1) + { + out[iOut++] = (byte) ((fCurUnicodeMode ? UD0 : SD0) + iWin); + out[iOut++] = (byte) (iPosition & 0xFF); + } + // Output an extended window definiton command + else if ( iPosition >= 0x100 && iOut < out.length - 2) + { + + Debug.out("Setting extended window at ", iPosition); + out[iOut++] = (fCurUnicodeMode ? UDX : SDX); + out[iOut++] = (byte) ((iPosition >>> 8) & 0xFF); + out[iOut++] = (byte) (iPosition & 0xFF); + } + else + { + throw new EndOfOutputException(); + } + selectWindow(iWin); + iNextWindow++; + return true; + } + + /** + compress a Unicode character array with some simplifying assumptions + **/ + public int simpleCompress(char [] in, int iStartIn, byte[] out, int iStartOut) + throws IllegalInputException, EndOfInputException, EndOfOutputException + { + iIn = iStartIn; + iOut = iStartOut; + + + while (iIn < in.length) + { + int ch; + + // previously we switched to a Unicode run + if (iSCU != -1) + { + + Debug.out("Remaining", in, iIn); + Debug.out("Output until ["+iOut+"]: ", out); + + // output characters as Unicode + ch = outputUnicodeRun(in, out); + + // for single character Unicode runs (3 bytes) use quote + if (iOut - iSCU == 3 ) + { + // go back and fix up the SCU to an SQU instead + out[iSCU] = SQU; + iSCU = -1; + continue; + } + else + { + iSCU = -1; + fUnicodeMode = true; + } + } + // next, try to output characters as single byte run + else + { + ch = outputSingleByteRun(in, out); + } + + // check whether we still have input + if (iIn == in.length) + { + break; // no more input + } + + // if we get here, we have a consistent value for ch, whether or + // not it is an regular or extended character. Locate or define a + // Window for the current character + + Debug.out("Output so far: ", out); + Debug.out("Routing ch="+ch+" for Input", in, iIn); + + // Check that we have enough room to output the command byte + if (iOut >= out.length - 1) + { + throw new EndOfOutputException(); + } + + // In order to switch away from Unicode mode, it is necessary + // to select (or define) a window. If the characters that follow + // the Unicode range are ASCII characters, we can't use them + // to decide which window to select, since ASCII characters don't + // influence window settings. This loop looks ahead until it finds + // one compressible character that isn't in the ASCII range. + for (int ich = iIn; ch < 0x80; ich++) + { + if (ich == in.length || !isCompressible(in[ich])) + { + // if there are only ASCII characters left, + ch = in[iIn]; + break; + } + ch = in[ich]; // lookahead for next non-ASCII char + } + // The character value contained in ch here will only be used to select + // output modes. Actual output of characters starts with in[iIn] and + // only takes place near the top of the loop. + + int iprevWindow = getCurrentWindow(); + + // try to locate a dynamic window + if (ch < 0x80 || locateWindow(ch, dynamicOffset)) + { + Debug.out("located dynamic window "+getCurrentWindow()+" at ", iOut+1); + // lookahead to use SQn instead of SCn for single + // character interruptions of runs in current window + if(!fUnicodeMode && iIn < in.length -1) + { + char ch2 = in[iIn+1]; + if (ch2 >= dynamicOffset[iprevWindow] && + ch2 < dynamicOffset[iprevWindow] + 0x80) + { + quoteSingleByte(ch, out); + selectWindow(iprevWindow); + continue; + } + } + + out[iOut++] = (byte)((fUnicodeMode ? UC0 : SC0) + getCurrentWindow()); + fUnicodeMode = false; + } + // try to locate a static window + else if (!fUnicodeMode && locateWindow(ch, staticOffset)) + { + // static windows are not accessible from Unicode mode + Debug.out("located a static window", getCurrentWindow()); + quoteSingleByte(ch, out); + selectWindow(iprevWindow); // restore current Window settings + continue; + } + // try to define a window around ch + else if (positionWindow(ch, out, fUnicodeMode) ) + { + fUnicodeMode = false; + } + // If all else fails, start a Unicode run + else + { + iSCU = iOut; + out[iOut++] = SCU; + continue; + } + } + + return iOut - iStartOut; + } + + public byte[] compress(String inStr) + throws IllegalInputException, EndOfInputException + { + // Running out of room for output can cause non-optimal + // compression. In order to not slow down compression too + // much, not all intermediate state is constantly saved. + + byte [] out = new byte[inStr.length() * 2]; + char [] in = inStr.toCharArray(); + //DEBUG + Debug.out("compress input: ",in); + reset(); + while(true) + { + try + { + simpleCompress(in, charsRead(), out, bytesWritten()); + // if we get here things went fine. + break; + } + catch (EndOfOutputException e) + { + // create a larger output buffer and continue + byte [] largerOut = new byte[out.length * 2]; + System.arraycopy(out, 0, largerOut, 0, out.length); + out = largerOut; + } + } + byte [] trimmedOut = new byte[bytesWritten()]; + System.arraycopy(out, 0, trimmedOut, 0, trimmedOut.length); + out = trimmedOut; + + Debug.out("compress output: ", out); + return out; + } + + /** reset is only needed to bail out after an exception and + restart with new input */ + @Override + public void reset() + { + super.reset(); + fUnicodeMode = false; + iSCU = - 1; + } + + /** returns the number of bytes written **/ + public int bytesWritten() + { + return iOut; + } + + /** returns the number of bytes written **/ + public int charsRead() + { + return iIn; + } + +} diff --git a/src/java/com/healthmarketscience/jackcess/scsu/EndOfOutputException.java b/src/java/com/healthmarketscience/jackcess/scsu/EndOfOutputException.java new file mode 100644 index 0000000..501d195 --- /dev/null +++ b/src/java/com/healthmarketscience/jackcess/scsu/EndOfOutputException.java @@ -0,0 +1,48 @@ +package com.healthmarketscience.jackcess.scsu; + +/** + * This sample software accompanies Unicode Technical Report #6 and + * distributed as is by Unicode, Inc., subject to the following: + * + * Copyright 1996-1997 Unicode, Inc.. All Rights Reserved. + * + * Permission to use, copy, modify, and distribute this software + * without fee is hereby granted provided that this copyright notice + * appears in all copies. + * + * UNICODE, INC. MAKES NO REPRESENTATIONS OR WARRANTIES ABOUT THE + * SUITABILITY OF THE SOFTWARE, EITHER EXPRESS OR IMPLIED, INCLUDING + * BUT NOT LIMITED TO THE IMPLIED WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT. + * UNICODE, INC., SHALL NOT BE LIABLE FOR ANY ERRORS OR OMISSIONS, AND + * SHALL NOT BE LIABLE FOR ANY DAMAGES, INCLUDING CONSEQUENTIAL AND + * INCIDENTAL DAMAGES, SUFFERED BY YOU AS A RESULT OF USING, MODIFYING + * OR DISTRIBUTING THIS SOFTWARE OR ITS DERIVATIVES. + * + * @author Asmus Freytag + * + * @version 001 Dec 25 1996 + * @version 002 Jun 25 1997 + * @version 003 Jul 25 1997 + * + * Unicode and the Unicode logo are trademarks of Unicode, Inc., + * and are registered in some jurisdictions. + **/ +/** + * The input string or input byte array ended prematurely + */ +public class EndOfOutputException + extends java.lang.Exception + +{ + + private static final long serialVersionUID = 1L; + + public EndOfOutputException(){ + super("The input string or input byte array ended prematurely"); + } + + public EndOfOutputException(String s) { + super(s); + } +} diff --git a/src/java/com/healthmarketscience/jackcess/scsu/Expand.java b/src/java/com/healthmarketscience/jackcess/scsu/Expand.java index 9605554..4858044 100644 --- a/src/java/com/healthmarketscience/jackcess/scsu/Expand.java +++ b/src/java/com/healthmarketscience/jackcess/scsu/Expand.java @@ -411,6 +411,7 @@ public class Expand extends SCSU /** reset is called to start with new input, w/o creating a new instance */ + @Override public void reset() { iOut = 0; diff --git a/test/src/java/com/healthmarketscience/jackcess/scsu/CompressMain.java b/test/src/java/com/healthmarketscience/jackcess/scsu/CompressMain.java new file mode 100644 index 0000000..af3063d --- /dev/null +++ b/test/src/java/com/healthmarketscience/jackcess/scsu/CompressMain.java @@ -0,0 +1,574 @@ +package com.healthmarketscience.jackcess.scsu; + +import java.io.*; +import java.util.*; + +/** + * This sample software accompanies Unicode Technical Report #6 and + * distributed as is by Unicode, Inc., subject to the following: + * + * Copyright 1996-1998 Unicode, Inc.. All Rights Reserved. + * + * Permission to use, copy, modify, and distribute this software + * without fee is hereby granted provided that this copyright notice + * appears in all copies. + * + * UNICODE, INC. MAKES NO REPRESENTATIONS OR WARRANTIES ABOUT THE + * SUITABILITY OF THE SOFTWARE, EITHER EXPRESS OR IMPLIED, INCLUDING + * BUT NOT LIMITED TO THE IMPLIED WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT. + * UNICODE, INC., SHALL NOT BE LIABLE FOR ANY ERRORS OR OMISSIONS, AND + * SHALL NOT BE LIABLE FOR ANY DAMAGES, INCLUDING CONSEQUENTIAL AND + * INCIDENTAL DAMAGES, SUFFERED BY YOU AS A RESULT OF USING, MODIFYING + * OR DISTRIBUTING THIS SOFTWARE OR ITS DERIVATIVES. + * + * @author Asmus Freytag + * + * @version 001 Dec 25 1996 + * @version 002 Jun 25 1997 + * @version 003 Jul 25 1997 + * @version 004 Aug 25 1997 + * @version 005 Sep 30 1998 + * + * Unicode and the Unicode logo are trademarks of Unicode, Inc., + * and are registered in some jurisdictions. + **/ + +/** + Class CompressMain + + A small commandline driver interface for the compression routines + Use the /? to get usage +*/ +public class CompressMain +{ + static void usage() + { + System.err.println("java CompressMain /? : this usage information\n"); + System.err.println("java CompressMain /random : random test\n"); + System.err.println("java CompressMain /suite : suite test\n"); + System.err.println("java CompressMain /suite : file test (file data may include \\uXXXX)\n"); + System.err.println("java CompressMain : string test (string may include \\uXXXX)\n"); + System.err.println("java CompressMain /roundtrip : check Unicode file for roundtrip\n"); + System.err.println("java CompressMain /compress : compresses Unicode files (no \\uXXXX)\n"); + System.err.println("java CompressMain /expand : expands into Unicode files\n"); + System.err.println("java CompressMain /byteswap : swaps byte order of Unicode files\n"); + System.err.println("java CompressMain /display : like expand, but creates a dump instead\n"); + System.err.println("java CompressMain /parse : parses \\uXXXX into binary Unicode\n"); + } + + static void analyze(String text, int inlength, String result, int outlength) + { + boolean fSuccess = text.equals(result); + Debug.out(fSuccess ? "Round trip OK" : "Round trip FAILED"); + if (!fSuccess && result != null) + { + int iLim = Math.min(text.length(), result.length()); + for (int i = 0; i < iLim; i++) + { + if (text.charAt(i) != result.charAt(i)) + { + Debug.out("First Mismatch at "+ i +"=", result.charAt(i) ); + Debug.out("Original character "+ i +"=", text.charAt(i) ); + break; + } + } + } + else + { + Debug.out("Compressed: "+inlength+" chars to "+outlength+" bytes."); + Debug.out(" Ratio: "+(outlength == 0 ? 0 :(outlength * 50 / inlength))+"%."); + } + } + + static void test2(String text) + { + byte bytes[] = null; + String result = null; + Debug.out("SCSU:\n"); + Compress compressor = new Compress(); + try + { + bytes = compressor.compress(text); + Expand display = new Expand(); + result = display.expand(bytes); + Debug.out("Input: ", text.toCharArray()); + Debug.out("Result: ", result.toCharArray()); + Debug.out(""); + Expand expander = new Expand(); + result = expander.expand(bytes); + } + catch (Exception e) + { + System.out.println(e); + } + int inlength = compressor.charsRead(); + int outlength = compressor.bytesWritten(); + analyze(text, inlength, result, outlength); + } + + static void test(String text) throws Exception + { + test(text, false); + } + + static void test(String text, boolean shouldFail) + throws Exception + { + // Create an instance of the compressor + Compress compressor = new Compress(); + + byte [] bytes = null; + String result = null; + Exception failure = null; + try { + // perform compression + bytes = compressor.compress(text); + } + catch(Exception e) + { + failure = e; + } + + if(shouldFail) { + if(failure == null) { + throw new RuntimeException("Did not fail"); + } + return; + } + + if(failure != null) { + throw failure; + } + + Expand expander = new Expand(); + // perform expansion + result = expander.expand(bytes); + + // analyze the results + int inlength = compressor.charsRead(); + int outlength = compressor.bytesWritten(); + analyze(text, inlength, result, outlength); + + } + + public static void display(byte [] input) + { + try + { + Expand expand = new Expand(); + String text = expand.expand(input); + Debug.out(text.toCharArray()); + } + catch (Exception e) + { + System.out.println(e); + } + } + + public static String parse(String input) + { + StringTokenizer st = new StringTokenizer(input, "\\", true); + Debug.out("Input: ", input); + + StringBuffer sb = new StringBuffer(); + + while(st.hasMoreTokens()) + { + String token = st.nextToken(); + Debug.out("Token: ", token); + if (token.charAt(0) == '\\' && token.length() == 1) + { + if(st.hasMoreTokens()) + { + token = st.nextToken(); + } + if(token.charAt(0) == 'u') + { + Debug.out("Token: "+ token+ " ", sb.toString()); + String hexnum; + if (token.length() > 5) + { + hexnum = token.substring(1,5); + token = token.substring(5); + } + else + { + hexnum = token.substring(1); + token = ""; + } + sb.append((char)Integer.parseInt(hexnum, 16)); + } + } + sb.append(token); + } + return sb.toString(); + } + + public static void randomTest(int nTest) + throws Exception + { + Random random = new Random(); + + for(int n=0; n < nTest; n++) + { + int iLen = (int) (20 * random.nextFloat()); + StringBuffer sb = new StringBuffer(iLen); + + for(int i = 0; i < iLen; i++) + { + sb.append((char) (0xFFFF * random.nextFloat())); + } + + test(sb.toString()); + } + } + + @SuppressWarnings("deprecation") + public static void fileTest(String name) + throws Exception + { + DataInputStream dis = new DataInputStream(new FileInputStream(name)); + + int iLine = 0; + + while(dis.available() != 0) + { + String line = dis.readLine(); + Debug.out("Line "+ iLine++ +" "+line); + test(parse(line), false ); //false);// initially no debug info + } + } + + public static void displayFile(String name) + throws IOException + { + DataInputStream dis = new DataInputStream(new FileInputStream(name)); + + byte bytes[] = new byte[dis.available()]; + dis.read(bytes); + display(bytes); + } + + public static void decodeTest(String name) + throws IOException + { + DataInputStream dis = new DataInputStream(new FileInputStream(name)); + + byte bytes[] = new byte[dis.available()]; + dis.read(bytes); + + Expand expand = new Expand(); + + char [] chars = null; + try + { + String text = expand.expand(bytes); + chars = text.toCharArray(); + } + catch (Exception e) + { + System.out.println(e); + } + int inlength = expand.bytesRead(); + int iDot = name.lastIndexOf('.'); + StringBuffer sb = new StringBuffer(name); + sb.setLength(iDot + 1); + sb.append("txt"); + String outName = sb.toString(); + + int outlength = expand.charsWritten(); + + Debug.out("Expanded "+name+": "+inlength+" bytes to "+outName+" " +outlength+" chars." + " Ratio: "+(outlength == 0 ? 0 :(outlength * 200 / inlength))+"%."); + + if (chars == null) + return; + + writeUnicodeFile(outName, chars); + } + + /** most of the next 3 functions should not be needed by JDK11 and later */ + private static int iMSB = 1; + + public static String readUnicodeFile(String name) + { + try + { + FileInputStream dis = new FileInputStream(name); + + byte b[] = new byte[2]; + StringBuffer sb = new StringBuffer(); + char ch = 0; + + iMSB = 1; + int i = 0; + for(i = 0; (dis.available() != 0); i++) + { + b[i%2] = (byte) dis.read(); + + if ((i & 1) == 1) + { + ch = Expand.charFromTwoBytes(b[(i + iMSB)%2], b[(i + iMSB + 1) % 2]); + } + else + { + continue; + } + if (i == 1 && ch == '\uFEFF') + continue; // throw away byte order mark + + if (i == 1 && ch == '\uFFFE') + { + iMSB ++; // flip byte order + continue; // throw away byte order mark + } + sb.append(ch); + } + + return sb.toString(); + } + catch (IOException e) + { + System.err.println(e); + return ""; + } + } + + public static void writeUnicodeFile(String outName, char [] chars) + throws IOException + { + DataOutputStream dos = new DataOutputStream(new FileOutputStream(outName)); + if ((iMSB & 1) == 1) + { + dos.writeByte(0xFF); + dos.writeByte(0xFE); + } + else + { + dos.writeByte(0xFE); + dos.writeByte(0xFF); + } + byte b[] = new byte[2]; + for (int ich = 0; ich < chars.length; ich++) + { + b[(iMSB + 0)%2] = (byte) (chars[ich] >>> 8); + b[(iMSB + 1)%2] = (byte) (chars[ich] & 0xFF); + dos.write(b, 0, 2); + } + } + + static void byteswap(String name) + throws IOException + { + String text = readUnicodeFile(name); + char chars[] = text.toCharArray(); + writeUnicodeFile(name, chars); + } + + @SuppressWarnings("deprecation") + public static void parseFile(String name) + throws IOException + { + DataInputStream dis = new DataInputStream(new FileInputStream(name)); + + byte bytes[] = new byte[dis.available()]; + dis.read(bytes); + + // simplistic test + int bom = (char) bytes[0] + (char) bytes[1]; + if (bom == 131069) + { + // FEFF or FFFE detected (either one sums to 131069) + Debug.out(name + " is already in Unicode!"); + return; + } + + // definitely assumes an ASCII file at this point + String text = new String(bytes, 0); + + char chars[] = parse(text).toCharArray(); + writeUnicodeFile(name, chars); + return; + } + + public static void encodeTest(String name) + throws Exception + { + String text = readUnicodeFile(name); + + // Create an instance of the compressor + Compress compressor = new Compress(); + + byte [] bytes = null; + + // perform compression + bytes = compressor.compress(text); + + int inlength = compressor.charsRead(); + int iDot = name.lastIndexOf('.'); + StringBuffer sb = new StringBuffer(name); + sb.setLength(iDot + 1); + sb.append("csu"); + String outName = sb.toString(); + + DataOutputStream dos = new DataOutputStream(new FileOutputStream(outName)); + dos.write(bytes, 0, bytes.length); + + int outlength = compressor.bytesWritten(); + + Debug.out("Compressed "+name+": "+inlength+" chars to "+outName+" " +outlength+" bytes." + " Ratio: "+(outlength == 0 ? 0 :(outlength * 50 / inlength))+"%."); + } + + public static void roundtripTest(String name) + throws Exception + { + test(readUnicodeFile(name), false);// no debug info + } + + /** The Main function */ + public static void main(String args[]) + throws Exception + { + int iArg = args.length; + + try + { + if (iArg != 0) + { + if (args[0].equalsIgnoreCase("/compress")) + { + while (--iArg > 0) + { + encodeTest(args[args.length - iArg]); + } + } + else if (args[0].equalsIgnoreCase("/parse")) + { + while (--iArg > 0) + { + parseFile(args[args.length - iArg]); + } + } + else if (args[0].equalsIgnoreCase("/expand")) + { + while (--iArg > 0) + { + decodeTest(args[args.length - iArg]); + } + } + else if (args[0].equalsIgnoreCase("/display")) + { + while (--iArg > 0) + { + displayFile(args[args.length - iArg]); + } + } + else if (args[0].equalsIgnoreCase("/roundtrip")) + { + while (--iArg > 0) + { + roundtripTest(args[args.length - iArg]); + } + } + else if (args[0].equalsIgnoreCase("/byteswap")) + { + while (--iArg > 0) + { + byteswap(args[args.length - iArg]); + } + }else if (args[0].equalsIgnoreCase("/random")) + { + randomTest(8); + } + else if (args[0].equalsIgnoreCase("/suite")) + { + if (iArg == 1) + { + suiteTest(); + } + else + { + while (--iArg > 0) + { + fileTest(args[args.length - iArg]); + } + } + } + else if (args[0].equalsIgnoreCase("/?")) + { + usage(); + } + else + { + while (iArg > 0) + { + test2(parse(args[--iArg])); + } + } + } + else + { + usage(); + } + } + catch (IOException e) + { + System.err.println(e); + } + try + { + System.err.println("Done. Press enter to exit"); + System.in.read(); + } + catch (IOException e) + { + + } + } + + static void suiteTest() + throws Exception + { + Debug.out("Standard Compression test suite:"); + test("Hello \u9292 \u9192 World!"); + test("Hell\u0429o \u9292 \u9192 W\u00e4rld!"); + test("Hell\u0429o \u9292 \u9292W\u00e4rld!"); + + test("\u0648\u06c8"); // catch missing reset + test("\u0648\u06c8"); + + test("\u4444\uE001"); // lowest quotable + test("\u4444\uf2FF"); // highest quotable + test("\u4444\uf188\u4444"); + test("\u4444\uf188\uf288"); + test("\u4444\uf188abc\0429\uf288"); + test("\u9292\u2222"); + test("Hell\u0429\u04230o \u9292 \u9292W\u00e4\u0192rld!"); + test("Hell\u0429o \u9292 \u9292W\u00e4rld!"); + test("Hello World!123456"); + test("Hello W\u0081\u011f\u0082!"); // Latin 1 run + + test("abc\u0301\u0302"); // uses SQn for u301 u302 + test("abc\u4411d"); // uses SQU + test("abc\u4411\u4412d");// uses SCU + test("abc\u0401\u0402\u047f\u00a5\u0405"); // uses SQn for ua5 + test("\u9191\u9191\u3041\u9191\u3041\u3041\u3000"); // SJIS like data + test("\u9292\u2222"); + test("\u9191\u9191\u3041\u9191\u3041\u3041\u3000"); + test("\u9999\u3051\u300c\u9999\u9999\u3060\u9999\u3065\u3065\u3065\u300c"); + test("\u3000\u266a\u30ea\u30f3\u30b4\u53ef\u611b\u3044\u3084\u53ef\u611b\u3044\u3084\u30ea\u30f3\u30b4\u3002"); + + test(""); // empty input + test("\u0000"); // smallest BMP character + test("\uFFFF"); // largest BMP character + + test("\ud800\udc00"); // smallest surrogate + test("\ud8ff\udcff"); // largest surrogate pair + + + Debug.out("\nTHESE TESTS ARE SUPPOSED TO FAIL:"); + test("\ud800 \udc00", true); // unpaired surrogate (1) + test("\udc00", true); // unpaired surrogate (2) + test("\ud800", true); // unpaired surrogate (3) + } +} diff --git a/test/src/java/com/healthmarketscience/jackcess/scsu/CompressTest.java b/test/src/java/com/healthmarketscience/jackcess/scsu/CompressTest.java new file mode 100644 index 0000000..0f17e6c --- /dev/null +++ b/test/src/java/com/healthmarketscience/jackcess/scsu/CompressTest.java @@ -0,0 +1,47 @@ +/* +Copyright (c) 2007 Health Market Science, Inc. + +This library is free software; you can redistribute it and/or +modify it under the terms of the GNU Lesser General Public +License as published by the Free Software Foundation; either +version 2.1 of the License, or (at your option) any later version. + +This library is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +Lesser General Public License for more details. + +You should have received a copy of the GNU Lesser General Public +License along with this library; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 +USA + +You can contact Health Market Science at info@healthmarketscience.com +or at the following address: + +Health Market Science +2700 Horizon Drive +Suite 200 +King of Prussia, PA 19406 +*/ + +package com.healthmarketscience.jackcess.scsu; + +import junit.framework.TestCase; + +/** + * @author James Ahlborn + */ +public class CompressTest extends TestCase +{ + + public CompressTest(String name) throws Exception { + super(name); + } + + public void testCompression() throws Exception + { + CompressMain.suiteTest(); + } + +} -- 2.39.5