tests. git-svn-id: https://svn.code.sf.net/p/jackcess/code/jackcess/trunk@365 f203690c-595d-4dc9-a70b-905162fa7fd2tags/rel_1_1_16
@@ -25,6 +25,10 @@ | |||
<action dev="jahlborn" type="add"> | |||
Add primitive support for writing unicode compressed text columns. | |||
</action> | |||
<action dev="jahlborn" type="add"> | |||
Add compression code for possible future use; add compression unit | |||
tests. | |||
</action> | |||
</release> | |||
<release version="1.1.15" date="2008-06-27"> | |||
<action dev="jahlborn" type="fix" issue="1998225"> |
@@ -41,6 +41,7 @@ import java.util.List; | |||
import java.util.regex.Matcher; | |||
import java.util.regex.Pattern; | |||
import com.healthmarketscience.jackcess.scsu.Compress; | |||
import com.healthmarketscience.jackcess.scsu.EndOfInputException; | |||
import com.healthmarketscience.jackcess.scsu.Expand; | |||
import com.healthmarketscience.jackcess.scsu.IllegalInputException; | |||
@@ -1196,23 +1197,13 @@ public class Column implements Comparable<Column> { | |||
// now, see if it is all printable ASCII | |||
for(int i = 0; i < text.length(); ++i) { | |||
char c = text.charAt(i); | |||
if(!isAsciiCrLfOrTab(c)) { | |||
if(!Compress.isAsciiCrLfOrTab(c)) { | |||
return false; | |||
} | |||
} | |||
return true; | |||
} | |||
/** | |||
* Returns true if the character is ASCII, but not a control other than | |||
* CR, LF and TAB | |||
*/ | |||
private static boolean isAsciiCrLfOrTab(int ch) | |||
{ | |||
return ((ch >= 0x20 && ch <= 0x7F) // ASCII (non control) | |||
|| ch == 0x09 || ch == 0x0A || ch == 0x0D); // CR/LF or TAB | |||
} | |||
@Override | |||
public String toString() { | |||
StringBuilder rtn = new StringBuilder(); |
@@ -0,0 +1,628 @@ | |||
package com.healthmarketscience.jackcess.scsu; | |||
/** | |||
* This sample software accompanies Unicode Technical Report #6 and | |||
* distributed as is by Unicode, Inc., subject to the following: | |||
* | |||
* Copyright 1996-1997 Unicode, Inc.. All Rights Reserved. | |||
* | |||
* Permission to use, copy, modify, and distribute this software | |||
* without fee is hereby granted provided that this copyright notice | |||
* appears in all copies. | |||
* | |||
* UNICODE, INC. MAKES NO REPRESENTATIONS OR WARRANTIES ABOUT THE | |||
* SUITABILITY OF THE SOFTWARE, EITHER EXPRESS OR IMPLIED, INCLUDING | |||
* BUT NOT LIMITED TO THE IMPLIED WARRANTIES OF MERCHANTABILITY, | |||
* FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT. | |||
* UNICODE, INC., SHALL NOT BE LIABLE FOR ANY ERRORS OR OMISSIONS, AND | |||
* SHALL NOT BE LIABLE FOR ANY DAMAGES, INCLUDING CONSEQUENTIAL AND | |||
* INCIDENTAL DAMAGES, SUFFERED BY YOU AS A RESULT OF USING, MODIFYING | |||
* OR DISTRIBUTING THIS SOFTWARE OR ITS DERIVATIVES. | |||
* | |||
* @author Asmus Freytag | |||
* | |||
* @version 001 Dec 25 1996 | |||
* @version 002 Jun 25 1997 | |||
* @version 003 Jul 25 1997 | |||
* @version 004 Aug 25 1997 | |||
* | |||
* Unicode and the Unicode logo are trademarks of Unicode, Inc., | |||
* and are registered in some jurisdictions. | |||
**/ | |||
/** | |||
This class implements a simple compression algorithm | |||
**/ | |||
/* | |||
Note on exception handling | |||
This compressor is designed so that it can be restarted after | |||
an exception. All operations advancing input and/or output cursor | |||
(iIn and iOut) either complete an action, or set a state (fUnicodeMode) | |||
before updating the cursors. | |||
*/ | |||
public class Compress extends SCSU | |||
{ | |||
/** next input character to be read **/ | |||
private int iIn; | |||
/** next output byte to be written **/ | |||
private int iOut; | |||
/** start index of Unicode mode in output array, or -1 if in single byte mode **/ | |||
private int iSCU = -1; | |||
/** true if the next command byte is of the Uxx family */ | |||
private boolean fUnicodeMode = false; | |||
/** locate a window for a character given a table of offsets | |||
@param ch - character | |||
@param offsetTable - table of window offsets | |||
@return true if the character fits a window from the table of windows */ | |||
private boolean locateWindow(int ch, int[] offsetTable) | |||
{ | |||
// always try the current window first | |||
int iWin = getCurrentWindow(); | |||
// if the character fits the current window | |||
// just use the current window | |||
if (iWin != - 1 && ch >= offsetTable[iWin] && ch < offsetTable[iWin] + 0x80) | |||
{ | |||
return true; | |||
} | |||
// try all windows in order | |||
for (iWin = 0; iWin < offsetTable.length; iWin++) | |||
{ | |||
if (ch >= offsetTable[iWin] && ch < offsetTable[iWin] + 0x80) | |||
{ | |||
selectWindow(iWin); | |||
return true; | |||
} | |||
} | |||
// none found | |||
return false; | |||
} | |||
/** returns true if the character is ASCII, but not a control other than CR, LF and TAB */ | |||
public static boolean isAsciiCrLfOrTab(int ch) | |||
{ | |||
return (ch >= 0x20 && ch <= 0x7F) // ASCII | |||
|| ch == 0x09 || ch == 0x0A || ch == 0x0D; // CR/LF or TAB | |||
} | |||
/** output a run of characters in single byte mode | |||
In single byte mode pass through characters in the ASCII range, but | |||
quote characters overlapping with compression command codes. Runs | |||
of characters fitting the current window are output as runs of bytes | |||
in the range 0x80-0xFF. Checks for and validates Surrogate Pairs. | |||
Uses and updates the current input and output cursors store in | |||
the instance variables <i>iIn</i> and <i>iOut</i>. | |||
@param in - input character array | |||
@param out - output byte array | |||
@return the next chaacter to be processed. This may be an extended character. | |||
**/ | |||
@SuppressWarnings("fallthrough") | |||
public int outputSingleByteRun(char [] in, byte [] out) | |||
throws EndOfOutputException, EndOfInputException, IllegalInputException | |||
{ | |||
int iWin = getCurrentWindow(); | |||
while(iIn < in.length) | |||
{ | |||
int outlen = 0; | |||
byte byte1 = 0; | |||
byte byte2 = 0; | |||
// get the input character | |||
int ch = in[iIn]; | |||
int inlen = 1; | |||
// Check input for Surrogate pair | |||
if ( (ch & 0xF800) == 0xD800 ) | |||
{ | |||
if ( (ch & 0xFC00) == 0xDC00 ) | |||
{ | |||
// low surrogate out of order | |||
throw new IllegalInputException("Unpaired low surrogate: "+iIn); | |||
} | |||
else | |||
{ | |||
// have high surrogate now get low surrogate | |||
if ( iIn >= in.length-1) | |||
{ | |||
// premature end of input | |||
throw new EndOfInputException(); | |||
} | |||
// get the char | |||
int ch2 = in[iIn+1]; | |||
// make sure it's a low surrogate | |||
if ( (ch2 & 0xFC00) != 0xDC00 ) | |||
{ | |||
// a low surrogate was required | |||
throw new IllegalInputException("Unpaired high surrogate: "+(iIn+1)); | |||
} | |||
// combine the two values | |||
ch = ((ch - 0xD800)<<10 | (ch2-0xDC00))+0x10000; | |||
// ch = ch<<10 + ch2 - 0x36F0000; | |||
inlen = 2; | |||
} | |||
} | |||
// ASCII Letter, NUL, CR, LF and TAB are always passed through | |||
if (isAsciiCrLfOrTab(ch) || ch == 0) | |||
{ | |||
// pass through directcly | |||
byte2 = (byte)(ch & 0x7F); | |||
outlen = 1; | |||
} | |||
// All other control codes must be quoted | |||
else if (ch < 0x20) | |||
{ | |||
byte1 = SQ0; | |||
byte2 = (byte)(ch); | |||
outlen = 2; | |||
} | |||
// Letters that fit the current dynamic window | |||
else if (ch >= dynamicOffset[iWin] && ch < dynamicOffset[iWin] + 0x80) | |||
{ | |||
ch -= dynamicOffset[iWin]; | |||
byte2 = (byte)(ch | 0x80); | |||
outlen = 1; | |||
} | |||
// check for room in the output array | |||
if (iOut + outlen >= out.length) | |||
{ | |||
throw new EndOfOutputException(); | |||
} | |||
switch(outlen) | |||
{ | |||
default: | |||
// need to use some other compression mode for this | |||
// character so we terminate this loop | |||
return ch; // input not finished | |||
// output the characters | |||
case 2: | |||
out[iOut++] = byte1; | |||
// fall through | |||
case 1: | |||
out[iOut++] = byte2; | |||
break; | |||
} | |||
// advance input pointer | |||
iIn += inlen; | |||
} | |||
return 0; // input all used up | |||
} | |||
/** quote a single character in single byte mode | |||
Quoting a character (aka 'non-locking shift') gives efficient access | |||
to characters that occur in isolation--usually punctuation characters. | |||
When quoting a character from a dynamic window use 0x80 - 0xFF, when | |||
quoting a character from a static window use 0x00-0x7f. | |||
@param ch - character to be quoted | |||
@param out - output byte array | |||
**/ | |||
private void quoteSingleByte(int ch, byte [] out) | |||
throws EndOfOutputException | |||
{ | |||
Debug.out("Quoting SingleByte ", ch); | |||
int iWin = getCurrentWindow(); | |||
// check for room in the output array | |||
if (iOut >= out.length -2) | |||
{ | |||
throw new EndOfOutputException(); | |||
} | |||
// Output command byte followed by | |||
out[iOut++] = (byte)(SQ0 + iWin); | |||
// Letter that fits the current dynamic window | |||
if (ch >= dynamicOffset[iWin] && ch < dynamicOffset[iWin] + 0x80) | |||
{ | |||
ch -= dynamicOffset[iWin]; | |||
out[iOut++] = (byte)(ch | 0x80); | |||
} | |||
// Letter that fits the current static window | |||
else if (ch >= staticOffset[iWin] && ch < staticOffset[iWin] + 0x80) | |||
{ | |||
ch -= staticOffset[iWin]; | |||
out[iOut++] = (byte)ch; | |||
} | |||
else | |||
{ | |||
throw new IllegalStateException("ch = "+ch+" not valid in quoteSingleByte. Internal Compressor Error"); | |||
} | |||
// advance input pointer | |||
iIn ++; | |||
Debug.out("New input: ", iIn); | |||
} | |||
/** output a run of characters in Unicode mode | |||
A run of Unicode mode consists of characters which are all in the | |||
range of non-compressible characters or isolated occurrence | |||
of any other characters. Characters in the range 0xE00-0xF2FF must | |||
be quoted to avoid overlap with the Unicode mode compression command codes. | |||
Uses and updates the current input and output cursors store in | |||
the instance variables <i>iIn</i> and <i>iOut</i>. | |||
NOTE: Characters from surrogate pairs are passed through and unlike single | |||
byte mode no checks are made for unpaired surrogate characters. | |||
@param in - input character array | |||
@param out - output byte array | |||
@return the next input character to be processed | |||
**/ | |||
public char outputUnicodeRun(char [] in, byte [] out) | |||
throws EndOfOutputException | |||
{ | |||
// current character | |||
char ch = 0; | |||
while(iIn < in.length) | |||
{ | |||
// get current input and set default output length | |||
ch = in[iIn]; | |||
int outlen = 2; | |||
// Characters in these ranges could potentially be compressed. | |||
// We require 2 or more compressible characters to break the run | |||
if (isCompressible(ch)) | |||
{ | |||
// check whether we can look ahead | |||
if( iIn < in.length - 1) | |||
{ | |||
// DEBUG | |||
Debug.out("is-comp: ",ch); | |||
char ch2 = in[iIn + 1]; | |||
if (isCompressible(ch2)) | |||
{ | |||
// at least 2 characters are compressible | |||
// break the run | |||
break; | |||
} | |||
//DEBUG | |||
Debug.out("no-comp: ",ch2); | |||
} | |||
// If we get here, the current character is only character | |||
// left in the input or it is followed by a non-compressible | |||
// character. In neither case do we gain by breaking the | |||
// run, so we proceed to output the character. | |||
if (ch >= 0xE000 && ch <= 0xF2FF) | |||
{ | |||
// Characters in this range need to be escaped | |||
outlen = 3; | |||
} | |||
} | |||
// check that there is enough room to output the character | |||
if(iOut >= out.length - outlen) | |||
{ | |||
// DEBUG | |||
Debug.out("End of Output @", iOut); | |||
// if we got here, we ran out of space in the output array | |||
throw new EndOfOutputException(); | |||
} | |||
// output any characters that cannot be compressed, | |||
if (outlen == 3) | |||
{ | |||
// output the quote character | |||
out[iOut++] = UQU; | |||
} | |||
// pass the Unicode character in MSB,LSB order | |||
out[iOut++] = (byte)(ch >>> 8); | |||
out[iOut++] = (byte)(ch & 0xFF); | |||
// advance input cursor | |||
iIn++; | |||
} | |||
// return the last character | |||
return ch; | |||
} | |||
static int iNextWindow = 3; | |||
/** redefine a window so it surrounds a given character value | |||
For now, this function uses window 3 exclusively (window 4 | |||
for extended windows); | |||
@return true if a window was successfully defined | |||
@param ch - character around which window is positioned | |||
@param out - output byte array | |||
@param fCurUnicodeMode - type of window | |||
**/ | |||
private boolean positionWindow(int ch, byte [] out, boolean fCurUnicodeMode) | |||
throws IllegalInputException, EndOfOutputException | |||
{ | |||
int iWin = iNextWindow % 8; // simple LRU | |||
int iPosition = 0; | |||
// iPosition 0 is a reserved value | |||
if (ch < 0x80) | |||
{ | |||
throw new IllegalStateException("ch < 0x80"); | |||
//return false; | |||
} | |||
// Check the fixed offsets | |||
for (int i = 0; i < fixedOffset.length; i++) | |||
{ | |||
if (ch >= fixedOffset[i] && ch < fixedOffset[i] + 0x80) | |||
{ | |||
iPosition = i; | |||
break; | |||
} | |||
} | |||
if (iPosition != 0) | |||
{ | |||
// DEBUG | |||
Debug.out("FIXED position is ", iPosition + 0xF9); | |||
// ch fits in a fixed offset window position | |||
dynamicOffset[iWin] = fixedOffset[iPosition]; | |||
iPosition += 0xF9; | |||
} | |||
else if (ch < 0x3400) | |||
{ | |||
// calculate a window position command and set the offset | |||
iPosition = ch >>> 7; | |||
dynamicOffset[iWin] = ch & 0xFF80; | |||
Debug.out("Offset="+dynamicOffset[iWin]+", iPosition="+iPosition+" for char", ch); | |||
} | |||
else if (ch < 0xE000) | |||
{ | |||
// attempt to place a window where none can go | |||
return false; | |||
} | |||
else if (ch <= 0xFFFF) | |||
{ | |||
// calculate a window position command, accounting | |||
// for the gap in position values, and set the offset | |||
iPosition = ((ch - gapOffset)>>> 7); | |||
dynamicOffset[iWin] = ch & 0xFF80; | |||
Debug.out("Offset="+dynamicOffset[iWin]+", iPosition="+iPosition+" for char", ch); | |||
} | |||
else | |||
{ | |||
// if we get here, the character is in the extended range. | |||
// Always use Window 4 to define an extended window | |||
iPosition = (ch - 0x10000) >>> 7; | |||
// DEBUG | |||
Debug.out("Try position Window at ", iPosition); | |||
iPosition |= iWin << 13; | |||
dynamicOffset[iWin] = ch & 0x1FFF80; | |||
} | |||
// Outputting window defintion command for the general cases | |||
if ( iPosition < 0x100 && iOut < out.length-1) | |||
{ | |||
out[iOut++] = (byte) ((fCurUnicodeMode ? UD0 : SD0) + iWin); | |||
out[iOut++] = (byte) (iPosition & 0xFF); | |||
} | |||
// Output an extended window definiton command | |||
else if ( iPosition >= 0x100 && iOut < out.length - 2) | |||
{ | |||
Debug.out("Setting extended window at ", iPosition); | |||
out[iOut++] = (fCurUnicodeMode ? UDX : SDX); | |||
out[iOut++] = (byte) ((iPosition >>> 8) & 0xFF); | |||
out[iOut++] = (byte) (iPosition & 0xFF); | |||
} | |||
else | |||
{ | |||
throw new EndOfOutputException(); | |||
} | |||
selectWindow(iWin); | |||
iNextWindow++; | |||
return true; | |||
} | |||
/** | |||
compress a Unicode character array with some simplifying assumptions | |||
**/ | |||
public int simpleCompress(char [] in, int iStartIn, byte[] out, int iStartOut) | |||
throws IllegalInputException, EndOfInputException, EndOfOutputException | |||
{ | |||
iIn = iStartIn; | |||
iOut = iStartOut; | |||
while (iIn < in.length) | |||
{ | |||
int ch; | |||
// previously we switched to a Unicode run | |||
if (iSCU != -1) | |||
{ | |||
Debug.out("Remaining", in, iIn); | |||
Debug.out("Output until ["+iOut+"]: ", out); | |||
// output characters as Unicode | |||
ch = outputUnicodeRun(in, out); | |||
// for single character Unicode runs (3 bytes) use quote | |||
if (iOut - iSCU == 3 ) | |||
{ | |||
// go back and fix up the SCU to an SQU instead | |||
out[iSCU] = SQU; | |||
iSCU = -1; | |||
continue; | |||
} | |||
else | |||
{ | |||
iSCU = -1; | |||
fUnicodeMode = true; | |||
} | |||
} | |||
// next, try to output characters as single byte run | |||
else | |||
{ | |||
ch = outputSingleByteRun(in, out); | |||
} | |||
// check whether we still have input | |||
if (iIn == in.length) | |||
{ | |||
break; // no more input | |||
} | |||
// if we get here, we have a consistent value for ch, whether or | |||
// not it is an regular or extended character. Locate or define a | |||
// Window for the current character | |||
Debug.out("Output so far: ", out); | |||
Debug.out("Routing ch="+ch+" for Input", in, iIn); | |||
// Check that we have enough room to output the command byte | |||
if (iOut >= out.length - 1) | |||
{ | |||
throw new EndOfOutputException(); | |||
} | |||
// In order to switch away from Unicode mode, it is necessary | |||
// to select (or define) a window. If the characters that follow | |||
// the Unicode range are ASCII characters, we can't use them | |||
// to decide which window to select, since ASCII characters don't | |||
// influence window settings. This loop looks ahead until it finds | |||
// one compressible character that isn't in the ASCII range. | |||
for (int ich = iIn; ch < 0x80; ich++) | |||
{ | |||
if (ich == in.length || !isCompressible(in[ich])) | |||
{ | |||
// if there are only ASCII characters left, | |||
ch = in[iIn]; | |||
break; | |||
} | |||
ch = in[ich]; // lookahead for next non-ASCII char | |||
} | |||
// The character value contained in ch here will only be used to select | |||
// output modes. Actual output of characters starts with in[iIn] and | |||
// only takes place near the top of the loop. | |||
int iprevWindow = getCurrentWindow(); | |||
// try to locate a dynamic window | |||
if (ch < 0x80 || locateWindow(ch, dynamicOffset)) | |||
{ | |||
Debug.out("located dynamic window "+getCurrentWindow()+" at ", iOut+1); | |||
// lookahead to use SQn instead of SCn for single | |||
// character interruptions of runs in current window | |||
if(!fUnicodeMode && iIn < in.length -1) | |||
{ | |||
char ch2 = in[iIn+1]; | |||
if (ch2 >= dynamicOffset[iprevWindow] && | |||
ch2 < dynamicOffset[iprevWindow] + 0x80) | |||
{ | |||
quoteSingleByte(ch, out); | |||
selectWindow(iprevWindow); | |||
continue; | |||
} | |||
} | |||
out[iOut++] = (byte)((fUnicodeMode ? UC0 : SC0) + getCurrentWindow()); | |||
fUnicodeMode = false; | |||
} | |||
// try to locate a static window | |||
else if (!fUnicodeMode && locateWindow(ch, staticOffset)) | |||
{ | |||
// static windows are not accessible from Unicode mode | |||
Debug.out("located a static window", getCurrentWindow()); | |||
quoteSingleByte(ch, out); | |||
selectWindow(iprevWindow); // restore current Window settings | |||
continue; | |||
} | |||
// try to define a window around ch | |||
else if (positionWindow(ch, out, fUnicodeMode) ) | |||
{ | |||
fUnicodeMode = false; | |||
} | |||
// If all else fails, start a Unicode run | |||
else | |||
{ | |||
iSCU = iOut; | |||
out[iOut++] = SCU; | |||
continue; | |||
} | |||
} | |||
return iOut - iStartOut; | |||
} | |||
public byte[] compress(String inStr) | |||
throws IllegalInputException, EndOfInputException | |||
{ | |||
// Running out of room for output can cause non-optimal | |||
// compression. In order to not slow down compression too | |||
// much, not all intermediate state is constantly saved. | |||
byte [] out = new byte[inStr.length() * 2]; | |||
char [] in = inStr.toCharArray(); | |||
//DEBUG | |||
Debug.out("compress input: ",in); | |||
reset(); | |||
while(true) | |||
{ | |||
try | |||
{ | |||
simpleCompress(in, charsRead(), out, bytesWritten()); | |||
// if we get here things went fine. | |||
break; | |||
} | |||
catch (EndOfOutputException e) | |||
{ | |||
// create a larger output buffer and continue | |||
byte [] largerOut = new byte[out.length * 2]; | |||
System.arraycopy(out, 0, largerOut, 0, out.length); | |||
out = largerOut; | |||
} | |||
} | |||
byte [] trimmedOut = new byte[bytesWritten()]; | |||
System.arraycopy(out, 0, trimmedOut, 0, trimmedOut.length); | |||
out = trimmedOut; | |||
Debug.out("compress output: ", out); | |||
return out; | |||
} | |||
/** reset is only needed to bail out after an exception and | |||
restart with new input */ | |||
@Override | |||
public void reset() | |||
{ | |||
super.reset(); | |||
fUnicodeMode = false; | |||
iSCU = - 1; | |||
} | |||
/** returns the number of bytes written **/ | |||
public int bytesWritten() | |||
{ | |||
return iOut; | |||
} | |||
/** returns the number of bytes written **/ | |||
public int charsRead() | |||
{ | |||
return iIn; | |||
} | |||
} |
@@ -0,0 +1,48 @@ | |||
package com.healthmarketscience.jackcess.scsu; | |||
/** | |||
* This sample software accompanies Unicode Technical Report #6 and | |||
* distributed as is by Unicode, Inc., subject to the following: | |||
* | |||
* Copyright 1996-1997 Unicode, Inc.. All Rights Reserved. | |||
* | |||
* Permission to use, copy, modify, and distribute this software | |||
* without fee is hereby granted provided that this copyright notice | |||
* appears in all copies. | |||
* | |||
* UNICODE, INC. MAKES NO REPRESENTATIONS OR WARRANTIES ABOUT THE | |||
* SUITABILITY OF THE SOFTWARE, EITHER EXPRESS OR IMPLIED, INCLUDING | |||
* BUT NOT LIMITED TO THE IMPLIED WARRANTIES OF MERCHANTABILITY, | |||
* FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT. | |||
* UNICODE, INC., SHALL NOT BE LIABLE FOR ANY ERRORS OR OMISSIONS, AND | |||
* SHALL NOT BE LIABLE FOR ANY DAMAGES, INCLUDING CONSEQUENTIAL AND | |||
* INCIDENTAL DAMAGES, SUFFERED BY YOU AS A RESULT OF USING, MODIFYING | |||
* OR DISTRIBUTING THIS SOFTWARE OR ITS DERIVATIVES. | |||
* | |||
* @author Asmus Freytag | |||
* | |||
* @version 001 Dec 25 1996 | |||
* @version 002 Jun 25 1997 | |||
* @version 003 Jul 25 1997 | |||
* | |||
* Unicode and the Unicode logo are trademarks of Unicode, Inc., | |||
* and are registered in some jurisdictions. | |||
**/ | |||
/**
 * The output byte array or output buffer has no room left for further
 * data. The operation that ran out of room may be retried with a larger
 * output buffer.
 */
public class EndOfOutputException
    extends java.lang.Exception
{
    private static final long serialVersionUID = 1L;

    /** Creates the exception with a default message describing the lack of output space. */
    public EndOfOutputException(){
        super("The output buffer or output byte array is too small");
    }

    /**
     * Creates the exception with a caller supplied message.
     * @param s detail message
     */
    public EndOfOutputException(String s) {
        super(s);
    }
}
@@ -411,6 +411,7 @@ public class Expand extends SCSU | |||
/** reset is called to start with new input, w/o creating a new | |||
instance */ | |||
@Override | |||
public void reset() | |||
{ | |||
iOut = 0; |
@@ -0,0 +1,574 @@ | |||
package com.healthmarketscience.jackcess.scsu; | |||
import java.io.*; | |||
import java.util.*; | |||
/** | |||
* This sample software accompanies Unicode Technical Report #6 and | |||
* distributed as is by Unicode, Inc., subject to the following: | |||
* | |||
* Copyright 1996-1998 Unicode, Inc.. All Rights Reserved. | |||
* | |||
* Permission to use, copy, modify, and distribute this software | |||
* without fee is hereby granted provided that this copyright notice | |||
* appears in all copies. | |||
* | |||
* UNICODE, INC. MAKES NO REPRESENTATIONS OR WARRANTIES ABOUT THE | |||
* SUITABILITY OF THE SOFTWARE, EITHER EXPRESS OR IMPLIED, INCLUDING | |||
* BUT NOT LIMITED TO THE IMPLIED WARRANTIES OF MERCHANTABILITY, | |||
* FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT. | |||
* UNICODE, INC., SHALL NOT BE LIABLE FOR ANY ERRORS OR OMISSIONS, AND | |||
* SHALL NOT BE LIABLE FOR ANY DAMAGES, INCLUDING CONSEQUENTIAL AND | |||
* INCIDENTAL DAMAGES, SUFFERED BY YOU AS A RESULT OF USING, MODIFYING | |||
* OR DISTRIBUTING THIS SOFTWARE OR ITS DERIVATIVES. | |||
* | |||
* @author Asmus Freytag | |||
* | |||
* @version 001 Dec 25 1996 | |||
* @version 002 Jun 25 1997 | |||
* @version 003 Jul 25 1997 | |||
* @version 004 Aug 25 1997 | |||
* @version 005 Sep 30 1998 | |||
* | |||
* Unicode and the Unicode logo are trademarks of Unicode, Inc., | |||
* and are registered in some jurisdictions. | |||
**/ | |||
/** | |||
Class CompressMain | |||
A small commandline driver interface for the compression routines | |||
Use the /? to get usage | |||
*/ | |||
public class CompressMain | |||
{ | |||
static void usage() | |||
{ | |||
System.err.println("java CompressMain /? : this usage information\n"); | |||
System.err.println("java CompressMain /random : random test\n"); | |||
System.err.println("java CompressMain /suite : suite test\n"); | |||
System.err.println("java CompressMain /suite <file> : file test (file data may include \\uXXXX)\n"); | |||
System.err.println("java CompressMain <string> : string test (string may include \\uXXXX)\n"); | |||
System.err.println("java CompressMain /roundtrip <file>: check Unicode file for roundtrip\n"); | |||
System.err.println("java CompressMain /compress <file> : compresses Unicode files (no \\uXXXX)\n"); | |||
System.err.println("java CompressMain /expand <file> : expands into Unicode files\n"); | |||
System.err.println("java CompressMain /byteswap <files>: swaps byte order of Unicode files\n"); | |||
System.err.println("java CompressMain /display <files> : like expand, but creates a dump instead\n"); | |||
System.err.println("java CompressMain /parse <files> : parses \\uXXXX into binary Unicode\n"); | |||
} | |||
static void analyze(String text, int inlength, String result, int outlength) | |||
{ | |||
boolean fSuccess = text.equals(result); | |||
Debug.out(fSuccess ? "Round trip OK" : "Round trip FAILED"); | |||
if (!fSuccess && result != null) | |||
{ | |||
int iLim = Math.min(text.length(), result.length()); | |||
for (int i = 0; i < iLim; i++) | |||
{ | |||
if (text.charAt(i) != result.charAt(i)) | |||
{ | |||
Debug.out("First Mismatch at "+ i +"=", result.charAt(i) ); | |||
Debug.out("Original character "+ i +"=", text.charAt(i) ); | |||
break; | |||
} | |||
} | |||
} | |||
else | |||
{ | |||
Debug.out("Compressed: "+inlength+" chars to "+outlength+" bytes."); | |||
Debug.out(" Ratio: "+(outlength == 0 ? 0 :(outlength * 50 / inlength))+"%."); | |||
} | |||
} | |||
static void test2(String text) | |||
{ | |||
byte bytes[] = null; | |||
String result = null; | |||
Debug.out("SCSU:\n"); | |||
Compress compressor = new Compress(); | |||
try | |||
{ | |||
bytes = compressor.compress(text); | |||
Expand display = new Expand(); | |||
result = display.expand(bytes); | |||
Debug.out("Input: ", text.toCharArray()); | |||
Debug.out("Result: ", result.toCharArray()); | |||
Debug.out(""); | |||
Expand expander = new Expand(); | |||
result = expander.expand(bytes); | |||
} | |||
catch (Exception e) | |||
{ | |||
System.out.println(e); | |||
} | |||
int inlength = compressor.charsRead(); | |||
int outlength = compressor.bytesWritten(); | |||
analyze(text, inlength, result, outlength); | |||
} | |||
static void test(String text) throws Exception | |||
{ | |||
test(text, false); | |||
} | |||
static void test(String text, boolean shouldFail) | |||
throws Exception | |||
{ | |||
// Create an instance of the compressor | |||
Compress compressor = new Compress(); | |||
byte [] bytes = null; | |||
String result = null; | |||
Exception failure = null; | |||
try { | |||
// perform compression | |||
bytes = compressor.compress(text); | |||
} | |||
catch(Exception e) | |||
{ | |||
failure = e; | |||
} | |||
if(shouldFail) { | |||
if(failure == null) { | |||
throw new RuntimeException("Did not fail"); | |||
} | |||
return; | |||
} | |||
if(failure != null) { | |||
throw failure; | |||
} | |||
Expand expander = new Expand(); | |||
// perform expansion | |||
result = expander.expand(bytes); | |||
// analyze the results | |||
int inlength = compressor.charsRead(); | |||
int outlength = compressor.bytesWritten(); | |||
analyze(text, inlength, result, outlength); | |||
} | |||
public static void display(byte [] input) | |||
{ | |||
try | |||
{ | |||
Expand expand = new Expand(); | |||
String text = expand.expand(input); | |||
Debug.out(text.toCharArray()); | |||
} | |||
catch (Exception e) | |||
{ | |||
System.out.println(e); | |||
} | |||
} | |||
public static String parse(String input) | |||
{ | |||
StringTokenizer st = new StringTokenizer(input, "\\", true); | |||
Debug.out("Input: ", input); | |||
StringBuffer sb = new StringBuffer(); | |||
while(st.hasMoreTokens()) | |||
{ | |||
String token = st.nextToken(); | |||
Debug.out("Token: ", token); | |||
if (token.charAt(0) == '\\' && token.length() == 1) | |||
{ | |||
if(st.hasMoreTokens()) | |||
{ | |||
token = st.nextToken(); | |||
} | |||
if(token.charAt(0) == 'u') | |||
{ | |||
Debug.out("Token: "+ token+ " ", sb.toString()); | |||
String hexnum; | |||
if (token.length() > 5) | |||
{ | |||
hexnum = token.substring(1,5); | |||
token = token.substring(5); | |||
} | |||
else | |||
{ | |||
hexnum = token.substring(1); | |||
token = ""; | |||
} | |||
sb.append((char)Integer.parseInt(hexnum, 16)); | |||
} | |||
} | |||
sb.append(token); | |||
} | |||
return sb.toString(); | |||
} | |||
public static void randomTest(int nTest) | |||
throws Exception | |||
{ | |||
Random random = new Random(); | |||
for(int n=0; n < nTest; n++) | |||
{ | |||
int iLen = (int) (20 * random.nextFloat()); | |||
StringBuffer sb = new StringBuffer(iLen); | |||
for(int i = 0; i < iLen; i++) | |||
{ | |||
sb.append((char) (0xFFFF * random.nextFloat())); | |||
} | |||
test(sb.toString()); | |||
} | |||
} | |||
@SuppressWarnings("deprecation") | |||
public static void fileTest(String name) | |||
throws Exception | |||
{ | |||
DataInputStream dis = new DataInputStream(new FileInputStream(name)); | |||
int iLine = 0; | |||
while(dis.available() != 0) | |||
{ | |||
String line = dis.readLine(); | |||
Debug.out("Line "+ iLine++ +" "+line); | |||
test(parse(line), false ); //false);// initially no debug info | |||
} | |||
} | |||
public static void displayFile(String name) | |||
throws IOException | |||
{ | |||
DataInputStream dis = new DataInputStream(new FileInputStream(name)); | |||
byte bytes[] = new byte[dis.available()]; | |||
dis.read(bytes); | |||
display(bytes); | |||
} | |||
public static void decodeTest(String name) | |||
throws IOException | |||
{ | |||
DataInputStream dis = new DataInputStream(new FileInputStream(name)); | |||
byte bytes[] = new byte[dis.available()]; | |||
dis.read(bytes); | |||
Expand expand = new Expand(); | |||
char [] chars = null; | |||
try | |||
{ | |||
String text = expand.expand(bytes); | |||
chars = text.toCharArray(); | |||
} | |||
catch (Exception e) | |||
{ | |||
System.out.println(e); | |||
} | |||
int inlength = expand.bytesRead(); | |||
int iDot = name.lastIndexOf('.'); | |||
StringBuffer sb = new StringBuffer(name); | |||
sb.setLength(iDot + 1); | |||
sb.append("txt"); | |||
String outName = sb.toString(); | |||
int outlength = expand.charsWritten(); | |||
Debug.out("Expanded "+name+": "+inlength+" bytes to "+outName+" " +outlength+" chars." + " Ratio: "+(outlength == 0 ? 0 :(outlength * 200 / inlength))+"%."); | |||
if (chars == null) | |||
return; | |||
writeUnicodeFile(outName, chars); | |||
} | |||
/** most of the next 3 functions should not be needed by JDK11 and later */ | |||
private static int iMSB = 1; | |||
public static String readUnicodeFile(String name) | |||
{ | |||
try | |||
{ | |||
FileInputStream dis = new FileInputStream(name); | |||
byte b[] = new byte[2]; | |||
StringBuffer sb = new StringBuffer(); | |||
char ch = 0; | |||
iMSB = 1; | |||
int i = 0; | |||
for(i = 0; (dis.available() != 0); i++) | |||
{ | |||
b[i%2] = (byte) dis.read(); | |||
if ((i & 1) == 1) | |||
{ | |||
ch = Expand.charFromTwoBytes(b[(i + iMSB)%2], b[(i + iMSB + 1) % 2]); | |||
} | |||
else | |||
{ | |||
continue; | |||
} | |||
if (i == 1 && ch == '\uFEFF') | |||
continue; // throw away byte order mark | |||
if (i == 1 && ch == '\uFFFE') | |||
{ | |||
iMSB ++; // flip byte order | |||
continue; // throw away byte order mark | |||
} | |||
sb.append(ch); | |||
} | |||
return sb.toString(); | |||
} | |||
catch (IOException e) | |||
{ | |||
System.err.println(e); | |||
return ""; | |||
} | |||
} | |||
public static void writeUnicodeFile(String outName, char [] chars) | |||
throws IOException | |||
{ | |||
DataOutputStream dos = new DataOutputStream(new FileOutputStream(outName)); | |||
if ((iMSB & 1) == 1) | |||
{ | |||
dos.writeByte(0xFF); | |||
dos.writeByte(0xFE); | |||
} | |||
else | |||
{ | |||
dos.writeByte(0xFE); | |||
dos.writeByte(0xFF); | |||
} | |||
byte b[] = new byte[2]; | |||
for (int ich = 0; ich < chars.length; ich++) | |||
{ | |||
b[(iMSB + 0)%2] = (byte) (chars[ich] >>> 8); | |||
b[(iMSB + 1)%2] = (byte) (chars[ich] & 0xFF); | |||
dos.write(b, 0, 2); | |||
} | |||
} | |||
static void byteswap(String name) | |||
throws IOException | |||
{ | |||
String text = readUnicodeFile(name); | |||
char chars[] = text.toCharArray(); | |||
writeUnicodeFile(name, chars); | |||
} | |||
@SuppressWarnings("deprecation") | |||
public static void parseFile(String name) | |||
throws IOException | |||
{ | |||
DataInputStream dis = new DataInputStream(new FileInputStream(name)); | |||
byte bytes[] = new byte[dis.available()]; | |||
dis.read(bytes); | |||
// simplistic test | |||
int bom = (char) bytes[0] + (char) bytes[1]; | |||
if (bom == 131069) | |||
{ | |||
// FEFF or FFFE detected (either one sums to 131069) | |||
Debug.out(name + " is already in Unicode!"); | |||
return; | |||
} | |||
// definitely assumes an ASCII file at this point | |||
String text = new String(bytes, 0); | |||
char chars[] = parse(text).toCharArray(); | |||
writeUnicodeFile(name, chars); | |||
return; | |||
} | |||
public static void encodeTest(String name) | |||
throws Exception | |||
{ | |||
String text = readUnicodeFile(name); | |||
// Create an instance of the compressor | |||
Compress compressor = new Compress(); | |||
byte [] bytes = null; | |||
// perform compression | |||
bytes = compressor.compress(text); | |||
int inlength = compressor.charsRead(); | |||
int iDot = name.lastIndexOf('.'); | |||
StringBuffer sb = new StringBuffer(name); | |||
sb.setLength(iDot + 1); | |||
sb.append("csu"); | |||
String outName = sb.toString(); | |||
DataOutputStream dos = new DataOutputStream(new FileOutputStream(outName)); | |||
dos.write(bytes, 0, bytes.length); | |||
int outlength = compressor.bytesWritten(); | |||
Debug.out("Compressed "+name+": "+inlength+" chars to "+outName+" " +outlength+" bytes." + " Ratio: "+(outlength == 0 ? 0 :(outlength * 50 / inlength))+"%."); | |||
} | |||
public static void roundtripTest(String name) | |||
throws Exception | |||
{ | |||
test(readUnicodeFile(name), false);// no debug info | |||
} | |||
/** The Main function */ | |||
public static void main(String args[]) | |||
throws Exception | |||
{ | |||
int iArg = args.length; | |||
try | |||
{ | |||
if (iArg != 0) | |||
{ | |||
if (args[0].equalsIgnoreCase("/compress")) | |||
{ | |||
while (--iArg > 0) | |||
{ | |||
encodeTest(args[args.length - iArg]); | |||
} | |||
} | |||
else if (args[0].equalsIgnoreCase("/parse")) | |||
{ | |||
while (--iArg > 0) | |||
{ | |||
parseFile(args[args.length - iArg]); | |||
} | |||
} | |||
else if (args[0].equalsIgnoreCase("/expand")) | |||
{ | |||
while (--iArg > 0) | |||
{ | |||
decodeTest(args[args.length - iArg]); | |||
} | |||
} | |||
else if (args[0].equalsIgnoreCase("/display")) | |||
{ | |||
while (--iArg > 0) | |||
{ | |||
displayFile(args[args.length - iArg]); | |||
} | |||
} | |||
else if (args[0].equalsIgnoreCase("/roundtrip")) | |||
{ | |||
while (--iArg > 0) | |||
{ | |||
roundtripTest(args[args.length - iArg]); | |||
} | |||
} | |||
else if (args[0].equalsIgnoreCase("/byteswap")) | |||
{ | |||
while (--iArg > 0) | |||
{ | |||
byteswap(args[args.length - iArg]); | |||
} | |||
}else if (args[0].equalsIgnoreCase("/random")) | |||
{ | |||
randomTest(8); | |||
} | |||
else if (args[0].equalsIgnoreCase("/suite")) | |||
{ | |||
if (iArg == 1) | |||
{ | |||
suiteTest(); | |||
} | |||
else | |||
{ | |||
while (--iArg > 0) | |||
{ | |||
fileTest(args[args.length - iArg]); | |||
} | |||
} | |||
} | |||
else if (args[0].equalsIgnoreCase("/?")) | |||
{ | |||
usage(); | |||
} | |||
else | |||
{ | |||
while (iArg > 0) | |||
{ | |||
test2(parse(args[--iArg])); | |||
} | |||
} | |||
} | |||
else | |||
{ | |||
usage(); | |||
} | |||
} | |||
catch (IOException e) | |||
{ | |||
System.err.println(e); | |||
} | |||
try | |||
{ | |||
System.err.println("Done. Press enter to exit"); | |||
System.in.read(); | |||
} | |||
catch (IOException e) | |||
{ | |||
} | |||
} | |||
static void suiteTest() | |||
throws Exception | |||
{ | |||
Debug.out("Standard Compression test suite:"); | |||
test("Hello \u9292 \u9192 World!"); | |||
test("Hell\u0429o \u9292 \u9192 W\u00e4rld!"); | |||
test("Hell\u0429o \u9292 \u9292W\u00e4rld!"); | |||
test("\u0648\u06c8"); // catch missing reset | |||
test("\u0648\u06c8"); | |||
test("\u4444\uE001"); // lowest quotable | |||
test("\u4444\uf2FF"); // highest quotable | |||
test("\u4444\uf188\u4444"); | |||
test("\u4444\uf188\uf288"); | |||
test("\u4444\uf188abc\0429\uf288"); | |||
test("\u9292\u2222"); | |||
test("Hell\u0429\u04230o \u9292 \u9292W\u00e4\u0192rld!"); | |||
test("Hell\u0429o \u9292 \u9292W\u00e4rld!"); | |||
test("Hello World!123456"); | |||
test("Hello W\u0081\u011f\u0082!"); // Latin 1 run | |||
test("abc\u0301\u0302"); // uses SQn for u301 u302 | |||
test("abc\u4411d"); // uses SQU | |||
test("abc\u4411\u4412d");// uses SCU | |||
test("abc\u0401\u0402\u047f\u00a5\u0405"); // uses SQn for ua5 | |||
test("\u9191\u9191\u3041\u9191\u3041\u3041\u3000"); // SJIS like data | |||
test("\u9292\u2222"); | |||
test("\u9191\u9191\u3041\u9191\u3041\u3041\u3000"); | |||
test("\u9999\u3051\u300c\u9999\u9999\u3060\u9999\u3065\u3065\u3065\u300c"); | |||
test("\u3000\u266a\u30ea\u30f3\u30b4\u53ef\u611b\u3044\u3084\u53ef\u611b\u3044\u3084\u30ea\u30f3\u30b4\u3002"); | |||
test(""); // empty input | |||
test("\u0000"); // smallest BMP character | |||
test("\uFFFF"); // largest BMP character | |||
test("\ud800\udc00"); // smallest surrogate | |||
test("\ud8ff\udcff"); // largest surrogate pair | |||
Debug.out("\nTHESE TESTS ARE SUPPOSED TO FAIL:"); | |||
test("\ud800 \udc00", true); // unpaired surrogate (1) | |||
test("\udc00", true); // unpaired surrogate (2) | |||
test("\ud800", true); // unpaired surrogate (3) | |||
} | |||
} |
@@ -0,0 +1,47 @@ | |||
/* | |||
Copyright (c) 2007 Health Market Science, Inc. | |||
This library is free software; you can redistribute it and/or | |||
modify it under the terms of the GNU Lesser General Public | |||
License as published by the Free Software Foundation; either | |||
version 2.1 of the License, or (at your option) any later version. | |||
This library is distributed in the hope that it will be useful, | |||
but WITHOUT ANY WARRANTY; without even the implied warranty of | |||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |||
Lesser General Public License for more details. | |||
You should have received a copy of the GNU Lesser General Public | |||
License along with this library; if not, write to the Free Software | |||
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 | |||
USA | |||
You can contact Health Market Science at info@healthmarketscience.com | |||
or at the following address: | |||
Health Market Science | |||
2700 Horizon Drive | |||
Suite 200 | |||
King of Prussia, PA 19406 | |||
*/ | |||
package com.healthmarketscience.jackcess.scsu; | |||
import junit.framework.TestCase; | |||
/** | |||
* @author James Ahlborn | |||
*/ | |||
public class CompressTest extends TestCase | |||
{ | |||
public CompressTest(String name) throws Exception { | |||
super(name); | |||
} | |||
public void testCompression() throws Exception | |||
{ | |||
CompressMain.suiteTest(); | |||
} | |||
} |