tests. git-svn-id: https://svn.code.sf.net/p/jackcess/code/jackcess/trunk@365 f203690c-595d-4dc9-a70b-905162fa7fd2tags/rel_1_1_16
<action dev="jahlborn" type="add"> | <action dev="jahlborn" type="add"> | ||||
Add primitive support for writing unicode compressed text columns. | Add primitive support for writing unicode compressed text columns. | ||||
</action> | </action> | ||||
<action dev="jahlborn" type="add"> | |||||
Add compression code for possible future use; add compression unit | |||||
tests. | |||||
</action> | |||||
</release> | </release> | ||||
<release version="1.1.15" date="2008-06-27"> | <release version="1.1.15" date="2008-06-27"> | ||||
<action dev="jahlborn" type="fix" issue="1998225"> | <action dev="jahlborn" type="fix" issue="1998225"> |
import java.util.regex.Matcher; | import java.util.regex.Matcher; | ||||
import java.util.regex.Pattern; | import java.util.regex.Pattern; | ||||
import com.healthmarketscience.jackcess.scsu.Compress; | |||||
import com.healthmarketscience.jackcess.scsu.EndOfInputException; | import com.healthmarketscience.jackcess.scsu.EndOfInputException; | ||||
import com.healthmarketscience.jackcess.scsu.Expand; | import com.healthmarketscience.jackcess.scsu.Expand; | ||||
import com.healthmarketscience.jackcess.scsu.IllegalInputException; | import com.healthmarketscience.jackcess.scsu.IllegalInputException; | ||||
// now, see if it is all printable ASCII | // now, see if it is all printable ASCII | ||||
for(int i = 0; i < text.length(); ++i) { | for(int i = 0; i < text.length(); ++i) { | ||||
char c = text.charAt(i); | char c = text.charAt(i); | ||||
if(!isAsciiCrLfOrTab(c)) { | |||||
if(!Compress.isAsciiCrLfOrTab(c)) { | |||||
return false; | return false; | ||||
} | } | ||||
} | } | ||||
return true; | return true; | ||||
} | } | ||||
/** | |||||
* Returns true if the character is ASCII, but not a control other than | |||||
* CR, LF and TAB | |||||
*/ | |||||
private static boolean isAsciiCrLfOrTab(int ch) | |||||
{ | |||||
return ((ch >= 0x20 && ch <= 0x7F) // ASCII (non control) | |||||
|| ch == 0x09 || ch == 0x0A || ch == 0x0D); // CR/LF or TAB | |||||
} | |||||
@Override | @Override | ||||
public String toString() { | public String toString() { | ||||
StringBuilder rtn = new StringBuilder(); | StringBuilder rtn = new StringBuilder(); |
package com.healthmarketscience.jackcess.scsu; | |||||
/** | |||||
* This sample software accompanies Unicode Technical Report #6 and | |||||
* distributed as is by Unicode, Inc., subject to the following: | |||||
* | |||||
* Copyright 1996-1997 Unicode, Inc.. All Rights Reserved. | |||||
* | |||||
* Permission to use, copy, modify, and distribute this software | |||||
* without fee is hereby granted provided that this copyright notice | |||||
* appears in all copies. | |||||
* | |||||
* UNICODE, INC. MAKES NO REPRESENTATIONS OR WARRANTIES ABOUT THE | |||||
* SUITABILITY OF THE SOFTWARE, EITHER EXPRESS OR IMPLIED, INCLUDING | |||||
* BUT NOT LIMITED TO THE IMPLIED WARRANTIES OF MERCHANTABILITY, | |||||
* FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT. | |||||
* UNICODE, INC., SHALL NOT BE LIABLE FOR ANY ERRORS OR OMISSIONS, AND | |||||
* SHALL NOT BE LIABLE FOR ANY DAMAGES, INCLUDING CONSEQUENTIAL AND | |||||
* INCIDENTAL DAMAGES, SUFFERED BY YOU AS A RESULT OF USING, MODIFYING | |||||
* OR DISTRIBUTING THIS SOFTWARE OR ITS DERIVATIVES. | |||||
* | |||||
* @author Asmus Freytag | |||||
* | |||||
* @version 001 Dec 25 1996 | |||||
* @version 002 Jun 25 1997 | |||||
* @version 003 Jul 25 1997 | |||||
* @version 004 Aug 25 1997 | |||||
* | |||||
* Unicode and the Unicode logo are trademarks of Unicode, Inc., | |||||
* and are registered in some jurisdictions. | |||||
**/ | |||||
/** | |||||
This class implements a simple compression algorithm | |||||
**/ | |||||
/* | |||||
Note on exception handling | |||||
This compressor is designed so that it can be restarted after | |||||
an exception. All operations advancing input and/or output cursor | |||||
(iIn and iOut) either complete an action, or set a state (fUnicodeMode) | |||||
before updating the cursors. | |||||
*/ | |||||
public class Compress extends SCSU | |||||
{ | |||||
/** next input character to be read **/ | |||||
private int iIn; | |||||
/** next output byte to be written **/ | |||||
private int iOut; | |||||
/** start index of Unicode mode in output array, or -1 if in single byte mode **/ | |||||
private int iSCU = -1; | |||||
/** true if the next command byte is of the Uxx family */ | |||||
private boolean fUnicodeMode = false; | |||||
/** locate a window for a character given a table of offsets | |||||
@param ch - character | |||||
@param offsetTable - table of window offsets | |||||
@return true if the character fits a window from the table of windows */ | |||||
private boolean locateWindow(int ch, int[] offsetTable) | |||||
{ | |||||
// always try the current window first | |||||
int iWin = getCurrentWindow(); | |||||
// if the character fits the current window | |||||
// just use the current window | |||||
if (iWin != - 1 && ch >= offsetTable[iWin] && ch < offsetTable[iWin] + 0x80) | |||||
{ | |||||
return true; | |||||
} | |||||
// try all windows in order | |||||
for (iWin = 0; iWin < offsetTable.length; iWin++) | |||||
{ | |||||
if (ch >= offsetTable[iWin] && ch < offsetTable[iWin] + 0x80) | |||||
{ | |||||
selectWindow(iWin); | |||||
return true; | |||||
} | |||||
} | |||||
// none found | |||||
return false; | |||||
} | |||||
/** returns true if the character is ASCII, but not a control other than CR, LF and TAB */ | |||||
public static boolean isAsciiCrLfOrTab(int ch) | |||||
{ | |||||
return (ch >= 0x20 && ch <= 0x7F) // ASCII | |||||
|| ch == 0x09 || ch == 0x0A || ch == 0x0D; // CR/LF or TAB | |||||
} | |||||
/** output a run of characters in single byte mode | |||||
In single byte mode pass through characters in the ASCII range, but | |||||
quote characters overlapping with compression command codes. Runs | |||||
of characters fitting the current window are output as runs of bytes | |||||
in the range 0x80-0xFF. Checks for and validates Surrogate Pairs. | |||||
Uses and updates the current input and output cursors store in | |||||
the instance variables <i>iIn</i> and <i>iOut</i>. | |||||
@param in - input character array | |||||
@param out - output byte array | |||||
@return the next chaacter to be processed. This may be an extended character. | |||||
**/ | |||||
@SuppressWarnings("fallthrough") | |||||
public int outputSingleByteRun(char [] in, byte [] out) | |||||
throws EndOfOutputException, EndOfInputException, IllegalInputException | |||||
{ | |||||
int iWin = getCurrentWindow(); | |||||
while(iIn < in.length) | |||||
{ | |||||
int outlen = 0; | |||||
byte byte1 = 0; | |||||
byte byte2 = 0; | |||||
// get the input character | |||||
int ch = in[iIn]; | |||||
int inlen = 1; | |||||
// Check input for Surrogate pair | |||||
if ( (ch & 0xF800) == 0xD800 ) | |||||
{ | |||||
if ( (ch & 0xFC00) == 0xDC00 ) | |||||
{ | |||||
// low surrogate out of order | |||||
throw new IllegalInputException("Unpaired low surrogate: "+iIn); | |||||
} | |||||
else | |||||
{ | |||||
// have high surrogate now get low surrogate | |||||
if ( iIn >= in.length-1) | |||||
{ | |||||
// premature end of input | |||||
throw new EndOfInputException(); | |||||
} | |||||
// get the char | |||||
int ch2 = in[iIn+1]; | |||||
// make sure it's a low surrogate | |||||
if ( (ch2 & 0xFC00) != 0xDC00 ) | |||||
{ | |||||
// a low surrogate was required | |||||
throw new IllegalInputException("Unpaired high surrogate: "+(iIn+1)); | |||||
} | |||||
// combine the two values | |||||
ch = ((ch - 0xD800)<<10 | (ch2-0xDC00))+0x10000; | |||||
// ch = ch<<10 + ch2 - 0x36F0000; | |||||
inlen = 2; | |||||
} | |||||
} | |||||
// ASCII Letter, NUL, CR, LF and TAB are always passed through | |||||
if (isAsciiCrLfOrTab(ch) || ch == 0) | |||||
{ | |||||
// pass through directcly | |||||
byte2 = (byte)(ch & 0x7F); | |||||
outlen = 1; | |||||
} | |||||
// All other control codes must be quoted | |||||
else if (ch < 0x20) | |||||
{ | |||||
byte1 = SQ0; | |||||
byte2 = (byte)(ch); | |||||
outlen = 2; | |||||
} | |||||
// Letters that fit the current dynamic window | |||||
else if (ch >= dynamicOffset[iWin] && ch < dynamicOffset[iWin] + 0x80) | |||||
{ | |||||
ch -= dynamicOffset[iWin]; | |||||
byte2 = (byte)(ch | 0x80); | |||||
outlen = 1; | |||||
} | |||||
// check for room in the output array | |||||
if (iOut + outlen >= out.length) | |||||
{ | |||||
throw new EndOfOutputException(); | |||||
} | |||||
switch(outlen) | |||||
{ | |||||
default: | |||||
// need to use some other compression mode for this | |||||
// character so we terminate this loop | |||||
return ch; // input not finished | |||||
// output the characters | |||||
case 2: | |||||
out[iOut++] = byte1; | |||||
// fall through | |||||
case 1: | |||||
out[iOut++] = byte2; | |||||
break; | |||||
} | |||||
// advance input pointer | |||||
iIn += inlen; | |||||
} | |||||
return 0; // input all used up | |||||
} | |||||
/** quote a single character in single byte mode | |||||
Quoting a character (aka 'non-locking shift') gives efficient access | |||||
to characters that occur in isolation--usually punctuation characters. | |||||
When quoting a character from a dynamic window use 0x80 - 0xFF, when | |||||
quoting a character from a static window use 0x00-0x7f. | |||||
@param ch - character to be quoted | |||||
@param out - output byte array | |||||
**/ | |||||
private void quoteSingleByte(int ch, byte [] out) | |||||
throws EndOfOutputException | |||||
{ | |||||
Debug.out("Quoting SingleByte ", ch); | |||||
int iWin = getCurrentWindow(); | |||||
// check for room in the output array | |||||
if (iOut >= out.length -2) | |||||
{ | |||||
throw new EndOfOutputException(); | |||||
} | |||||
// Output command byte followed by | |||||
out[iOut++] = (byte)(SQ0 + iWin); | |||||
// Letter that fits the current dynamic window | |||||
if (ch >= dynamicOffset[iWin] && ch < dynamicOffset[iWin] + 0x80) | |||||
{ | |||||
ch -= dynamicOffset[iWin]; | |||||
out[iOut++] = (byte)(ch | 0x80); | |||||
} | |||||
// Letter that fits the current static window | |||||
else if (ch >= staticOffset[iWin] && ch < staticOffset[iWin] + 0x80) | |||||
{ | |||||
ch -= staticOffset[iWin]; | |||||
out[iOut++] = (byte)ch; | |||||
} | |||||
else | |||||
{ | |||||
throw new IllegalStateException("ch = "+ch+" not valid in quoteSingleByte. Internal Compressor Error"); | |||||
} | |||||
// advance input pointer | |||||
iIn ++; | |||||
Debug.out("New input: ", iIn); | |||||
} | |||||
/** output a run of characters in Unicode mode | |||||
A run of Unicode mode consists of characters which are all in the | |||||
range of non-compressible characters or isolated occurrence | |||||
of any other characters. Characters in the range 0xE00-0xF2FF must | |||||
be quoted to avoid overlap with the Unicode mode compression command codes. | |||||
Uses and updates the current input and output cursors store in | |||||
the instance variables <i>iIn</i> and <i>iOut</i>. | |||||
NOTE: Characters from surrogate pairs are passed through and unlike single | |||||
byte mode no checks are made for unpaired surrogate characters. | |||||
@param in - input character array | |||||
@param out - output byte array | |||||
@return the next input character to be processed | |||||
**/ | |||||
public char outputUnicodeRun(char [] in, byte [] out) | |||||
throws EndOfOutputException | |||||
{ | |||||
// current character | |||||
char ch = 0; | |||||
while(iIn < in.length) | |||||
{ | |||||
// get current input and set default output length | |||||
ch = in[iIn]; | |||||
int outlen = 2; | |||||
// Characters in these ranges could potentially be compressed. | |||||
// We require 2 or more compressible characters to break the run | |||||
if (isCompressible(ch)) | |||||
{ | |||||
// check whether we can look ahead | |||||
if( iIn < in.length - 1) | |||||
{ | |||||
// DEBUG | |||||
Debug.out("is-comp: ",ch); | |||||
char ch2 = in[iIn + 1]; | |||||
if (isCompressible(ch2)) | |||||
{ | |||||
// at least 2 characters are compressible | |||||
// break the run | |||||
break; | |||||
} | |||||
//DEBUG | |||||
Debug.out("no-comp: ",ch2); | |||||
} | |||||
// If we get here, the current character is only character | |||||
// left in the input or it is followed by a non-compressible | |||||
// character. In neither case do we gain by breaking the | |||||
// run, so we proceed to output the character. | |||||
if (ch >= 0xE000 && ch <= 0xF2FF) | |||||
{ | |||||
// Characters in this range need to be escaped | |||||
outlen = 3; | |||||
} | |||||
} | |||||
// check that there is enough room to output the character | |||||
if(iOut >= out.length - outlen) | |||||
{ | |||||
// DEBUG | |||||
Debug.out("End of Output @", iOut); | |||||
// if we got here, we ran out of space in the output array | |||||
throw new EndOfOutputException(); | |||||
} | |||||
// output any characters that cannot be compressed, | |||||
if (outlen == 3) | |||||
{ | |||||
// output the quote character | |||||
out[iOut++] = UQU; | |||||
} | |||||
// pass the Unicode character in MSB,LSB order | |||||
out[iOut++] = (byte)(ch >>> 8); | |||||
out[iOut++] = (byte)(ch & 0xFF); | |||||
// advance input cursor | |||||
iIn++; | |||||
} | |||||
// return the last character | |||||
return ch; | |||||
} | |||||
static int iNextWindow = 3; | |||||
/** redefine a window so it surrounds a given character value | |||||
For now, this function uses window 3 exclusively (window 4 | |||||
for extended windows); | |||||
@return true if a window was successfully defined | |||||
@param ch - character around which window is positioned | |||||
@param out - output byte array | |||||
@param fCurUnicodeMode - type of window | |||||
**/ | |||||
private boolean positionWindow(int ch, byte [] out, boolean fCurUnicodeMode) | |||||
throws IllegalInputException, EndOfOutputException | |||||
{ | |||||
int iWin = iNextWindow % 8; // simple LRU | |||||
int iPosition = 0; | |||||
// iPosition 0 is a reserved value | |||||
if (ch < 0x80) | |||||
{ | |||||
throw new IllegalStateException("ch < 0x80"); | |||||
//return false; | |||||
} | |||||
// Check the fixed offsets | |||||
for (int i = 0; i < fixedOffset.length; i++) | |||||
{ | |||||
if (ch >= fixedOffset[i] && ch < fixedOffset[i] + 0x80) | |||||
{ | |||||
iPosition = i; | |||||
break; | |||||
} | |||||
} | |||||
if (iPosition != 0) | |||||
{ | |||||
// DEBUG | |||||
Debug.out("FIXED position is ", iPosition + 0xF9); | |||||
// ch fits in a fixed offset window position | |||||
dynamicOffset[iWin] = fixedOffset[iPosition]; | |||||
iPosition += 0xF9; | |||||
} | |||||
else if (ch < 0x3400) | |||||
{ | |||||
// calculate a window position command and set the offset | |||||
iPosition = ch >>> 7; | |||||
dynamicOffset[iWin] = ch & 0xFF80; | |||||
Debug.out("Offset="+dynamicOffset[iWin]+", iPosition="+iPosition+" for char", ch); | |||||
} | |||||
else if (ch < 0xE000) | |||||
{ | |||||
// attempt to place a window where none can go | |||||
return false; | |||||
} | |||||
else if (ch <= 0xFFFF) | |||||
{ | |||||
// calculate a window position command, accounting | |||||
// for the gap in position values, and set the offset | |||||
iPosition = ((ch - gapOffset)>>> 7); | |||||
dynamicOffset[iWin] = ch & 0xFF80; | |||||
Debug.out("Offset="+dynamicOffset[iWin]+", iPosition="+iPosition+" for char", ch); | |||||
} | |||||
else | |||||
{ | |||||
// if we get here, the character is in the extended range. | |||||
// Always use Window 4 to define an extended window | |||||
iPosition = (ch - 0x10000) >>> 7; | |||||
// DEBUG | |||||
Debug.out("Try position Window at ", iPosition); | |||||
iPosition |= iWin << 13; | |||||
dynamicOffset[iWin] = ch & 0x1FFF80; | |||||
} | |||||
// Outputting window defintion command for the general cases | |||||
if ( iPosition < 0x100 && iOut < out.length-1) | |||||
{ | |||||
out[iOut++] = (byte) ((fCurUnicodeMode ? UD0 : SD0) + iWin); | |||||
out[iOut++] = (byte) (iPosition & 0xFF); | |||||
} | |||||
// Output an extended window definiton command | |||||
else if ( iPosition >= 0x100 && iOut < out.length - 2) | |||||
{ | |||||
Debug.out("Setting extended window at ", iPosition); | |||||
out[iOut++] = (fCurUnicodeMode ? UDX : SDX); | |||||
out[iOut++] = (byte) ((iPosition >>> 8) & 0xFF); | |||||
out[iOut++] = (byte) (iPosition & 0xFF); | |||||
} | |||||
else | |||||
{ | |||||
throw new EndOfOutputException(); | |||||
} | |||||
selectWindow(iWin); | |||||
iNextWindow++; | |||||
return true; | |||||
} | |||||
/** | |||||
compress a Unicode character array with some simplifying assumptions | |||||
**/ | |||||
public int simpleCompress(char [] in, int iStartIn, byte[] out, int iStartOut) | |||||
throws IllegalInputException, EndOfInputException, EndOfOutputException | |||||
{ | |||||
iIn = iStartIn; | |||||
iOut = iStartOut; | |||||
while (iIn < in.length) | |||||
{ | |||||
int ch; | |||||
// previously we switched to a Unicode run | |||||
if (iSCU != -1) | |||||
{ | |||||
Debug.out("Remaining", in, iIn); | |||||
Debug.out("Output until ["+iOut+"]: ", out); | |||||
// output characters as Unicode | |||||
ch = outputUnicodeRun(in, out); | |||||
// for single character Unicode runs (3 bytes) use quote | |||||
if (iOut - iSCU == 3 ) | |||||
{ | |||||
// go back and fix up the SCU to an SQU instead | |||||
out[iSCU] = SQU; | |||||
iSCU = -1; | |||||
continue; | |||||
} | |||||
else | |||||
{ | |||||
iSCU = -1; | |||||
fUnicodeMode = true; | |||||
} | |||||
} | |||||
// next, try to output characters as single byte run | |||||
else | |||||
{ | |||||
ch = outputSingleByteRun(in, out); | |||||
} | |||||
// check whether we still have input | |||||
if (iIn == in.length) | |||||
{ | |||||
break; // no more input | |||||
} | |||||
// if we get here, we have a consistent value for ch, whether or | |||||
// not it is an regular or extended character. Locate or define a | |||||
// Window for the current character | |||||
Debug.out("Output so far: ", out); | |||||
Debug.out("Routing ch="+ch+" for Input", in, iIn); | |||||
// Check that we have enough room to output the command byte | |||||
if (iOut >= out.length - 1) | |||||
{ | |||||
throw new EndOfOutputException(); | |||||
} | |||||
// In order to switch away from Unicode mode, it is necessary | |||||
// to select (or define) a window. If the characters that follow | |||||
// the Unicode range are ASCII characters, we can't use them | |||||
// to decide which window to select, since ASCII characters don't | |||||
// influence window settings. This loop looks ahead until it finds | |||||
// one compressible character that isn't in the ASCII range. | |||||
for (int ich = iIn; ch < 0x80; ich++) | |||||
{ | |||||
if (ich == in.length || !isCompressible(in[ich])) | |||||
{ | |||||
// if there are only ASCII characters left, | |||||
ch = in[iIn]; | |||||
break; | |||||
} | |||||
ch = in[ich]; // lookahead for next non-ASCII char | |||||
} | |||||
// The character value contained in ch here will only be used to select | |||||
// output modes. Actual output of characters starts with in[iIn] and | |||||
// only takes place near the top of the loop. | |||||
int iprevWindow = getCurrentWindow(); | |||||
// try to locate a dynamic window | |||||
if (ch < 0x80 || locateWindow(ch, dynamicOffset)) | |||||
{ | |||||
Debug.out("located dynamic window "+getCurrentWindow()+" at ", iOut+1); | |||||
// lookahead to use SQn instead of SCn for single | |||||
// character interruptions of runs in current window | |||||
if(!fUnicodeMode && iIn < in.length -1) | |||||
{ | |||||
char ch2 = in[iIn+1]; | |||||
if (ch2 >= dynamicOffset[iprevWindow] && | |||||
ch2 < dynamicOffset[iprevWindow] + 0x80) | |||||
{ | |||||
quoteSingleByte(ch, out); | |||||
selectWindow(iprevWindow); | |||||
continue; | |||||
} | |||||
} | |||||
out[iOut++] = (byte)((fUnicodeMode ? UC0 : SC0) + getCurrentWindow()); | |||||
fUnicodeMode = false; | |||||
} | |||||
// try to locate a static window | |||||
else if (!fUnicodeMode && locateWindow(ch, staticOffset)) | |||||
{ | |||||
// static windows are not accessible from Unicode mode | |||||
Debug.out("located a static window", getCurrentWindow()); | |||||
quoteSingleByte(ch, out); | |||||
selectWindow(iprevWindow); // restore current Window settings | |||||
continue; | |||||
} | |||||
// try to define a window around ch | |||||
else if (positionWindow(ch, out, fUnicodeMode) ) | |||||
{ | |||||
fUnicodeMode = false; | |||||
} | |||||
// If all else fails, start a Unicode run | |||||
else | |||||
{ | |||||
iSCU = iOut; | |||||
out[iOut++] = SCU; | |||||
continue; | |||||
} | |||||
} | |||||
return iOut - iStartOut; | |||||
} | |||||
public byte[] compress(String inStr) | |||||
throws IllegalInputException, EndOfInputException | |||||
{ | |||||
// Running out of room for output can cause non-optimal | |||||
// compression. In order to not slow down compression too | |||||
// much, not all intermediate state is constantly saved. | |||||
byte [] out = new byte[inStr.length() * 2]; | |||||
char [] in = inStr.toCharArray(); | |||||
//DEBUG | |||||
Debug.out("compress input: ",in); | |||||
reset(); | |||||
while(true) | |||||
{ | |||||
try | |||||
{ | |||||
simpleCompress(in, charsRead(), out, bytesWritten()); | |||||
// if we get here things went fine. | |||||
break; | |||||
} | |||||
catch (EndOfOutputException e) | |||||
{ | |||||
// create a larger output buffer and continue | |||||
byte [] largerOut = new byte[out.length * 2]; | |||||
System.arraycopy(out, 0, largerOut, 0, out.length); | |||||
out = largerOut; | |||||
} | |||||
} | |||||
byte [] trimmedOut = new byte[bytesWritten()]; | |||||
System.arraycopy(out, 0, trimmedOut, 0, trimmedOut.length); | |||||
out = trimmedOut; | |||||
Debug.out("compress output: ", out); | |||||
return out; | |||||
} | |||||
/** reset is only needed to bail out after an exception and | |||||
restart with new input */ | |||||
@Override | |||||
public void reset() | |||||
{ | |||||
super.reset(); | |||||
fUnicodeMode = false; | |||||
iSCU = - 1; | |||||
} | |||||
/** returns the number of bytes written **/ | |||||
public int bytesWritten() | |||||
{ | |||||
return iOut; | |||||
} | |||||
/** returns the number of bytes written **/ | |||||
public int charsRead() | |||||
{ | |||||
return iIn; | |||||
} | |||||
} |
package com.healthmarketscience.jackcess.scsu; | |||||
/** | |||||
* This sample software accompanies Unicode Technical Report #6 and | |||||
* distributed as is by Unicode, Inc., subject to the following: | |||||
* | |||||
* Copyright 1996-1997 Unicode, Inc.. All Rights Reserved. | |||||
* | |||||
* Permission to use, copy, modify, and distribute this software | |||||
* without fee is hereby granted provided that this copyright notice | |||||
* appears in all copies. | |||||
* | |||||
* UNICODE, INC. MAKES NO REPRESENTATIONS OR WARRANTIES ABOUT THE | |||||
* SUITABILITY OF THE SOFTWARE, EITHER EXPRESS OR IMPLIED, INCLUDING | |||||
* BUT NOT LIMITED TO THE IMPLIED WARRANTIES OF MERCHANTABILITY, | |||||
* FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT. | |||||
* UNICODE, INC., SHALL NOT BE LIABLE FOR ANY ERRORS OR OMISSIONS, AND | |||||
* SHALL NOT BE LIABLE FOR ANY DAMAGES, INCLUDING CONSEQUENTIAL AND | |||||
* INCIDENTAL DAMAGES, SUFFERED BY YOU AS A RESULT OF USING, MODIFYING | |||||
* OR DISTRIBUTING THIS SOFTWARE OR ITS DERIVATIVES. | |||||
* | |||||
* @author Asmus Freytag | |||||
* | |||||
* @version 001 Dec 25 1996 | |||||
* @version 002 Jun 25 1997 | |||||
* @version 003 Jul 25 1997 | |||||
* | |||||
* Unicode and the Unicode logo are trademarks of Unicode, Inc., | |||||
* and are registered in some jurisdictions. | |||||
**/ | |||||
/**
 * The output byte array is full: there is no room left to write the
 * remaining output.
 */
public class EndOfOutputException
    extends java.lang.Exception
{
    private static final long serialVersionUID = 1L;

    /** Creates the exception with a default "output is full" message. */
    public EndOfOutputException(){
        super("The output byte array is full");
    }

    /**
     * Creates the exception with a caller supplied message.
     * @param s the detail message
     */
    public EndOfOutputException(String s) {
        super(s);
    }
}
/** reset is called to start with new input, w/o creating a new | /** reset is called to start with new input, w/o creating a new | ||||
instance */ | instance */ | ||||
@Override | |||||
public void reset() | public void reset() | ||||
{ | { | ||||
iOut = 0; | iOut = 0; |
package com.healthmarketscience.jackcess.scsu; | |||||
import java.io.*; | |||||
import java.util.*; | |||||
/** | |||||
* This sample software accompanies Unicode Technical Report #6 and | |||||
* distributed as is by Unicode, Inc., subject to the following: | |||||
* | |||||
* Copyright 1996-1998 Unicode, Inc.. All Rights Reserved. | |||||
* | |||||
* Permission to use, copy, modify, and distribute this software | |||||
* without fee is hereby granted provided that this copyright notice | |||||
* appears in all copies. | |||||
* | |||||
* UNICODE, INC. MAKES NO REPRESENTATIONS OR WARRANTIES ABOUT THE | |||||
* SUITABILITY OF THE SOFTWARE, EITHER EXPRESS OR IMPLIED, INCLUDING | |||||
* BUT NOT LIMITED TO THE IMPLIED WARRANTIES OF MERCHANTABILITY, | |||||
* FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT. | |||||
* UNICODE, INC., SHALL NOT BE LIABLE FOR ANY ERRORS OR OMISSIONS, AND | |||||
* SHALL NOT BE LIABLE FOR ANY DAMAGES, INCLUDING CONSEQUENTIAL AND | |||||
* INCIDENTAL DAMAGES, SUFFERED BY YOU AS A RESULT OF USING, MODIFYING | |||||
* OR DISTRIBUTING THIS SOFTWARE OR ITS DERIVATIVES. | |||||
* | |||||
* @author Asmus Freytag | |||||
* | |||||
* @version 001 Dec 25 1996 | |||||
* @version 002 Jun 25 1997 | |||||
* @version 003 Jul 25 1997 | |||||
* @version 004 Aug 25 1997 | |||||
* @version 005 Sep 30 1998 | |||||
* | |||||
* Unicode and the Unicode logo are trademarks of Unicode, Inc., | |||||
* and are registered in some jurisdictions. | |||||
**/ | |||||
/** | |||||
Class CompressMain | |||||
A small commandline driver interface for the compression routines | |||||
Use the /? to get usage | |||||
*/ | |||||
public class CompressMain | |||||
{ | |||||
static void usage() | |||||
{ | |||||
System.err.println("java CompressMain /? : this usage information\n"); | |||||
System.err.println("java CompressMain /random : random test\n"); | |||||
System.err.println("java CompressMain /suite : suite test\n"); | |||||
System.err.println("java CompressMain /suite <file> : file test (file data may include \\uXXXX)\n"); | |||||
System.err.println("java CompressMain <string> : string test (string may include \\uXXXX)\n"); | |||||
System.err.println("java CompressMain /roundtrip <file>: check Unicode file for roundtrip\n"); | |||||
System.err.println("java CompressMain /compress <file> : compresses Unicode files (no \\uXXXX)\n"); | |||||
System.err.println("java CompressMain /expand <file> : expands into Unicode files\n"); | |||||
System.err.println("java CompressMain /byteswap <files>: swaps byte order of Unicode files\n"); | |||||
System.err.println("java CompressMain /display <files> : like expand, but creates a dump instead\n"); | |||||
System.err.println("java CompressMain /parse <files> : parses \\uXXXX into binary Unicode\n"); | |||||
} | |||||
static void analyze(String text, int inlength, String result, int outlength) | |||||
{ | |||||
boolean fSuccess = text.equals(result); | |||||
Debug.out(fSuccess ? "Round trip OK" : "Round trip FAILED"); | |||||
if (!fSuccess && result != null) | |||||
{ | |||||
int iLim = Math.min(text.length(), result.length()); | |||||
for (int i = 0; i < iLim; i++) | |||||
{ | |||||
if (text.charAt(i) != result.charAt(i)) | |||||
{ | |||||
Debug.out("First Mismatch at "+ i +"=", result.charAt(i) ); | |||||
Debug.out("Original character "+ i +"=", text.charAt(i) ); | |||||
break; | |||||
} | |||||
} | |||||
} | |||||
else | |||||
{ | |||||
Debug.out("Compressed: "+inlength+" chars to "+outlength+" bytes."); | |||||
Debug.out(" Ratio: "+(outlength == 0 ? 0 :(outlength * 50 / inlength))+"%."); | |||||
} | |||||
} | |||||
static void test2(String text) | |||||
{ | |||||
byte bytes[] = null; | |||||
String result = null; | |||||
Debug.out("SCSU:\n"); | |||||
Compress compressor = new Compress(); | |||||
try | |||||
{ | |||||
bytes = compressor.compress(text); | |||||
Expand display = new Expand(); | |||||
result = display.expand(bytes); | |||||
Debug.out("Input: ", text.toCharArray()); | |||||
Debug.out("Result: ", result.toCharArray()); | |||||
Debug.out(""); | |||||
Expand expander = new Expand(); | |||||
result = expander.expand(bytes); | |||||
} | |||||
catch (Exception e) | |||||
{ | |||||
System.out.println(e); | |||||
} | |||||
int inlength = compressor.charsRead(); | |||||
int outlength = compressor.bytesWritten(); | |||||
analyze(text, inlength, result, outlength); | |||||
} | |||||
static void test(String text) throws Exception | |||||
{ | |||||
test(text, false); | |||||
} | |||||
static void test(String text, boolean shouldFail) | |||||
throws Exception | |||||
{ | |||||
// Create an instance of the compressor | |||||
Compress compressor = new Compress(); | |||||
byte [] bytes = null; | |||||
String result = null; | |||||
Exception failure = null; | |||||
try { | |||||
// perform compression | |||||
bytes = compressor.compress(text); | |||||
} | |||||
catch(Exception e) | |||||
{ | |||||
failure = e; | |||||
} | |||||
if(shouldFail) { | |||||
if(failure == null) { | |||||
throw new RuntimeException("Did not fail"); | |||||
} | |||||
return; | |||||
} | |||||
if(failure != null) { | |||||
throw failure; | |||||
} | |||||
Expand expander = new Expand(); | |||||
// perform expansion | |||||
result = expander.expand(bytes); | |||||
// analyze the results | |||||
int inlength = compressor.charsRead(); | |||||
int outlength = compressor.bytesWritten(); | |||||
analyze(text, inlength, result, outlength); | |||||
} | |||||
public static void display(byte [] input) | |||||
{ | |||||
try | |||||
{ | |||||
Expand expand = new Expand(); | |||||
String text = expand.expand(input); | |||||
Debug.out(text.toCharArray()); | |||||
} | |||||
catch (Exception e) | |||||
{ | |||||
System.out.println(e); | |||||
} | |||||
} | |||||
public static String parse(String input) | |||||
{ | |||||
StringTokenizer st = new StringTokenizer(input, "\\", true); | |||||
Debug.out("Input: ", input); | |||||
StringBuffer sb = new StringBuffer(); | |||||
while(st.hasMoreTokens()) | |||||
{ | |||||
String token = st.nextToken(); | |||||
Debug.out("Token: ", token); | |||||
if (token.charAt(0) == '\\' && token.length() == 1) | |||||
{ | |||||
if(st.hasMoreTokens()) | |||||
{ | |||||
token = st.nextToken(); | |||||
} | |||||
if(token.charAt(0) == 'u') | |||||
{ | |||||
Debug.out("Token: "+ token+ " ", sb.toString()); | |||||
String hexnum; | |||||
if (token.length() > 5) | |||||
{ | |||||
hexnum = token.substring(1,5); | |||||
token = token.substring(5); | |||||
} | |||||
else | |||||
{ | |||||
hexnum = token.substring(1); | |||||
token = ""; | |||||
} | |||||
sb.append((char)Integer.parseInt(hexnum, 16)); | |||||
} | |||||
} | |||||
sb.append(token); | |||||
} | |||||
return sb.toString(); | |||||
} | |||||
public static void randomTest(int nTest) | |||||
throws Exception | |||||
{ | |||||
Random random = new Random(); | |||||
for(int n=0; n < nTest; n++) | |||||
{ | |||||
int iLen = (int) (20 * random.nextFloat()); | |||||
StringBuffer sb = new StringBuffer(iLen); | |||||
for(int i = 0; i < iLen; i++) | |||||
{ | |||||
sb.append((char) (0xFFFF * random.nextFloat())); | |||||
} | |||||
test(sb.toString()); | |||||
} | |||||
} | |||||
@SuppressWarnings("deprecation") | |||||
public static void fileTest(String name) | |||||
throws Exception | |||||
{ | |||||
DataInputStream dis = new DataInputStream(new FileInputStream(name)); | |||||
int iLine = 0; | |||||
while(dis.available() != 0) | |||||
{ | |||||
String line = dis.readLine(); | |||||
Debug.out("Line "+ iLine++ +" "+line); | |||||
test(parse(line), false ); //false);// initially no debug info | |||||
} | |||||
} | |||||
public static void displayFile(String name) | |||||
throws IOException | |||||
{ | |||||
DataInputStream dis = new DataInputStream(new FileInputStream(name)); | |||||
byte bytes[] = new byte[dis.available()]; | |||||
dis.read(bytes); | |||||
display(bytes); | |||||
} | |||||
public static void decodeTest(String name) | |||||
throws IOException | |||||
{ | |||||
DataInputStream dis = new DataInputStream(new FileInputStream(name)); | |||||
byte bytes[] = new byte[dis.available()]; | |||||
dis.read(bytes); | |||||
Expand expand = new Expand(); | |||||
char [] chars = null; | |||||
try | |||||
{ | |||||
String text = expand.expand(bytes); | |||||
chars = text.toCharArray(); | |||||
} | |||||
catch (Exception e) | |||||
{ | |||||
System.out.println(e); | |||||
} | |||||
int inlength = expand.bytesRead(); | |||||
int iDot = name.lastIndexOf('.'); | |||||
StringBuffer sb = new StringBuffer(name); | |||||
sb.setLength(iDot + 1); | |||||
sb.append("txt"); | |||||
String outName = sb.toString(); | |||||
int outlength = expand.charsWritten(); | |||||
Debug.out("Expanded "+name+": "+inlength+" bytes to "+outName+" " +outlength+" chars." + " Ratio: "+(outlength == 0 ? 0 :(outlength * 200 / inlength))+"%."); | |||||
if (chars == null) | |||||
return; | |||||
writeUnicodeFile(outName, chars); | |||||
} | |||||
/** most of the next 3 functions should not be needed by JDK11 and later */ | |||||
private static int iMSB = 1; | |||||
public static String readUnicodeFile(String name) | |||||
{ | |||||
try | |||||
{ | |||||
FileInputStream dis = new FileInputStream(name); | |||||
byte b[] = new byte[2]; | |||||
StringBuffer sb = new StringBuffer(); | |||||
char ch = 0; | |||||
iMSB = 1; | |||||
int i = 0; | |||||
for(i = 0; (dis.available() != 0); i++) | |||||
{ | |||||
b[i%2] = (byte) dis.read(); | |||||
if ((i & 1) == 1) | |||||
{ | |||||
ch = Expand.charFromTwoBytes(b[(i + iMSB)%2], b[(i + iMSB + 1) % 2]); | |||||
} | |||||
else | |||||
{ | |||||
continue; | |||||
} | |||||
if (i == 1 && ch == '\uFEFF') | |||||
continue; // throw away byte order mark | |||||
if (i == 1 && ch == '\uFFFE') | |||||
{ | |||||
iMSB ++; // flip byte order | |||||
continue; // throw away byte order mark | |||||
} | |||||
sb.append(ch); | |||||
} | |||||
return sb.toString(); | |||||
} | |||||
catch (IOException e) | |||||
{ | |||||
System.err.println(e); | |||||
return ""; | |||||
} | |||||
} | |||||
public static void writeUnicodeFile(String outName, char [] chars) | |||||
throws IOException | |||||
{ | |||||
DataOutputStream dos = new DataOutputStream(new FileOutputStream(outName)); | |||||
if ((iMSB & 1) == 1) | |||||
{ | |||||
dos.writeByte(0xFF); | |||||
dos.writeByte(0xFE); | |||||
} | |||||
else | |||||
{ | |||||
dos.writeByte(0xFE); | |||||
dos.writeByte(0xFF); | |||||
} | |||||
byte b[] = new byte[2]; | |||||
for (int ich = 0; ich < chars.length; ich++) | |||||
{ | |||||
b[(iMSB + 0)%2] = (byte) (chars[ich] >>> 8); | |||||
b[(iMSB + 1)%2] = (byte) (chars[ich] & 0xFF); | |||||
dos.write(b, 0, 2); | |||||
} | |||||
} | |||||
static void byteswap(String name) | |||||
throws IOException | |||||
{ | |||||
String text = readUnicodeFile(name); | |||||
char chars[] = text.toCharArray(); | |||||
writeUnicodeFile(name, chars); | |||||
} | |||||
@SuppressWarnings("deprecation") | |||||
public static void parseFile(String name) | |||||
throws IOException | |||||
{ | |||||
DataInputStream dis = new DataInputStream(new FileInputStream(name)); | |||||
byte bytes[] = new byte[dis.available()]; | |||||
dis.read(bytes); | |||||
// simplistic test | |||||
int bom = (char) bytes[0] + (char) bytes[1]; | |||||
if (bom == 131069) | |||||
{ | |||||
// FEFF or FFFE detected (either one sums to 131069) | |||||
Debug.out(name + " is already in Unicode!"); | |||||
return; | |||||
} | |||||
// definitely assumes an ASCII file at this point | |||||
String text = new String(bytes, 0); | |||||
char chars[] = parse(text).toCharArray(); | |||||
writeUnicodeFile(name, chars); | |||||
return; | |||||
} | |||||
public static void encodeTest(String name) | |||||
throws Exception | |||||
{ | |||||
String text = readUnicodeFile(name); | |||||
// Create an instance of the compressor | |||||
Compress compressor = new Compress(); | |||||
byte [] bytes = null; | |||||
// perform compression | |||||
bytes = compressor.compress(text); | |||||
int inlength = compressor.charsRead(); | |||||
int iDot = name.lastIndexOf('.'); | |||||
StringBuffer sb = new StringBuffer(name); | |||||
sb.setLength(iDot + 1); | |||||
sb.append("csu"); | |||||
String outName = sb.toString(); | |||||
DataOutputStream dos = new DataOutputStream(new FileOutputStream(outName)); | |||||
dos.write(bytes, 0, bytes.length); | |||||
int outlength = compressor.bytesWritten(); | |||||
Debug.out("Compressed "+name+": "+inlength+" chars to "+outName+" " +outlength+" bytes." + " Ratio: "+(outlength == 0 ? 0 :(outlength * 50 / inlength))+"%."); | |||||
} | |||||
public static void roundtripTest(String name) | |||||
throws Exception | |||||
{ | |||||
test(readUnicodeFile(name), false);// no debug info | |||||
} | |||||
/** The Main function */ | |||||
public static void main(String args[]) | |||||
throws Exception | |||||
{ | |||||
int iArg = args.length; | |||||
try | |||||
{ | |||||
if (iArg != 0) | |||||
{ | |||||
if (args[0].equalsIgnoreCase("/compress")) | |||||
{ | |||||
while (--iArg > 0) | |||||
{ | |||||
encodeTest(args[args.length - iArg]); | |||||
} | |||||
} | |||||
else if (args[0].equalsIgnoreCase("/parse")) | |||||
{ | |||||
while (--iArg > 0) | |||||
{ | |||||
parseFile(args[args.length - iArg]); | |||||
} | |||||
} | |||||
else if (args[0].equalsIgnoreCase("/expand")) | |||||
{ | |||||
while (--iArg > 0) | |||||
{ | |||||
decodeTest(args[args.length - iArg]); | |||||
} | |||||
} | |||||
else if (args[0].equalsIgnoreCase("/display")) | |||||
{ | |||||
while (--iArg > 0) | |||||
{ | |||||
displayFile(args[args.length - iArg]); | |||||
} | |||||
} | |||||
else if (args[0].equalsIgnoreCase("/roundtrip")) | |||||
{ | |||||
while (--iArg > 0) | |||||
{ | |||||
roundtripTest(args[args.length - iArg]); | |||||
} | |||||
} | |||||
else if (args[0].equalsIgnoreCase("/byteswap")) | |||||
{ | |||||
while (--iArg > 0) | |||||
{ | |||||
byteswap(args[args.length - iArg]); | |||||
} | |||||
}else if (args[0].equalsIgnoreCase("/random")) | |||||
{ | |||||
randomTest(8); | |||||
} | |||||
else if (args[0].equalsIgnoreCase("/suite")) | |||||
{ | |||||
if (iArg == 1) | |||||
{ | |||||
suiteTest(); | |||||
} | |||||
else | |||||
{ | |||||
while (--iArg > 0) | |||||
{ | |||||
fileTest(args[args.length - iArg]); | |||||
} | |||||
} | |||||
} | |||||
else if (args[0].equalsIgnoreCase("/?")) | |||||
{ | |||||
usage(); | |||||
} | |||||
else | |||||
{ | |||||
while (iArg > 0) | |||||
{ | |||||
test2(parse(args[--iArg])); | |||||
} | |||||
} | |||||
} | |||||
else | |||||
{ | |||||
usage(); | |||||
} | |||||
} | |||||
catch (IOException e) | |||||
{ | |||||
System.err.println(e); | |||||
} | |||||
try | |||||
{ | |||||
System.err.println("Done. Press enter to exit"); | |||||
System.in.read(); | |||||
} | |||||
catch (IOException e) | |||||
{ | |||||
} | |||||
} | |||||
static void suiteTest() | |||||
throws Exception | |||||
{ | |||||
Debug.out("Standard Compression test suite:"); | |||||
test("Hello \u9292 \u9192 World!"); | |||||
test("Hell\u0429o \u9292 \u9192 W\u00e4rld!"); | |||||
test("Hell\u0429o \u9292 \u9292W\u00e4rld!"); | |||||
test("\u0648\u06c8"); // catch missing reset | |||||
test("\u0648\u06c8"); | |||||
test("\u4444\uE001"); // lowest quotable | |||||
test("\u4444\uf2FF"); // highest quotable | |||||
test("\u4444\uf188\u4444"); | |||||
test("\u4444\uf188\uf288"); | |||||
test("\u4444\uf188abc\0429\uf288"); | |||||
test("\u9292\u2222"); | |||||
test("Hell\u0429\u04230o \u9292 \u9292W\u00e4\u0192rld!"); | |||||
test("Hell\u0429o \u9292 \u9292W\u00e4rld!"); | |||||
test("Hello World!123456"); | |||||
test("Hello W\u0081\u011f\u0082!"); // Latin 1 run | |||||
test("abc\u0301\u0302"); // uses SQn for u301 u302 | |||||
test("abc\u4411d"); // uses SQU | |||||
test("abc\u4411\u4412d");// uses SCU | |||||
test("abc\u0401\u0402\u047f\u00a5\u0405"); // uses SQn for ua5 | |||||
test("\u9191\u9191\u3041\u9191\u3041\u3041\u3000"); // SJIS like data | |||||
test("\u9292\u2222"); | |||||
test("\u9191\u9191\u3041\u9191\u3041\u3041\u3000"); | |||||
test("\u9999\u3051\u300c\u9999\u9999\u3060\u9999\u3065\u3065\u3065\u300c"); | |||||
test("\u3000\u266a\u30ea\u30f3\u30b4\u53ef\u611b\u3044\u3084\u53ef\u611b\u3044\u3084\u30ea\u30f3\u30b4\u3002"); | |||||
test(""); // empty input | |||||
test("\u0000"); // smallest BMP character | |||||
test("\uFFFF"); // largest BMP character | |||||
test("\ud800\udc00"); // smallest surrogate | |||||
test("\ud8ff\udcff"); // largest surrogate pair | |||||
Debug.out("\nTHESE TESTS ARE SUPPOSED TO FAIL:"); | |||||
test("\ud800 \udc00", true); // unpaired surrogate (1) | |||||
test("\udc00", true); // unpaired surrogate (2) | |||||
test("\ud800", true); // unpaired surrogate (3) | |||||
} | |||||
} |
/* | |||||
Copyright (c) 2007 Health Market Science, Inc. | |||||
This library is free software; you can redistribute it and/or | |||||
modify it under the terms of the GNU Lesser General Public | |||||
License as published by the Free Software Foundation; either | |||||
version 2.1 of the License, or (at your option) any later version. | |||||
This library is distributed in the hope that it will be useful, | |||||
but WITHOUT ANY WARRANTY; without even the implied warranty of | |||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |||||
Lesser General Public License for more details. | |||||
You should have received a copy of the GNU Lesser General Public | |||||
License along with this library; if not, write to the Free Software | |||||
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 | |||||
USA | |||||
You can contact Health Market Science at info@healthmarketscience.com | |||||
or at the following address: | |||||
Health Market Science | |||||
2700 Horizon Drive | |||||
Suite 200 | |||||
King of Prussia, PA 19406 | |||||
*/ | |||||
package com.healthmarketscience.jackcess.scsu; | |||||
import junit.framework.TestCase; | |||||
/** | |||||
* @author James Ahlborn | |||||
*/ | |||||
public class CompressTest extends TestCase | |||||
{ | |||||
public CompressTest(String name) throws Exception { | |||||
super(name); | |||||
} | |||||
public void testCompression() throws Exception | |||||
{ | |||||
CompressMain.suiteTest(); | |||||
} | |||||
} |