浏览代码

Add compression code for possible future use; add compression unit

        tests.

git-svn-id: https://svn.code.sf.net/p/jackcess/code/jackcess/trunk@365 f203690c-595d-4dc9-a70b-905162fa7fd2
tags/rel_1_1_16
James Ahlborn 16 年前
父节点
当前提交
4af4fe4451

+ 4
- 0
src/changes/changes.xml 查看文件

@@ -25,6 +25,10 @@
<action dev="jahlborn" type="add">
Add primitive support for writing unicode compressed text columns.
</action>
<action dev="jahlborn" type="add">
Add compression code for possible future use; add compression unit
tests.
</action>
</release>
<release version="1.1.15" date="2008-06-27">
<action dev="jahlborn" type="fix" issue="1998225">

+ 2
- 11
src/java/com/healthmarketscience/jackcess/Column.java 查看文件

@@ -41,6 +41,7 @@ import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import com.healthmarketscience.jackcess.scsu.Compress;
import com.healthmarketscience.jackcess.scsu.EndOfInputException;
import com.healthmarketscience.jackcess.scsu.Expand;
import com.healthmarketscience.jackcess.scsu.IllegalInputException;
@@ -1196,23 +1197,13 @@ public class Column implements Comparable<Column> {
// now, see if it is all printable ASCII
for(int i = 0; i < text.length(); ++i) {
char c = text.charAt(i);
if(!isAsciiCrLfOrTab(c)) {
if(!Compress.isAsciiCrLfOrTab(c)) {
return false;
}
}
return true;
}

/**
* Returns true if the character is ASCII, but not a control other than
* CR, LF and TAB
*/
private static boolean isAsciiCrLfOrTab(int ch)
{
return ((ch >= 0x20 && ch <= 0x7F) // ASCII (non control)
|| ch == 0x09 || ch == 0x0A || ch == 0x0D); // CR/LF or TAB
}

@Override
public String toString() {
StringBuilder rtn = new StringBuilder();

+ 628
- 0
src/java/com/healthmarketscience/jackcess/scsu/Compress.java 查看文件

@@ -0,0 +1,628 @@
package com.healthmarketscience.jackcess.scsu;

/**
* This sample software accompanies Unicode Technical Report #6 and
* distributed as is by Unicode, Inc., subject to the following:
*
* Copyright 1996-1997 Unicode, Inc.. All Rights Reserved.
*
* Permission to use, copy, modify, and distribute this software
* without fee is hereby granted provided that this copyright notice
* appears in all copies.
*
* UNICODE, INC. MAKES NO REPRESENTATIONS OR WARRANTIES ABOUT THE
* SUITABILITY OF THE SOFTWARE, EITHER EXPRESS OR IMPLIED, INCLUDING
* BUT NOT LIMITED TO THE IMPLIED WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT.
* UNICODE, INC., SHALL NOT BE LIABLE FOR ANY ERRORS OR OMISSIONS, AND
* SHALL NOT BE LIABLE FOR ANY DAMAGES, INCLUDING CONSEQUENTIAL AND
* INCIDENTAL DAMAGES, SUFFERED BY YOU AS A RESULT OF USING, MODIFYING
* OR DISTRIBUTING THIS SOFTWARE OR ITS DERIVATIVES.
*
* @author Asmus Freytag
*
* @version 001 Dec 25 1996
* @version 002 Jun 25 1997
* @version 003 Jul 25 1997
* @version 004 Aug 25 1997
*
* Unicode and the Unicode logo are trademarks of Unicode, Inc.,
* and are registered in some jurisdictions.
**/

/**
This class implements a simple SCSU (Standard Compression Scheme for
Unicode, Unicode Technical Report #6) compressor
**/
/*
Note on exception handling
This compressor is designed so that it can be restarted after
an exception. All operations advancing input and/or output cursor
(iIn and iOut) either complete an action, or set a state (fUnicodeMode)
before updating the cursors.
*/
public class Compress extends SCSU
{

/** index into the input array of the next input character to be read **/
private int iIn;

/** index into the output array of the next output byte to be written **/
private int iOut;

/** index in the output array of the SCU command byte that opened the pending
Unicode run (so it can later be patched to SQU), or -1 if no run is pending **/
private int iSCU = -1;

/** true if the next command byte is of the Uxx family */
private boolean fUnicodeMode = false;

/** Find a window in a table of offsets that covers a character.
    The currently selected window is checked first and, if it covers the
    character, the selection is left untouched. Otherwise the table is
    scanned from index 0 and the first covering window becomes the
    selected window.
    @param ch - character
    @param offsetTable - table of window offsets
    @return true if the character fits a window from the table of windows */
private boolean locateWindow(int ch, int[] offsetTable)
{
    // prefer the window that is already active
    int current = getCurrentWindow();
    if (current != -1)
    {
        int base = offsetTable[current];
        if (ch >= base && ch < base + 0x80)
        {
            return true;
        }
    }

    // otherwise take the first window (in table order) that covers ch
    for (int candidate = 0; candidate < offsetTable.length; candidate++)
    {
        int base = offsetTable[candidate];
        if (ch < base || ch >= base + 0x80)
        {
            continue;
        }
        selectWindow(candidate);
        return true;
    }

    // no window covers the character
    return false;
}

/** Tests whether a character can be passed through unchanged as ASCII.
    Accepts 0x20 through 0x7F (note that this includes DEL, 0x7F) plus the
    three control characters TAB (0x09), LF (0x0A) and CR (0x0D).
    @param ch - character value to test
    @return true if ch is one of the accepted values */
public static boolean isAsciiCrLfOrTab(int ch)
{
    switch (ch)
    {
    case 0x09: // TAB
    case 0x0A: // LF
    case 0x0D: // CR
        return true;
    default:
        return ch >= 0x20 && ch <= 0x7F;
    }
}

/** Emit a run of characters in single byte mode.
    ASCII letters, NUL, CR, LF and TAB are copied through as one byte; any
    other control code below 0x20 is written as SQ0 followed by the code;
    characters covered by the dynamic window that was current on entry are
    written as one byte in the range 0x80-0xFF. The run stops at the first
    character that fits none of these cases. Surrogate pairs are validated
    and combined into one extended character value.
    Uses and updates the input and output cursors stored in the instance
    variables <i>iIn</i> and <i>iOut</i>; the cursors are only advanced
    after a character has been completely written.
    @param in - input character array
    @param out - output byte array
    @return the next character to be processed (this may be an extended
            character), or 0 if all input was used up
    @throws EndOfOutputException if the output array is too full to continue
    @throws EndOfInputException if the input ends after a high surrogate
    @throws IllegalInputException if a surrogate is not properly paired
**/
public int outputSingleByteRun(char [] in, byte [] out)
throws EndOfOutputException, EndOfInputException, IllegalInputException
{
    final int window = getCurrentWindow();

    while (iIn < in.length)
    {
        int ch = in[iIn];
        int consumed = 1;

        // validate and combine surrogate pairs
        if ((ch & 0xF800) == 0xD800)
        {
            if ((ch & 0xFC00) == 0xDC00)
            {
                // a low surrogate may not come first
                throw new IllegalInputException("Unpaired low surrogate: "+iIn);
            }
            if (iIn >= in.length-1)
            {
                // high surrogate is the last input character
                throw new EndOfInputException();
            }
            int low = in[iIn+1];
            if ((low & 0xFC00) != 0xDC00)
            {
                // high surrogate not followed by a low surrogate
                throw new IllegalInputException("Unpaired high surrogate: "+(iIn+1));
            }
            ch = ((ch - 0xD800)<<10 | (low-0xDC00))+0x10000;
            consumed = 2;
        }

        // decide how many bytes this character takes in single byte mode;
        // zero means it cannot be written in this mode
        int needed;
        byte prefix = 0;
        byte payload = 0;
        if (isAsciiCrLfOrTab(ch) || ch == 0)
        {
            // passed through directly
            payload = (byte)(ch & 0x7F);
            needed = 1;
        }
        else if (ch < 0x20)
        {
            // remaining control codes must be quoted
            prefix = SQ0;
            payload = (byte)(ch);
            needed = 2;
        }
        else if (ch >= dynamicOffset[window] && ch < dynamicOffset[window] + 0x80)
        {
            // covered by the current dynamic window
            payload = (byte)((ch - dynamicOffset[window]) | 0x80);
            needed = 1;
        }
        else
        {
            needed = 0;
        }

        // check for room in the output array (also done when nothing is
        // written, exactly as the run's caller expects)
        if (iOut + needed >= out.length)
        {
            throw new EndOfOutputException();
        }

        if (needed == 0)
        {
            // some other compression mode is required for this character
            return ch; // input not finished
        }
        if (needed == 2)
        {
            out[iOut++] = prefix;
        }
        out[iOut++] = payload;

        // advance input pointer only after the output is complete
        iIn += consumed;
    }
    return 0; // input all used up
}

/** Quote one character in single byte mode.
    Quoting (a 'non-locking shift') gives efficient access to characters
    that occur in isolation, usually punctuation. The command byte is
    SQ0 plus the index of the currently selected window. A character from
    that dynamic window is written as 0x80-0xFF; a character from the
    static window with the same index is written as 0x00-0x7F.
    Advances the input cursor by one character.
    @param ch - character to be quoted
    @param out - output byte array
    @throws EndOfOutputException if the output array is too full
    @throws IllegalStateException if ch fits neither window
**/
private void quoteSingleByte(int ch, byte [] out)
throws EndOfOutputException
{
    Debug.out("Quoting SingleByte ", ch);
    final int window = getCurrentWindow();

    // check for room in the output array
    if (iOut >= out.length -2)
    {
        throw new EndOfOutputException();
    }

    // command byte first, then the quoted character
    out[iOut++] = (byte)(SQ0 + window);

    final int dynamicBase = dynamicOffset[window];
    if (ch >= dynamicBase && ch < dynamicBase + 0x80)
    {
        // fits the dynamic window: upper half of the byte range
        out[iOut++] = (byte)((ch - dynamicBase) | 0x80);
    }
    else
    {
        final int staticBase = staticOffset[window];
        if (ch >= staticBase && ch < staticBase + 0x80)
        {
            // fits the static window: lower half of the byte range
            out[iOut++] = (byte)(ch - staticBase);
        }
        else
        {
            throw new IllegalStateException("ch = "+ch+" not valid in quoteSingleByte. Internal Compressor Error");
        }
    }

    // advance input pointer
    iIn ++;
    Debug.out("New input: ", iIn);
}

/** Emit a run of characters in Unicode mode.
    Every character is written as two bytes, most significant byte first.
    The run continues over non-compressible characters and over isolated
    compressible ones; it stops as soon as two compressible characters
    appear in a row. Compressible characters in the range 0xE000-0xF2FF
    are preceded by UQU so that they cannot be mistaken for Unicode mode
    command codes.
    Uses and updates the input and output cursors stored in the instance
    variables <i>iIn</i> and <i>iOut</i>.
    NOTE: surrogates are passed through; unlike single byte mode, no check
    for unpaired surrogates is made here.
    @param in - input character array
    @param out - output byte array
    @return the character at which the run stopped, or the last character
            written if the input was used up (0 if nothing was read)
    @throws EndOfOutputException if the output array is too full
**/
public char outputUnicodeRun(char [] in, byte [] out)
throws EndOfOutputException
{
    char ch = 0;

    while (iIn < in.length)
    {
        ch = in[iIn];
        boolean needsQuote = false;

        if (isCompressible(ch))
        {
            // two compressible characters in a row end the run
            if (iIn < in.length - 1)
            {
                Debug.out("is-comp: ",ch);
                char next = in[iIn + 1];
                if (isCompressible(next))
                {
                    break;
                }
                Debug.out("no-comp: ",next);
            }
            // an isolated compressible character stays in the run, but
            // must be escaped if it collides with the command code range
            needsQuote = (ch >= 0xE000 && ch <= 0xF2FF);
        }

        int needed = needsQuote ? 3 : 2;

        // make sure the character fits into the output array
        if (iOut >= out.length - needed)
        {
            Debug.out("End of Output @", iOut);
            throw new EndOfOutputException();
        }

        if (needsQuote)
        {
            out[iOut++] = UQU;
        }
        // MSB, LSB order
        out[iOut++] = (byte)(ch >>> 8);
        out[iOut++] = (byte)(ch & 0xFF);

        iIn++;
    }

    return ch;
}

/** counter selecting the dynamic window slot that the next window definition
    overwrites (slot = counter modulo 8); static, hence shared by all instances */
static int iNextWindow = 3;

/** Redefine a dynamic window so that it covers a given character and make
    it the selected window.
    The slot to overwrite is chosen round-robin from <i>iNextWindow</i>.
    Characters covered by one of the fixed offsets (except the entry at
    index 0, which is never matched here) use the fixed window position
    codes, other BMP characters outside 0x3400-0xDFFF use a computed
    position, and characters above 0xFFFF use the three byte extended
    definition command.
    No state is changed unless the complete command fits into the output
    array, so the call can be repeated after an EndOfOutputException.
    @return true if a window was successfully defined, false if no window
            can be placed around ch (0x3400-0xDFFF)
    @param ch - character around which window is positioned
    @param out - output byte array
    @param fCurUnicodeMode - true to emit the Unicode mode form of the command
    @throws EndOfOutputException if the command does not fit into out
    @throws IllegalStateException if ch is below 0x80
**/
private boolean positionWindow(int ch, byte [] out, boolean fCurUnicodeMode)
throws IllegalInputException, EndOfOutputException
{
    int iWin = iNextWindow % 8; // simple round-robin slot choice
    int iPosition = 0;
    int newOffset;              // offset the slot gets once the command is written
    boolean fExtended = false;  // true if the extended (3 byte) command is required

    // iPosition 0 is a reserved value
    if (ch < 0x80)
    {
        throw new IllegalStateException("ch < 0x80");
    }

    // Check the fixed offsets
    for (int i = 0; i < fixedOffset.length; i++)
    {
        if (ch >= fixedOffset[i] && ch < fixedOffset[i] + 0x80)
        {
            iPosition = i;
            break;
        }
    }

    if (iPosition != 0)
    {
        Debug.out("FIXED position is ", iPosition + 0xF9);

        // ch fits in a fixed offset window position
        newOffset = fixedOffset[iPosition];
        iPosition += 0xF9;
    }
    else if (ch < 0x3400)
    {
        // calculate a window position command and the offset
        iPosition = ch >>> 7;
        newOffset = ch & 0xFF80;

        Debug.out("Offset="+newOffset+", iPosition="+iPosition+" for char", ch);
    }
    else if (ch < 0xE000)
    {
        // attempt to place a window where none can go
        return false;
    }
    else if (ch <= 0xFFFF)
    {
        // calculate a window position command, accounting
        // for the gap in position values, and the offset
        iPosition = ((ch - gapOffset)>>> 7);
        newOffset = ch & 0xFF80;

        Debug.out("Offset="+newOffset+", iPosition="+iPosition+" for char", ch);
    }
    else
    {
        // the character is in the extended range: the command carries the
        // slot index in the top 3 bits and the position in the low 13 bits
        fExtended = true;
        iPosition = (ch - 0x10000) >>> 7;
        Debug.out("Try position Window at ", iPosition);

        iPosition |= iWin << 13;
        newOffset = ch & 0x1FFF80;
    }

    // The command form is chosen by the kind of window, not by the numeric
    // value of iPosition: for slot 0 an extended position can be below 0x100
    // and would otherwise be written as an ordinary (wrong) definition.
    int needed = fExtended ? 3 : 2;
    if (iOut + needed > out.length)
    {
        // nothing has been modified yet, so the caller may simply retry
        throw new EndOfOutputException();
    }

    if (fExtended)
    {
        Debug.out("Setting extended window at ", iPosition);
        out[iOut++] = (fCurUnicodeMode ? UDX : SDX);
        out[iOut++] = (byte) ((iPosition >>> 8) & 0xFF);
        out[iOut++] = (byte) (iPosition & 0xFF);
    }
    else
    {
        out[iOut++] = (byte) ((fCurUnicodeMode ? UD0 : SD0) + iWin);
        out[iOut++] = (byte) (iPosition & 0xFF);
    }

    // commit the new window only after its definition has been written
    dynamicOffset[iWin] = newOffset;
    selectWindow(iWin);
    iNextWindow++;
    return true;
}

/**
compress a Unicode character array with some simplifying assumptions.
Sets the input and output cursors to the given start positions, then
alternates between single byte runs, quoted characters, window selection
or definition commands and Unicode runs until the input is used up.
The pending Unicode run marker (iSCU) and the Unicode mode flag are
instance state that is not reinitialized here, so after an
EndOfOutputException the call can be repeated with the current cursor
values and a larger copy of the output array.
@param in - input character array
@param iStartIn - index of the first input character to process
@param out - output byte array
@param iStartOut - index at which the first output byte is written
@return the number of bytes written by this call
**/
public int simpleCompress(char [] in, int iStartIn, byte[] out, int iStartOut)
throws IllegalInputException, EndOfInputException, EndOfOutputException
{
iIn = iStartIn;
iOut = iStartOut;


while (iIn < in.length)
{
int ch;

// previously we switched to a Unicode run
if (iSCU != -1)
{

Debug.out("Remaining", in, iIn);
Debug.out("Output until ["+iOut+"]: ", out);

// output characters as Unicode
ch = outputUnicodeRun(in, out);

// for single character Unicode runs (3 bytes) use quote
if (iOut - iSCU == 3 )
{
// go back and fix up the SCU to an SQU instead
out[iSCU] = SQU;
iSCU = -1;
// SQU does not change the mode, so fUnicodeMode is left untouched
continue;
}
else
{
iSCU = -1;
fUnicodeMode = true;
}
}
// next, try to output characters as single byte run
else
{
ch = outputSingleByteRun(in, out);
}

// check whether we still have input
if (iIn == in.length)
{
break; // no more input
}

// if we get here, we have a consistent value for ch, whether or
// not it is an regular or extended character. Locate or define a
// Window for the current character

Debug.out("Output so far: ", out);
Debug.out("Routing ch="+ch+" for Input", in, iIn);

// Check that we have enough room to output the command byte
if (iOut >= out.length - 1)
{
throw new EndOfOutputException();
}

// In order to switch away from Unicode mode, it is necessary
// to select (or define) a window. If the characters that follow
// the Unicode range are ASCII characters, we can't use them
// to decide which window to select, since ASCII characters don't
// influence window settings. This loop looks ahead until it finds
// one compressible character that isn't in the ASCII range.
for (int ich = iIn; ch < 0x80; ich++)
{
if (ich == in.length || !isCompressible(in[ich]))
{
// if there are only ASCII characters left,
ch = in[iIn];
break;
}
ch = in[ich]; // lookahead for next non-ASCII char
}
// The character value contained in ch here will only be used to select
// output modes. Actual output of characters starts with in[iIn] and
// only takes place near the top of the loop.

// remembered so that a quote (non-locking shift) can restore the selection
int iprevWindow = getCurrentWindow();

// try to locate a dynamic window
if (ch < 0x80 || locateWindow(ch, dynamicOffset))
{
Debug.out("located dynamic window "+getCurrentWindow()+" at ", iOut+1);
// lookahead to use SQn instead of SCn for single
// character interruptions of runs in current window
if(!fUnicodeMode && iIn < in.length -1)
{
char ch2 = in[iIn+1];
if (ch2 >= dynamicOffset[iprevWindow] &&
ch2 < dynamicOffset[iprevWindow] + 0x80)
{
quoteSingleByte(ch, out);
selectWindow(iprevWindow);
continue;
}
}

// locking shift: select the located window (UCn from Unicode mode, SCn otherwise)
out[iOut++] = (byte)((fUnicodeMode ? UC0 : SC0) + getCurrentWindow());
fUnicodeMode = false;
}
// try to locate a static window
else if (!fUnicodeMode && locateWindow(ch, staticOffset))
{
// static windows are not accessible from Unicode mode
Debug.out("located a static window", getCurrentWindow());
quoteSingleByte(ch, out);
selectWindow(iprevWindow); // restore current Window settings
continue;
}
// try to define a window around ch
else if (positionWindow(ch, out, fUnicodeMode) )
{
fUnicodeMode = false;
}
// If all else fails, start a Unicode run
else
{
// remember where the SCU byte is, in case it must be patched to SQU
iSCU = iOut;
out[iOut++] = SCU;
continue;
}
}

return iOut - iStartOut;
}

/** Compress a complete string.
    The output buffer starts at twice the input length and is doubled
    whenever the compressor runs out of room; compression then continues
    from the position that was reached.
    The compressor state and both cursors are reinitialized first, so the
    same instance can be used for several strings.
    @param inStr - string to compress
    @return a new array holding exactly the compressed bytes
    @throws IllegalInputException if the input contains an unpaired surrogate
    @throws EndOfInputException if the input ends after a high surrogate
**/
public byte[] compress(String inStr)
throws IllegalInputException, EndOfInputException
{
    // Running out of room for output can cause non-optimal
    // compression. In order to not slow down compression too
    // much, not all intermediate state is constantly saved.

    byte [] out = new byte[inStr.length() * 2];
    char [] in = inStr.toCharArray();
    Debug.out("compress input: ",in);

    reset();
    // reset() leaves the cursors alone; without clearing them a second call
    // on the same instance would start in the middle of the new input and
    // output arrays
    iIn = 0;
    iOut = 0;

    while(true)
    {
        try
        {
            simpleCompress(in, charsRead(), out, bytesWritten());
            // if we get here things went fine.
            break;
        }
        catch (EndOfOutputException e)
        {
            // create a larger output buffer and continue where we stopped
            byte [] largerOut = new byte[out.length * 2];
            System.arraycopy(out, 0, largerOut, 0, out.length);
            out = largerOut;
        }
    }

    // trim the buffer to the bytes actually produced
    byte [] trimmedOut = new byte[bytesWritten()];
    System.arraycopy(out, 0, trimmedOut, 0, trimmedOut.length);
    out = trimmedOut;

    Debug.out("compress output: ", out);
    return out;
}

/** Return the compressor to its initial mode: resets the inherited state,
    forgets any pending Unicode run and leaves Unicode mode. Only needed to
    bail out after an exception and restart with new input. The input and
    output cursors are not modified here. */
@Override
public void reset()
{
    super.reset();
    iSCU = -1;
    fUnicodeMode = false;
}

/** returns the output cursor, i.e. the index just past the last byte
written; this equals the number of bytes written when output started
at index 0 **/
public int bytesWritten()
{
return iOut;
}

/** returns the input cursor, i.e. the index of the next character to be
read; this equals the number of characters read when input started at
index 0 **/
public int charsRead()
{
return iIn;
}

}

+ 48
- 0
src/java/com/healthmarketscience/jackcess/scsu/EndOfOutputException.java 查看文件

@@ -0,0 +1,48 @@
package com.healthmarketscience.jackcess.scsu;

/**
* This sample software accompanies Unicode Technical Report #6 and
* distributed as is by Unicode, Inc., subject to the following:
*
* Copyright 1996-1997 Unicode, Inc.. All Rights Reserved.
*
* Permission to use, copy, modify, and distribute this software
* without fee is hereby granted provided that this copyright notice
* appears in all copies.
*
* UNICODE, INC. MAKES NO REPRESENTATIONS OR WARRANTIES ABOUT THE
* SUITABILITY OF THE SOFTWARE, EITHER EXPRESS OR IMPLIED, INCLUDING
* BUT NOT LIMITED TO THE IMPLIED WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT.
* UNICODE, INC., SHALL NOT BE LIABLE FOR ANY ERRORS OR OMISSIONS, AND
* SHALL NOT BE LIABLE FOR ANY DAMAGES, INCLUDING CONSEQUENTIAL AND
* INCIDENTAL DAMAGES, SUFFERED BY YOU AS A RESULT OF USING, MODIFYING
* OR DISTRIBUTING THIS SOFTWARE OR ITS DERIVATIVES.
*
* @author Asmus Freytag
*
* @version 001 Dec 25 1996
* @version 002 Jun 25 1997
* @version 003 Jul 25 1997
*
* Unicode and the Unicode logo are trademarks of Unicode, Inc.,
* and are registered in some jurisdictions.
**/
/**
 * The output array is too small to take the data that still has to be
 * written; the caller may retry with a larger output array.
 */
public class EndOfOutputException
    extends java.lang.Exception
{

    private static final long serialVersionUID = 1L;

    /** Creates the exception with a default message describing the condition. */
    public EndOfOutputException() {
        super("The output array ended prematurely");
    }

    /**
     * Creates the exception with a caller supplied message.
     * @param s the detail message
     */
    public EndOfOutputException(String s) {
        super(s);
    }
}

+ 1
- 0
src/java/com/healthmarketscience/jackcess/scsu/Expand.java 查看文件

@@ -411,6 +411,7 @@ public class Expand extends SCSU

/** reset is called to start with new input, w/o creating a new
instance */
@Override
public void reset()
{
iOut = 0;

+ 574
- 0
test/src/java/com/healthmarketscience/jackcess/scsu/CompressMain.java 查看文件

@@ -0,0 +1,574 @@
package com.healthmarketscience.jackcess.scsu;

import java.io.*;
import java.util.*;

/**
* This sample software accompanies Unicode Technical Report #6 and
* distributed as is by Unicode, Inc., subject to the following:
*
* Copyright 1996-1998 Unicode, Inc.. All Rights Reserved.
*
* Permission to use, copy, modify, and distribute this software
* without fee is hereby granted provided that this copyright notice
* appears in all copies.
*
* UNICODE, INC. MAKES NO REPRESENTATIONS OR WARRANTIES ABOUT THE
* SUITABILITY OF THE SOFTWARE, EITHER EXPRESS OR IMPLIED, INCLUDING
* BUT NOT LIMITED TO THE IMPLIED WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT.
* UNICODE, INC., SHALL NOT BE LIABLE FOR ANY ERRORS OR OMISSIONS, AND
* SHALL NOT BE LIABLE FOR ANY DAMAGES, INCLUDING CONSEQUENTIAL AND
* INCIDENTAL DAMAGES, SUFFERED BY YOU AS A RESULT OF USING, MODIFYING
* OR DISTRIBUTING THIS SOFTWARE OR ITS DERIVATIVES.
*
* @author Asmus Freytag
*
* @version 001 Dec 25 1996
* @version 002 Jun 25 1997
* @version 003 Jul 25 1997
* @version 004 Aug 25 1997
* @version 005 Sep 30 1998
*
* Unicode and the Unicode logo are trademarks of Unicode, Inc.,
* and are registered in some jurisdictions.
**/

/**
Class CompressMain

A small commandline driver interface for the compression routines
Use the /? to get usage
*/
public class CompressMain
{
/** Prints the command line usage to standard error, one option per line,
    each followed by an empty line. */
static void usage()
{
    String[] helpLines = {
        "java CompressMain /? : this usage information\n",
        "java CompressMain /random : random test\n",
        "java CompressMain /suite : suite test\n",
        "java CompressMain /suite <file> : file test (file data may include \\uXXXX)\n",
        "java CompressMain <string> : string test (string may include \\uXXXX)\n",
        "java CompressMain /roundtrip <file>: check Unicode file for roundtrip\n",
        "java CompressMain /compress <file> : compresses Unicode files (no \\uXXXX)\n",
        "java CompressMain /expand <file> : expands into Unicode files\n",
        "java CompressMain /byteswap <files>: swaps byte order of Unicode files\n",
        "java CompressMain /display <files> : like expand, but creates a dump instead\n",
        "java CompressMain /parse <files> : parses \\uXXXX into binary Unicode\n"
    };
    for (int i = 0; i < helpLines.length; i++)
    {
        System.err.println(helpLines[i]);
    }
}

/** Reports the outcome of a compress/expand round trip through Debug.
    On failure with a non-null result the first differing character is
    shown, or the two lengths if one string is a prefix of the other.
    Otherwise the compression statistics are shown.
    @param text - original text
    @param inlength - number of characters read by the compressor
    @param result - expanded text, or null if none was produced
    @param outlength - number of bytes written by the compressor */
static void analyze(String text, int inlength, String result, int outlength)
{
    boolean fSuccess = text.equals(result);
    Debug.out(fSuccess ? "Round trip OK" : "Round trip FAILED");
    if (!fSuccess && result != null)
    {
        int iLim = Math.min(text.length(), result.length());
        int iDiff = 0;
        while (iDiff < iLim && text.charAt(iDiff) == result.charAt(iDiff))
        {
            iDiff++;
        }
        if (iDiff < iLim)
        {
            Debug.out("First Mismatch at "+ iDiff +"=", result.charAt(iDiff) );
            Debug.out("Original character "+ iDiff +"=", text.charAt(iDiff) );
        }
        else
        {
            // the common prefix is identical, so the strings differ in length only
            Debug.out("Length mismatch: original "+text.length()+" chars, result "+result.length()+" chars.");
        }
    }
    else
    {
        // guard both counts: a zero inlength must not cause a division by zero
        int ratio = (outlength == 0 || inlength == 0) ? 0 : (outlength * 50 / inlength);
        Debug.out("Compressed: "+inlength+" chars to "+outlength+" bytes.");
        Debug.out(" Ratio: "+ratio+"%.");
    }
}

/** Compresses and expands a string, dumping input and result through
    Debug; any exception is printed to standard output instead of being
    propagated. The outcome is then passed to analyze.
    @param text - text to round trip */
static void test2(String text)
{
    Debug.out("SCSU:\n");
    Compress compressor = new Compress();
    String result = null;
    try
    {
        byte[] compressed = compressor.compress(text);

        // first expansion is only used for the dump
        result = new Expand().expand(compressed);
        Debug.out("Input: ", text.toCharArray());
        Debug.out("Result: ", result.toCharArray());
        Debug.out("");

        // second expansion, with a fresh expander, is the one analyzed
        result = new Expand().expand(compressed);
    }
    catch (Exception e)
    {
        System.out.println(e);
    }
    analyze(text, compressor.charsRead(), result, compressor.bytesWritten());
}

/** Round trips a string that is expected to compress successfully.
    @param text - text to round trip
    @throws Exception if compression or expansion fails */
static void test(String text) throws Exception
{
    test(text, false);
}

/** Compresses a string and, unless a failure is expected, expands it again
    and passes the outcome to analyze.
    @param text - text to round trip
    @param shouldFail - true if compression is expected to throw
    @throws RuntimeException if shouldFail is true but compression succeeded
    @throws Exception the compression failure if shouldFail is false, or any
            failure of the expansion */
static void test(String text, boolean shouldFail)
throws Exception
{
    Compress compressor = new Compress();

    byte [] compressed;
    try
    {
        compressed = compressor.compress(text);
    }
    catch(Exception e)
    {
        if (shouldFail)
        {
            // the expected failure happened; nothing more to check
            return;
        }
        throw e;
    }

    if (shouldFail)
    {
        throw new RuntimeException("Did not fail");
    }

    String expanded = new Expand().expand(compressed);

    analyze(text, compressor.charsRead(), expanded, compressor.bytesWritten());
}

/** Expands compressed bytes and dumps the resulting characters through
    Debug; any exception is printed to standard output.
    @param input - compressed bytes */
public static void display(byte [] input)
{
    try
    {
        Debug.out(new Expand().expand(input).toCharArray());
    }
    catch (Exception e)
    {
        System.out.println(e);
    }
}

/** Replaces backslash-u escapes (a backslash, the letter u and up to four
    hexadecimal digits) in a string by the character they denote.
    A backslash that is followed by any other text is dropped and the text
    is copied unchanged; a backslash at the very end is kept.
    @param input - text that may contain escapes
    @return the text with the escapes replaced
    @throws NumberFormatException if the digits of an escape are not
            hexadecimal */
public static String parse(String input)
{
    StringTokenizer tokens = new StringTokenizer(input, "\\", true);
    Debug.out("Input: ", input);

    StringBuilder decoded = new StringBuilder();

    while (tokens.hasMoreTokens())
    {
        String piece = tokens.nextToken();
        Debug.out("Token: ", piece);

        boolean loneBackslash = piece.length() == 1 && piece.charAt(0) == '\\';
        if (loneBackslash)
        {
            // the text after the backslash decides what the escape means
            if (tokens.hasMoreTokens())
            {
                piece = tokens.nextToken();
            }
            if (piece.charAt(0) == 'u')
            {
                Debug.out("Token: "+ piece+ " ", decoded.toString());
                // at most four digits belong to the escape
                int hexEnd = piece.length() > 5 ? 5 : piece.length();
                String hexnum = piece.substring(1, hexEnd);
                piece = piece.substring(hexEnd);
                decoded.append((char)Integer.parseInt(hexnum, 16));
            }
        }
        decoded.append(piece);
    }
    return decoded.toString();
}

/** Round trips randomly generated strings of fewer than 20 characters.
    Surrogate code units are never generated: a random surrogate is almost
    always unpaired, which the compressor rejects as illegal input, so the
    test would otherwise fail for reasons unrelated to the compressor.
    @param nTest - number of random strings to test
    @throws Exception if compression or expansion of a string fails */
public static void randomTest(int nTest)
throws Exception
{
    Random random = new Random();

    for (int n = 0; n < nTest; n++)
    {
        int iLen = (int) (20 * random.nextFloat());
        StringBuffer sb = new StringBuffer(iLen);

        while (sb.length() < iLen)
        {
            char c = (char) (0xFFFF * random.nextFloat());
            if ((c & 0xF800) == 0xD800)
            {
                // high or low surrogate code unit: draw again
                continue;
            }
            sb.append(c);
        }

        test(sb.toString());
    }
}

/** Round trips every line of a file; each line may contain backslash-u
    escapes, which are decoded with parse before the test.
    The file is closed when the method returns or fails.
    @param name - name of the file to read
    @throws Exception if the file cannot be read or a line fails */
@SuppressWarnings("deprecation")
public static void fileTest(String name)
throws Exception
{
    DataInputStream dis = new DataInputStream(new FileInputStream(name));
    try
    {
        int iLine = 0;

        while (dis.available() != 0)
        {
            String line = dis.readLine();
            Debug.out("Line "+ iLine++ +" "+line);
            test(parse(line), false );
        }
    }
    finally
    {
        dis.close();
    }
}

/** Reads a compressed file completely and dumps its expansion with display.
    The file is closed before the data is processed.
    @param name - name of the compressed file
    @throws IOException if the file cannot be read */
public static void displayFile(String name)
throws IOException
{
    DataInputStream dis = new DataInputStream(new FileInputStream(name));
    byte bytes[];
    try
    {
        bytes = new byte[dis.available()];
        // readFully: a plain read may return before the array is filled
        dis.readFully(bytes);
    }
    finally
    {
        dis.close();
    }
    display(bytes);
}

/** Expands a compressed file and writes the characters to a Unicode file
    with the same name and the extension "txt". If the name has no
    extension, ".txt" is appended to it. An expansion failure is printed
    to standard output and no file is written.
    @param name - name of the compressed file
    @throws IOException if the input cannot be read or the output cannot
            be written */
public static void decodeTest(String name)
throws IOException
{
    DataInputStream dis = new DataInputStream(new FileInputStream(name));
    byte bytes[];
    try
    {
        bytes = new byte[dis.available()];
        // readFully: a plain read may return before the array is filled
        dis.readFully(bytes);
    }
    finally
    {
        dis.close();
    }

    Expand expand = new Expand();

    char [] chars = null;
    try
    {
        String text = expand.expand(bytes);
        chars = text.toCharArray();
    }
    catch (Exception e)
    {
        System.out.println(e);
    }
    int inlength = expand.bytesRead();

    // replace the extension, or add one if the name has none
    int iDot = name.lastIndexOf('.');
    String outName = (iDot < 0 ? name + "." : name.substring(0, iDot + 1)) + "txt";

    int outlength = expand.charsWritten();

    Debug.out("Expanded "+name+": "+inlength+" bytes to "+outName+" " +outlength+" chars." + " Ratio: "+((outlength == 0 || inlength == 0) ? 0 :(outlength * 200 / inlength))+"%.");

    if (chars == null)
        return;

    writeUnicodeFile(outName, chars);
}

/** byte order state shared by readUnicodeFile and writeUnicodeFile: set to 1
    at the start of every read and incremented when a byte swapped byte order
    mark is found. (Most of the next 3 functions should not be needed by
    JDK 1.1 and later.) */
private static int iMSB = 1;

/** Reads a file of 16 bit Unicode characters into a string.
    A leading byte order mark is removed; if it is byte swapped, the byte
    order of all following characters is flipped. A trailing odd byte is
    ignored. The file is closed when the method returns.
    @param name - name of the file to read
    @return the text of the file, or the empty string if an I/O error
            occurred (the error is printed to standard error) */
public static String readUnicodeFile(String name)
{
    try
    {
        FileInputStream dis = new FileInputStream(name);
        try
        {
            byte b[] = new byte[2];
            StringBuffer sb = new StringBuffer();
            char ch = 0;

            iMSB = 1;
            int i = 0;
            for(i = 0; (dis.available() != 0); i++)
            {
                b[i%2] = (byte) dis.read();

                if ((i & 1) == 1)
                {
                    // second byte of a pair: combine according to byte order
                    ch = Expand.charFromTwoBytes(b[(i + iMSB)%2], b[(i + iMSB + 1) % 2]);
                }
                else
                {
                    continue;
                }
                if (i == 1 && ch == '\uFEFF')
                    continue; // throw away byte order mark

                if (i == 1 && ch == '\uFFFE')
                {
                    iMSB ++; // flip byte order
                    continue; // throw away byte order mark
                }
                sb.append(ch);
            }

            return sb.toString();
        }
        finally
        {
            dis.close();
        }
    }
    catch (IOException e)
    {
        System.err.println(e);
        return "";
    }
}

/** Writes characters to a file as 16 bit Unicode, preceded by a byte order
    mark. The byte order depends on the shared iMSB state: when it is odd
    the low byte of each character is written first, otherwise the high
    byte. The file is closed when the method returns or fails.
    @param outName - name of the file to write
    @param chars - characters to write
    @throws IOException if the file cannot be written */
public static void writeUnicodeFile(String outName, char [] chars)
throws IOException
{
    DataOutputStream dos = new DataOutputStream(new FileOutputStream(outName));
    try
    {
        // byte order mark matching the byte order used below
        if ((iMSB & 1) == 1)
        {
            dos.writeByte(0xFF);
            dos.writeByte(0xFE);
        }
        else
        {
            dos.writeByte(0xFE);
            dos.writeByte(0xFF);
        }
        byte b[] = new byte[2];
        for (int ich = 0; ich < chars.length; ich++)
        {
            b[(iMSB + 0)%2] = (byte) (chars[ich] >>> 8);
            b[(iMSB + 1)%2] = (byte) (chars[ich] & 0xFF);
            dos.write(b, 0, 2);
        }
    }
    finally
    {
        dos.close();
    }
}

/** Rewrites a Unicode file in place by reading it with readUnicodeFile and
    writing the characters back with writeUnicodeFile.
    @param name - name of the file to rewrite
    @throws IOException if the file cannot be written */
static void byteswap(String name)
throws IOException
{
    // the file is read completely before it is opened for writing
    char[] content = readUnicodeFile(name).toCharArray();
    writeUnicodeFile(name, content);
}

/** Converts an ASCII file containing backslash-u escapes into a Unicode
    file, in place. A file that already starts with a byte order mark
    (FE FF or FF FE) is left untouched. Files shorter than two bytes are
    handled as ASCII. The input is closed before the file is rewritten.
    @param name - name of the file to convert
    @throws IOException if the file cannot be read or written */
@SuppressWarnings("deprecation")
public static void parseFile(String name)
throws IOException
{
    DataInputStream dis = new DataInputStream(new FileInputStream(name));
    byte bytes[];
    try
    {
        bytes = new byte[dis.available()];
        // readFully: a plain read may return before the array is filled
        dis.readFully(bytes);
    }
    finally
    {
        dis.close();
    }

    // simplistic test for a byte order mark in either byte order
    if (bytes.length >= 2)
    {
        int b0 = bytes[0] & 0xFF;
        int b1 = bytes[1] & 0xFF;
        if ((b0 == 0xFE && b1 == 0xFF) || (b0 == 0xFF && b1 == 0xFE))
        {
            Debug.out(name + " is already in Unicode!");
            return;
        }
    }

    // definitely assumes an ASCII file at this point
    String text = new String(bytes, 0);

    char chars[] = parse(text).toCharArray();
    writeUnicodeFile(name, chars);
}

/** Compresses a Unicode file and writes the bytes to a file with the same
    name and the extension "csu". If the name has no extension, ".csu" is
    appended to it. The output file is closed when the method returns or
    fails.
    @param name - name of the Unicode file to compress
    @throws Exception if compression fails or the output cannot be written */
public static void encodeTest(String name)
throws Exception
{
    String text = readUnicodeFile(name);

    Compress compressor = new Compress();
    byte [] bytes = compressor.compress(text);

    int inlength = compressor.charsRead();

    // replace the extension, or add one if the name has none
    int iDot = name.lastIndexOf('.');
    String outName = (iDot < 0 ? name + "." : name.substring(0, iDot + 1)) + "csu";

    DataOutputStream dos = new DataOutputStream(new FileOutputStream(outName));
    try
    {
        dos.write(bytes, 0, bytes.length);
    }
    finally
    {
        dos.close();
    }

    int outlength = compressor.bytesWritten();

    Debug.out("Compressed "+name+": "+inlength+" chars to "+outName+" " +outlength+" bytes." + " Ratio: "+((outlength == 0 || inlength == 0) ? 0 :(outlength * 50 / inlength))+"%.");
}

/** Round trips the complete text of a Unicode file.
    @param name - name of the Unicode file
    @throws Exception if compression or expansion fails */
public static void roundtripTest(String name)
throws Exception
{
    String text = readUnicodeFile(name);
    test(text, false);
}

/** Command line entry point. The first argument selects the operation (see
    usage); for the file operations every following argument is processed
    in order. Without a recognized option every argument is treated as a
    test string, processed from the last argument to the first. An
    IOException is printed to standard error; afterwards the program waits
    for input on standard in before it returns.
    @param args - command line arguments
    @throws Exception any failure other than an IOException */
public static void main(String args[])
throws Exception
{
    final int argCount = args.length;

    try
    {
        if (argCount == 0)
        {
            usage();
        }
        else
        {
            final String option = args[0];

            if (option.equalsIgnoreCase("/compress"))
            {
                for (int i = 1; i < argCount; i++)
                {
                    encodeTest(args[i]);
                }
            }
            else if (option.equalsIgnoreCase("/parse"))
            {
                for (int i = 1; i < argCount; i++)
                {
                    parseFile(args[i]);
                }
            }
            else if (option.equalsIgnoreCase("/expand"))
            {
                for (int i = 1; i < argCount; i++)
                {
                    decodeTest(args[i]);
                }
            }
            else if (option.equalsIgnoreCase("/display"))
            {
                for (int i = 1; i < argCount; i++)
                {
                    displayFile(args[i]);
                }
            }
            else if (option.equalsIgnoreCase("/roundtrip"))
            {
                for (int i = 1; i < argCount; i++)
                {
                    roundtripTest(args[i]);
                }
            }
            else if (option.equalsIgnoreCase("/byteswap"))
            {
                for (int i = 1; i < argCount; i++)
                {
                    byteswap(args[i]);
                }
            }
            else if (option.equalsIgnoreCase("/random"))
            {
                randomTest(8);
            }
            else if (option.equalsIgnoreCase("/suite"))
            {
                if (argCount == 1)
                {
                    // no files given: run the built-in suite
                    suiteTest();
                }
                else
                {
                    for (int i = 1; i < argCount; i++)
                    {
                        fileTest(args[i]);
                    }
                }
            }
            else if (option.equalsIgnoreCase("/?"))
            {
                usage();
            }
            else
            {
                // plain strings, last argument first
                for (int i = argCount - 1; i >= 0; i--)
                {
                    test2(parse(args[i]));
                }
            }
        }
    }
    catch (IOException e)
    {
        System.err.println(e);
    }

    try
    {
        System.err.println("Done. Press enter to exit");
        System.in.read();
    }
    catch (IOException ignored)
    {
        // nothing useful can be done if standard in cannot be read
    }
}

/** Runs the standard compression test suite: a list of strings that must
    compress and expand without an exception, followed by strings with
    unpaired surrogates for which compression must fail.
    @throws Exception if a string does not behave as expected */
static void suiteTest()
throws Exception
{
    String[] roundTripCases = {
        "Hello \u9292 \u9192 World!",
        "Hell\u0429o \u9292 \u9192 W\u00e4rld!",
        "Hell\u0429o \u9292 \u9292W\u00e4rld!",

        "\u0648\u06c8", // catch missing reset
        "\u0648\u06c8",

        "\u4444\uE001", // lowest quotable
        "\u4444\uf2FF", // highest quotable
        "\u4444\uf188\u4444",
        "\u4444\uf188\uf288",
        // NOTE(review): "\0429" below is an octal escape (a double quote
        // followed by '9'); an escape for U+0429 may have been intended
        "\u4444\uf188abc\0429\uf288",
        "\u9292\u2222",
        "Hell\u0429\u04230o \u9292 \u9292W\u00e4\u0192rld!",
        "Hell\u0429o \u9292 \u9292W\u00e4rld!",
        "Hello World!123456",
        "Hello W\u0081\u011f\u0082!", // Latin 1 run

        "abc\u0301\u0302", // uses SQn for u301 u302
        "abc\u4411d", // uses SQU
        "abc\u4411\u4412d", // uses SCU
        "abc\u0401\u0402\u047f\u00a5\u0405", // uses SQn for ua5
        "\u9191\u9191\u3041\u9191\u3041\u3041\u3000", // SJIS like data
        "\u9292\u2222",
        "\u9191\u9191\u3041\u9191\u3041\u3041\u3000",
        "\u9999\u3051\u300c\u9999\u9999\u3060\u9999\u3065\u3065\u3065\u300c",
        "\u3000\u266a\u30ea\u30f3\u30b4\u53ef\u611b\u3044\u3084\u53ef\u611b\u3044\u3084\u30ea\u30f3\u30b4\u3002",

        "", // empty input
        "\u0000", // smallest BMP character
        "\uFFFF", // largest BMP character

        "\ud800\udc00", // smallest surrogate
        "\ud8ff\udcff" // largest surrogate pair
    };

    String[] failingCases = {
        "\ud800 \udc00", // unpaired surrogate (1)
        "\udc00", // unpaired surrogate (2)
        "\ud800" // unpaired surrogate (3)
    };

    Debug.out("Standard Compression test suite:");
    for (int i = 0; i < roundTripCases.length; i++)
    {
        test(roundTripCases[i]);
    }

    Debug.out("\nTHESE TESTS ARE SUPPOSED TO FAIL:");
    for (int i = 0; i < failingCases.length; i++)
    {
        test(failingCases[i], true);
    }
}
}

+ 47
- 0
test/src/java/com/healthmarketscience/jackcess/scsu/CompressTest.java 查看文件

@@ -0,0 +1,47 @@
/*
Copyright (c) 2007 Health Market Science, Inc.

This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.

This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.

You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
USA

You can contact Health Market Science at info@healthmarketscience.com
or at the following address:

Health Market Science
2700 Horizon Drive
Suite 200
King of Prussia, PA 19406
*/

package com.healthmarketscience.jackcess.scsu;

import junit.framework.TestCase;

/**
* JUnit wrapper that runs the SCSU compression suite defined in
* CompressMain.suiteTest().
*
* @author James Ahlborn
*/
public class CompressTest extends TestCase
{

/** Creates the test case with the given name, which is passed on to TestCase. */
public CompressTest(String name) throws Exception {
super(name);
}

/** Runs the suite; any exception thrown by it propagates out of this method. */
public void testCompression() throws Exception
{
CompressMain.suiteTest();
}

}

正在加载...
取消
保存