rework unicode compression support, fixes issue 111

author James Ahlborn <jtahlborn@yahoo.com>

Sat, 15 Nov 2014 04:06:17 +0000 (04:06 +0000)

committer James Ahlborn <jtahlborn@yahoo.com>

Sat, 15 Nov 2014 04:06:17 +0000 (04:06 +0000)
author James Ahlborn <jtahlborn@yahoo.com>
Sat, 15 Nov 2014 04:06:17 +0000 (04:06 +0000)
committer James Ahlborn <jtahlborn@yahoo.com>
Sat, 15 Nov 2014 04:06:17 +0000 (04:06 +0000)
diff --git a/pom.xml b/pom.xml

index 1016d38ea46d23fcac5e4c141db5338835cf334a..e24a0acdd63ff40a082c488dee0fd7d56eba84f4 100644 (file)
--- a/pom.xml
+++ b/pom.xml
@@ -160,9 +160,6 @@
          <artifactId>cobertura-maven-plugin</artifactId>
          <configuration>
            <instrumentation>
-            <excludes>
-              <exclude>com/healthmarketscience/jackcess/impl/scsu/**</exclude>
-            </excludes>
            </instrumentation>
          </configuration>
          <executions>
@@ -269,7 +266,6 @@
              <list>http://docs.oracle.com/javaee/5/api/</list>
            </links>
            <source>1.5</source>
-          <excludePackageNames>com.healthmarketscience.jackcess.impl.scsu</excludePackageNames>
            <show>public</show>
            <stylesheetfile>${basedir}/src/site/javadoc/stylesheet.css</stylesheetfile>
            <tags>
diff --git a/src/changes/changes.xml b/src/changes/changes.xml

index 37d98a9b09b01b6b2a3aeb66a4f6d362961a482c..22ab6ae5b362e33b1233b179635d34747e903656 100644 (file)
--- a/src/changes/changes.xml
+++ b/src/changes/changes.xml
@@ -4,6 +4,12 @@
      <author email="javajedi@users.sf.net">Tim McCune</author>
    </properties>
    <body>
+    <release version="2.0.7" date="TBD">
+      <action dev="jahlborn" type="fix" system="SourceForge2" issue="111">
+        Unicode compression support was not correct for all potentially
+        compressible characters.
+      </action>
+    </release>
      <release version="2.0.6" date="2014-10-04">
        <action dev="jahlborn" type="fix" system="SourceForge2" issue="109">
          IndexCursor can early exit when searching based on indexed values.
diff --git a/src/main/java/com/healthmarketscience/jackcess/impl/ColumnImpl.java b/src/main/java/com/healthmarketscience/jackcess/impl/ColumnImpl.java

index 5e3fe88049d5ec3c8905ca2b55cde3a47cf1ae09..224348a2ff29702b9cac0be5ff0bf08348caee3d 100644 (file)
--- a/src/main/java/com/healthmarketscience/jackcess/impl/ColumnImpl.java
+++ b/src/main/java/com/healthmarketscience/jackcess/impl/ColumnImpl.java
@@ -60,10 +60,6 @@ import com.healthmarketscience.jackcess.complex.ComplexColumnInfo;
  import com.healthmarketscience.jackcess.complex.ComplexValue;
  import com.healthmarketscience.jackcess.complex.ComplexValueForeignKey;
  import com.healthmarketscience.jackcess.impl.complex.ComplexValueForeignKeyImpl;
-import com.healthmarketscience.jackcess.impl.scsu.Compress;
-import com.healthmarketscience.jackcess.impl.scsu.EndOfInputException;
-import com.healthmarketscience.jackcess.impl.scsu.Expand;
-import com.healthmarketscience.jackcess.impl.scsu.IllegalInputException;
  import com.healthmarketscience.jackcess.util.ColumnValidator;
  import com.healthmarketscience.jackcess.util.SimpleColumnValidator;
  import org.apache.commons.lang.builder.ToStringBuilder;
@@ -163,6 +159,8 @@ public class ColumnImpl implements Column, Comparable<ColumnImpl> {
    /** header used to indicate unicode text compression */
    private static final byte[] TEXT_COMPRESSION_HEADER = 
    { (byte)0xFF, (byte)0XFE };
+  private static final char MIN_COMPRESS_CHAR = 1; // 0x00 is the mode-toggle byte when decoding, so it is not compressible
+  private static final char MAX_COMPRESS_CHAR = 0xFF; // compressed text stores one byte per char, so 0xFF is the ceiling
  
    
    /** owning table */
@@ -1110,57 +1108,44 @@ public class ColumnImpl implements Column, Comparable<ColumnImpl> {
    String decodeTextValue(byte[] data)
      throws IOException
    {
-    try {
-
-      // see if data is compressed.  the 0xFF, 0xFE sequence indicates that
-      // compression is used (sort of, see algorithm below)
-      boolean isCompressed = ((data.length > 1) &&
-                              (data[0] == TEXT_COMPRESSION_HEADER[0]) &&
-                              (data[1] == TEXT_COMPRESSION_HEADER[1]));
-
-      if(isCompressed) {
+    // see if data is compressed.  the 0xFF, 0xFE sequence indicates that
+    // compression is used (sort of, see algorithm below)
+    boolean isCompressed = ((data.length > 1) &&
+                            (data[0] == TEXT_COMPRESSION_HEADER[0]) &&
+                            (data[1] == TEXT_COMPRESSION_HEADER[1]));
  
-        Expand expander = new Expand();
+    if(isCompressed) {
          
-        // this is a whacky compression combo that switches back and forth
-        // between compressed/uncompressed using a 0x00 byte (starting in
-        // compressed mode)
-        StringBuilder textBuf = new StringBuilder(data.length);
-        // start after two bytes indicating compression use
-        int dataStart = TEXT_COMPRESSION_HEADER.length;
-        int dataEnd = dataStart;
-        boolean inCompressedMode = true;
-        while(dataEnd < data.length) {
-          if(data[dataEnd] == (byte)0x00) {
-
-            // handle current segment
-            decodeTextSegment(data, dataStart, dataEnd, inCompressedMode,
-                              expander, textBuf);
-            inCompressedMode = !inCompressedMode;
-            ++dataEnd;
-            dataStart = dataEnd;
+      // this is a whacky compression combo that switches back and forth
+      // between compressed/uncompressed using a 0x00 byte (starting in
+      // compressed mode)
+      StringBuilder textBuf = new StringBuilder(data.length);
+      // start after two bytes indicating compression use
+      int dataStart = TEXT_COMPRESSION_HEADER.length;
+      int dataEnd = dataStart;
+      boolean inCompressedMode = true;
+      while(dataEnd < data.length) {
+        if(data[dataEnd] == (byte)0x00) {
+
+          // handle current segment
+          decodeTextSegment(data, dataStart, dataEnd, inCompressedMode,
+                            textBuf);
+          inCompressedMode = !inCompressedMode;
+          ++dataEnd;
+          dataStart = dataEnd;
              
-          } else {
-            ++dataEnd;
-          }
+        } else {
+          ++dataEnd;
          }
-        // handle last segment
-        decodeTextSegment(data, dataStart, dataEnd, inCompressedMode,
-                          expander, textBuf);
+      }
+      // handle last segment
+      decodeTextSegment(data, dataStart, dataEnd, inCompressedMode, textBuf);
  
-        return textBuf.toString();
+      return textBuf.toString();
          
-      }
-      
-      return decodeUncompressedText(data, getCharset());
-      
-    } catch (IllegalInputException e) {
-      throw (IOException)
-        new IOException("Can't expand text column").initCause(e);
-    } catch (EndOfInputException e) {
-      throw (IOException)
-        new IOException("Can't expand text column").initCause(e);
      }
+      
+    return decodeUncompressedText(data, getCharset());
    }
  
    /**
@@ -1168,25 +1153,29 @@ public class ColumnImpl implements Column, Comparable<ColumnImpl> {
     * given status of the segment (compressed/uncompressed).
     */
    private void decodeTextSegment(byte[] data, int dataStart, int dataEnd,
-                                 boolean inCompressedMode, Expand expander,
+                                 boolean inCompressedMode, 
                                   StringBuilder textBuf)
-    throws IllegalInputException, EndOfInputException
    {
      if(dataEnd <= dataStart) {
        // no data
        return;
      }
      int dataLength = dataEnd - dataStart;
+
      if(inCompressedMode) {
-      // handle compressed data
-      byte[] tmpData = ByteUtil.copyOf(data, dataStart, dataLength);
-      expander.reset();
-      textBuf.append(expander.expand(tmpData));
-    } else {
-      // handle uncompressed data
-      textBuf.append(decodeUncompressedText(data, dataStart, dataLength,
-                                            getCharset()));
+      byte[] tmpData = new byte[dataLength * 2]; // widen: two output bytes per compressed byte, all initially 0
+      int tmpIdx = 0;
+      for(int i = dataStart; i < dataEnd; ++i) {
+        tmpData[tmpIdx] = data[i]; // compressed byte lands in the even slot
+        tmpIdx += 2; // skip the odd slot, which keeps its default 0
+      } 
+      data = tmpData; // widened copy is decoded below exactly like uncompressed text
+      dataStart = 0; // NOTE(review): presumably getCharset() is a 2-byte little-endian encoding - confirm
+      dataLength = data.length;
      }
+
+    textBuf.append(decodeUncompressedText(data, dataStart, dataLength,
+                                          getCharset()));
    }
  
    /**
@@ -1215,41 +1204,37 @@ public class ColumnImpl implements Column, Comparable<ColumnImpl> {
      
      // may only compress if column type allows it
      if(!forceUncompressed && isCompressedUnicode() &&
-       (text.length() <= getFormat().MAX_COMPRESSED_UNICODE_SIZE)) {
-
-      // for now, only do very simple compression (only compress text which is
-      // all ascii text)
-      if(isAsciiCompressible(text)) {
-
-        byte[] encodedChars = new byte[TEXT_COMPRESSION_HEADER.length + 
-                                       text.length()];
-        encodedChars[0] = TEXT_COMPRESSION_HEADER[0];
-        encodedChars[1] = TEXT_COMPRESSION_HEADER[1];
-        for(int i = 0; i < text.length(); ++i) {
-          encodedChars[i + TEXT_COMPRESSION_HEADER.length] = 
-            (byte)text.charAt(i);
-        }
-        return ByteBuffer.wrap(encodedChars);
+       (text.length() <= getFormat().MAX_COMPRESSED_UNICODE_SIZE) &&
+       isUnicodeCompressible(text)) {
+
+      byte[] encodedChars = new byte[TEXT_COMPRESSION_HEADER.length + 
+                                     text.length()];
+      encodedChars[0] = TEXT_COMPRESSION_HEADER[0];
+      encodedChars[1] = TEXT_COMPRESSION_HEADER[1];
+      for(int i = 0; i < text.length(); ++i) {
+        encodedChars[i + TEXT_COMPRESSION_HEADER.length] = 
+          (byte)text.charAt(i);
        }
+      return ByteBuffer.wrap(encodedChars);
      }
  
      return encodeUncompressedText(text, getCharset());
    }
  
    /**
-   * Returns {@code true} if the given text can be compressed using simple
-   * ASCII encoding, {@code false} otherwise.
+   * Returns {@code true} if the given text can be compressed using compressed
+   * unicode, {@code false} otherwise.
     */
-  private static boolean isAsciiCompressible(CharSequence text) {
+  private static boolean isUnicodeCompressible(CharSequence text) {
      // only attempt to compress > 2 chars (compressing less than 3 chars would
      // not result in a space savings due to the 2 byte compression header)
      if(text.length() <= TEXT_COMPRESSION_HEADER.length) {
        return false;
      }
-    // now, see if it is all printable ASCII
+    // now, see if it is all compressible characters
      for(int i = 0; i < text.length(); ++i) {
        char c = text.charAt(i);
-      if(!Compress.isAsciiCrLfOrTab(c)) {
+      if((c < MIN_COMPRESS_CHAR) || (c > MAX_COMPRESS_CHAR)) {
          return false;
        }
      }
diff --git a/src/main/java/com/healthmarketscience/jackcess/impl/scsu/Compress.java b/src/main/java/com/healthmarketscience/jackcess/impl/scsu/Compress.java

deleted file mode 100644 (file)

index 9428075..0000000
--- a/src/main/java/com/healthmarketscience/jackcess/impl/scsu/Compress.java
+++ /dev/null
@@ -1,628 +0,0 @@
-package com.healthmarketscience.jackcess.impl.scsu;
-
-/**
- * This sample software accompanies Unicode Technical Report #6 and
- * distributed as is by Unicode, Inc., subject to the following:
- *
- * Copyright 1996-1997 Unicode, Inc.. All Rights Reserved.
- *
- * Permission to use, copy, modify, and distribute this software
- * without fee is hereby granted provided that this copyright notice
- * appears in all copies.
- *
- * UNICODE, INC. MAKES NO REPRESENTATIONS OR WARRANTIES ABOUT THE
- * SUITABILITY OF THE SOFTWARE, EITHER EXPRESS OR IMPLIED, INCLUDING
- * BUT NOT LIMITED TO THE IMPLIED WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT.
- * UNICODE, INC., SHALL NOT BE LIABLE FOR ANY ERRORS OR OMISSIONS, AND
- * SHALL NOT BE LIABLE FOR ANY DAMAGES, INCLUDING CONSEQUENTIAL AND
- * INCIDENTAL DAMAGES, SUFFERED BY YOU AS A RESULT OF USING, MODIFYING
- * OR DISTRIBUTING THIS SOFTWARE OR ITS DERIVATIVES.
- *
- *  @author Asmus Freytag
- *
- *  @version 001 Dec 25 1996
- *  @version 002 Jun 25 1997
- *  @version 003 Jul 25 1997
- *  @version 004 Aug 25 1997
- *
- * Unicode and the Unicode logo are trademarks of Unicode, Inc.,
- * and are registered in some jurisdictions.
- **/
-
-/**
-    This class implements a simple compression algorithm
- **/
-/*
-    Note on exception handling
-        This compressor is designed so that it can be restarted after
-        an exception. All operations advancing input and/or output cursor
-        (iIn and iOut) either complete an action, or set a state (fUnicodeMode)
-        before updating the cursors.
-*/
-public class Compress extends SCSU
-{
-
-    /** next input character to be read **/
-    private int iIn;
-
-    /** next output byte to be written **/
-    private int iOut;
-
-    /** start index of Unicode mode in output array, or -1 if in single byte mode **/
-    private int iSCU = -1;
-
-    /** true if the next command byte is of the Uxx family */
-    private boolean fUnicodeMode = false;
-
-    /** locate a window for a character given a table of offsets
-    @param ch - character
-    @param offsetTable - table of window offsets
-    @return true if the character fits a window from the table of windows */
-    private boolean locateWindow(int ch, int[] offsetTable)
-    {
-        // always try the current window first
-        int iWin = getCurrentWindow();
-
-        // if the character fits the current window
-        // just use the current window
-        if (iWin != - 1 && ch >= offsetTable[iWin] && ch < offsetTable[iWin] + 0x80)
-        {
-            return true;
-        }
-
-        // try all windows in order
-        for (iWin = 0; iWin < offsetTable.length; iWin++)
-        {
-            if (ch >= offsetTable[iWin] && ch < offsetTable[iWin] + 0x80)
-            {
-                selectWindow(iWin);
-                return true;
-            }
-        }
-        // none found
-        return false;
-    }
-
-    /** returns true if the character is ASCII, but not a control other than CR, LF and TAB */
-    public static boolean isAsciiCrLfOrTab(int ch)
-    {
-        return    (ch >= 0x20 && ch <= 0x7F)                 // ASCII
-                || ch == 0x09 || ch == 0x0A || ch == 0x0D;   // CR/LF or TAB
-
-    }
-
-    /** output a run of characters in single byte mode
-        In single byte mode pass through characters in the ASCII range, but
-        quote characters overlapping with compression command codes. Runs
-        of characters fitting the current window are output as runs of bytes
-        in the range 0x80-0xFF. Checks for and validates Surrogate Pairs.
-        Uses and updates the current input and output cursors store in
-        the instance variables <i>iIn</i> and <i>iOut</i>.
-        @param in - input character array
-        @param out - output byte array
-        @return the next chaacter to be processed. This may be an extended character.
-    **/
-    @SuppressWarnings("fallthrough")
-    public int outputSingleByteRun(char [] in, byte [] out)
-        throws EndOfOutputException, EndOfInputException, IllegalInputException
-    {
-        int iWin = getCurrentWindow();
-        while(iIn < in.length)
-        {
-            int outlen = 0;
-            byte byte1 = 0;
-            byte byte2 = 0;
-
-            // get the input character
-            int ch = in[iIn];
-
-            int inlen = 1;
-
-            // Check input for Surrogate pair
-            if ( (ch & 0xF800) == 0xD800 )
-            {
-                if ( (ch & 0xFC00) == 0xDC00 )
-                {
-                    // low surrogate out of order
-                    throw new IllegalInputException("Unpaired low surrogate: "+iIn);
-                }
-                else
-                {
-                    // have high surrogate now get low surrogate
-                    if ( iIn >= in.length-1)
-                    {
-                        // premature end of input
-                        throw new EndOfInputException();
-                    }
-                    // get the char
-                    int ch2 = in[iIn+1];
-
-                    // make sure it's a low surrogate
-                    if ( (ch2 & 0xFC00) != 0xDC00 )
-                    {
-                        // a low surrogate was required
-                        throw new IllegalInputException("Unpaired high surrogate: "+(iIn+1));
-                    }
-
-                    // combine the two values
-                    ch = ((ch - 0xD800)<<10 | (ch2-0xDC00))+0x10000;
-                    // ch = ch<<10 + ch2 - 0x36F0000;
-
-                    inlen = 2;
-                 }
-            }
-
-            // ASCII Letter, NUL, CR, LF and TAB are always passed through
-            if (isAsciiCrLfOrTab(ch) || ch == 0)
-            {
-                // pass through directcly
-                byte2 = (byte)(ch & 0x7F);
-                outlen = 1;
-            }
-
-            // All other control codes must be quoted
-            else if (ch < 0x20)
-            {
-                byte1 = SQ0;
-                byte2 = (byte)(ch);
-                outlen = 2;
-            }
-
-            // Letters that fit the current dynamic window
-            else if (ch >= dynamicOffset[iWin] && ch < dynamicOffset[iWin] + 0x80)
-            {
-                ch -= dynamicOffset[iWin];
-                byte2 = (byte)(ch | 0x80);
-                outlen = 1;
-            }
-
-            // check for room in the output array
-            if (iOut + outlen >= out.length)
-            {
-                throw new EndOfOutputException();
-            }
-
-            switch(outlen)
-            {
-                default:
-                    // need to use some other compression mode for this
-                    // character so we terminate this loop
-
-                    return ch; // input not finished
-
-                    // output the characters
-                case 2:
-                    out[iOut++] = byte1;
-                    // fall through
-                case 1:
-                    out[iOut++] = byte2;
-                    break;
-            }
-            // advance input pointer
-            iIn += inlen;
-        }
-        return 0; // input all used up
-    }
-
-    /** quote a single character in single byte mode
-    Quoting a character (aka 'non-locking shift') gives efficient access
-    to characters that occur in isolation--usually punctuation characters.
-    When quoting a character from a dynamic window use 0x80 - 0xFF, when
-    quoting a character from a static window use 0x00-0x7f.
-    @param ch - character to be quoted
-    @param out - output byte array
-    **/
-
-    private void quoteSingleByte(int ch, byte [] out)
-        throws EndOfOutputException
-    {
-        Debug.out("Quoting SingleByte ", ch);
-        int iWin = getCurrentWindow();
-
-        // check for room in the output array
-        if (iOut >= out.length -2)
-        {
-            throw new EndOfOutputException();
-        }
-
-        // Output command byte followed by
-        out[iOut++] = (byte)(SQ0 + iWin);
-
-        // Letter that fits the current dynamic window
-        if (ch >= dynamicOffset[iWin] && ch < dynamicOffset[iWin] + 0x80)
-        {
-            ch -= dynamicOffset[iWin];
-            out[iOut++] = (byte)(ch | 0x80);
-        }
-
-        // Letter that fits the current static window
-        else if (ch >= staticOffset[iWin] && ch < staticOffset[iWin] + 0x80)
-        {
-            ch -= staticOffset[iWin];
-            out[iOut++] = (byte)ch;
-        }
-        else
-        {
-            throw new IllegalStateException("ch = "+ch+" not valid in quoteSingleByte. Internal Compressor Error");
-        }
-        // advance input pointer
-        iIn ++;
-        Debug.out("New input: ", iIn);
-    }
-
-    /** output a run of characters in Unicode mode
-    A run of Unicode mode consists of characters which are all in the
-    range of non-compressible characters or isolated occurrence
-    of any other characters. Characters in the range 0xE00-0xF2FF must
-    be quoted to avoid overlap with the Unicode mode compression command codes.
-    Uses and updates the current input and output cursors store in
-    the instance variables <i>iIn</i> and <i>iOut</i>.
-    NOTE: Characters from surrogate pairs are passed through and unlike single
-    byte mode no checks are made for unpaired surrogate characters.
-    @param in - input character array
-    @param out - output byte array
-    @return the next input character to be processed
-    **/
-    public char outputUnicodeRun(char [] in, byte [] out)
-        throws EndOfOutputException
-    {
-        // current character
-        char ch = 0;
-
-        while(iIn < in.length)
-        {
-            // get current input and set default output length
-            ch = in[iIn];
-            int outlen = 2;
-
-            // Characters in these ranges could potentially be compressed.
-            // We require 2 or more compressible characters to break the run
-            if (isCompressible(ch))
-            {
-                // check whether we can look ahead
-                if( iIn < in.length - 1)
-                {
-                    // DEBUG
-                    Debug.out("is-comp: ",ch);
-                    char ch2 = in[iIn + 1];
-                    if (isCompressible(ch2))
-                    {
-                        // at least 2 characters are compressible
-                        // break the run
-                        break;
-                    }
-                    //DEBUG
-                    Debug.out("no-comp: ",ch2);
-                }
-                // If we get here, the current character is only character
-                // left in the input or it is followed by a non-compressible
-                // character. In neither case do we gain by breaking the
-                // run, so we proceed to output the character.
-                if (ch >= 0xE000 && ch <= 0xF2FF)
-                {
-                    // Characters in this range need to be escaped
-                    outlen = 3;
-                }
-
-            }
-            // check that there is enough room to output the character
-            if(iOut >= out.length - outlen)
-            {
-                // DEBUG
-                Debug.out("End of Output @", iOut);
-                // if we got here, we ran out of space in the output array
-                throw new EndOfOutputException();
-            }
-
-            // output any characters that cannot be compressed,
-            if (outlen == 3)
-            {
-                // output the quote character
-                out[iOut++] = UQU;
-            }
-            // pass the Unicode character in MSB,LSB order
-            out[iOut++] = (byte)(ch >>> 8);
-            out[iOut++] = (byte)(ch & 0xFF);
-
-            // advance input cursor
-            iIn++;
-        }
-
-        // return the last character
-        return ch;
-    }
-
-    static int iNextWindow = 3;
-
-    /** redefine a window so it surrounds a given character value
-        For now, this function uses window 3 exclusively (window 4
-        for extended windows);
-        @return true if a window was successfully defined
-        @param ch - character around which window is positioned
-        @param out - output byte array
-        @param fCurUnicodeMode - type of window
-     **/
-    private boolean positionWindow(int ch, byte [] out, boolean fCurUnicodeMode)
-        throws IllegalInputException, EndOfOutputException
-    {
-        int iWin = iNextWindow % 8; // simple LRU
-        int iPosition = 0;
-
-        // iPosition 0 is a reserved value
-        if (ch < 0x80)
-        {
-            throw new IllegalStateException("ch < 0x80");
-            //return false;
-        }
-
-        // Check the fixed offsets
-        for (int i = 0; i < fixedOffset.length; i++)
-        {
-            if (ch >= fixedOffset[i] && ch < fixedOffset[i] + 0x80)
-            {
-                iPosition = i;
-                break;
-            }
-        }
-
-        if (iPosition != 0)
-        {
-            // DEBUG
-            Debug.out("FIXED position is ", iPosition + 0xF9);
-
-            // ch fits in a fixed offset window position
-            dynamicOffset[iWin] = fixedOffset[iPosition];
-            iPosition += 0xF9;
-        }
-        else if (ch < 0x3400)
-        {
-            // calculate a window position command and set the offset
-            iPosition = ch >>> 7;
-            dynamicOffset[iWin] = ch & 0xFF80;
-
-            Debug.out("Offset="+dynamicOffset[iWin]+", iPosition="+iPosition+" for char", ch);
-        }
-        else if (ch < 0xE000)
-        {
-            // attempt to place a window where none can go
-            return false;
-        }
-        else if (ch <= 0xFFFF)
-        {
-            // calculate a window position command, accounting
-            // for the gap in position values, and set the offset
-            iPosition =  ((ch - gapOffset)>>> 7);
-
-            dynamicOffset[iWin] = ch & 0xFF80;
-
-            Debug.out("Offset="+dynamicOffset[iWin]+", iPosition="+iPosition+" for char", ch);
-        }
-        else
-        {
-            // if we get here, the character is in the extended range.
-            // Always use Window 4 to define an extended window
-
-            iPosition = (ch - 0x10000) >>> 7;
-            // DEBUG
-            Debug.out("Try position Window at ", iPosition);
-
-            iPosition |= iWin << 13;
-            dynamicOffset[iWin] = ch & 0x1FFF80;
-        }
-
-        // Outputting window defintion command for the general cases
-        if ( iPosition < 0x100 && iOut < out.length-1)
-        {
-            out[iOut++] = (byte) ((fCurUnicodeMode ? UD0 : SD0) + iWin);
-            out[iOut++] = (byte) (iPosition & 0xFF);
-        }
-        // Output an extended window definiton command
-        else if ( iPosition >= 0x100 && iOut < out.length - 2)
-        {
-
-            Debug.out("Setting extended window at ", iPosition);
-            out[iOut++] = (fCurUnicodeMode ? UDX : SDX);
-            out[iOut++] = (byte) ((iPosition >>> 8) & 0xFF);
-            out[iOut++] = (byte) (iPosition & 0xFF);
-        }
-        else
-        {
-            throw new EndOfOutputException();
-        }
-        selectWindow(iWin);
-        iNextWindow++;
-        return true;
-    }
-
-    /**
-    compress a Unicode character array with some simplifying assumptions
-    **/
-    public int simpleCompress(char [] in, int iStartIn, byte[] out, int iStartOut)
-        throws IllegalInputException, EndOfInputException, EndOfOutputException
-    {
-        iIn = iStartIn;
-        iOut = iStartOut;
-
-
-        while (iIn < in.length)
-        {
-            int ch;
-
-            // previously we switched to a Unicode run
-            if (iSCU != -1)
-            {
-
-                Debug.out("Remaining", in, iIn);
-                Debug.out("Output until ["+iOut+"]: ", out);
-
-                // output characters as Unicode
-                ch = outputUnicodeRun(in, out);
-
-                // for single character Unicode runs (3 bytes) use quote
-                if (iOut - iSCU == 3 )
-                {
-                    // go back and fix up the SCU to an SQU instead
-                    out[iSCU] = SQU;
-                    iSCU = -1;
-                    continue;
-                }
-                else
-                {
-                    iSCU = -1;
-                    fUnicodeMode = true;
-                }
-            }
-            // next, try to output characters as single byte run
-            else
-            {
-                ch = outputSingleByteRun(in, out);
-            }
-
-            // check whether we still have input
-            if (iIn == in.length)
-            {
-                break; // no more input
-            }
-
-            // if we get here, we have a consistent value for ch, whether or
-            // not it is an regular or extended character. Locate or define a
-            // Window for the current character
-
-            Debug.out("Output so far: ", out);
-            Debug.out("Routing ch="+ch+" for Input", in, iIn);
-
-            // Check that we have enough room to output the command byte
-            if (iOut >= out.length - 1)
-            {
-                throw new EndOfOutputException();
-            }
-
-            // In order to switch away from Unicode mode, it is necessary
-            // to select (or define) a window. If the characters that follow
-            // the Unicode range are ASCII characters, we can't use them
-            // to decide which window to select, since ASCII characters don't
-            // influence window settings. This loop looks ahead until it finds
-            // one compressible character that isn't in the ASCII range.
-            for (int ich = iIn; ch < 0x80; ich++)
-            {
-                if (ich == in.length || !isCompressible(in[ich]))
-                {
-                    // if there are only ASCII characters left,
-                    ch = in[iIn];
-                    break;
-                }
-                ch = in[ich]; // lookahead for next non-ASCII char
-            }
-            // The character value contained in ch here will only be used to select
-            // output modes. Actual output of characters starts with in[iIn] and
-            // only takes place near the top of the loop.
-
-            int iprevWindow = getCurrentWindow();
-
-            // try to locate a dynamic window
-            if (ch < 0x80 || locateWindow(ch, dynamicOffset))
-            {
-                Debug.out("located dynamic window "+getCurrentWindow()+" at ", iOut+1);
-                // lookahead to use SQn instead of SCn for single
-                // character interruptions of runs in current window
-                if(!fUnicodeMode && iIn < in.length -1)
-                {
-                    char ch2 = in[iIn+1];
-                    if (ch2 >= dynamicOffset[iprevWindow] &&
-                        ch2 <  dynamicOffset[iprevWindow] + 0x80)
-                    {
-                        quoteSingleByte(ch, out);
-                        selectWindow(iprevWindow);
-                        continue;
-                    }
-                }
-
-                out[iOut++] = (byte)((fUnicodeMode ? UC0 : SC0) + getCurrentWindow());
-                fUnicodeMode = false;
-            }
-            // try to locate a static window
-            else if (!fUnicodeMode && locateWindow(ch, staticOffset))
-            {
-                // static windows are not accessible from Unicode mode
-                Debug.out("located a static window", getCurrentWindow());
-                quoteSingleByte(ch, out);
-                selectWindow(iprevWindow); // restore current Window settings
-                continue;
-            }
-            // try to define a window around ch
-            else if (positionWindow(ch, out, fUnicodeMode) )
-            {
-                fUnicodeMode = false;
-            }
-            // If all else fails, start a Unicode run
-            else
-            {
-                iSCU = iOut;
-                out[iOut++] = SCU;
-                continue;
-            }
-        }
-
-        return iOut - iStartOut;
-    }
-
-    public byte[] compress(String inStr)
-        throws IllegalInputException, EndOfInputException
-    {
-        // Running out of room for output can cause non-optimal
-        // compression. In order to not slow down compression too
-        // much, not all intermediate state is constantly saved.
-
-        byte [] out = new byte[inStr.length() * 2];
-        char [] in = inStr.toCharArray();
-        //DEBUG
-        Debug.out("compress input: ",in);
-        reset();
-        while(true)
-        {
-            try
-            {
-                simpleCompress(in, charsRead(), out, bytesWritten());
-                // if we get here things went fine.
-                break;
-            }
-            catch (EndOfOutputException e)
-            {
-                // create a larger output buffer and continue
-                byte [] largerOut = new byte[out.length * 2];
-                System.arraycopy(out, 0, largerOut, 0, out.length);
-                out = largerOut;
-            }
-        }
-        byte [] trimmedOut = new byte[bytesWritten()];
-        System.arraycopy(out, 0, trimmedOut, 0, trimmedOut.length);
-        out = trimmedOut;
-
-        Debug.out("compress output: ", out);
-        return out;
-    }
-
-    /** reset is only needed to bail out after an exception and
-        restart with new input */
-    @Override
-    public void reset()
-    {
-        super.reset();
-        fUnicodeMode = false;
-        iSCU = - 1;
-    }
-
-    /** returns the number of bytes written **/
-    public int bytesWritten()
-    {
-        return iOut;
-    }
-
-    /** returns the number of bytes written **/
-    public int charsRead()
-    {
-        return iIn;
-    }
-
-}
diff --git a/src/main/java/com/healthmarketscience/jackcess/impl/scsu/Debug.java b/src/main/java/com/healthmarketscience/jackcess/impl/scsu/Debug.java

deleted file mode 100644 (file)

index c973765..0000000
--- a/src/main/java/com/healthmarketscience/jackcess/impl/scsu/Debug.java
+++ /dev/null
@@ -1,151 +0,0 @@
-package com.healthmarketscience.jackcess.impl.scsu;
-
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
-
-/*
- * This sample software accompanies Unicode Technical Report #6 and
- * distributed as is by Unicode, Inc., subject to the following:
- *
- * Copyright 1996-1997 Unicode, Inc.. All Rights Reserved.
- *
- * Permission to use, copy, modify, and distribute this software
- * without fee is hereby granted provided that this copyright notice
- * appears in all copies.
- *
- * UNICODE, INC. MAKES NO REPRESENTATIONS OR WARRANTIES ABOUT THE
- * SUITABILITY OF THE SOFTWARE, EITHER EXPRESS OR IMPLIED, INCLUDING
- * BUT NOT LIMITED TO THE IMPLIED WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT.
- * UNICODE, INC., SHALL NOT BE LIABLE FOR ANY ERRORS OR OMISSIONS, AND
- * SHALL NOT BE LIABLE FOR ANY DAMAGES, INCLUDING CONSEQUENTIAL AND
- * INCIDENTAL DAMAGES, SUFFERED BY YOU AS A RESULT OF USING, MODIFYING
- * OR DISTRIBUTING THIS SOFTWARE OR ITS DERIVATIVES.
- *
- *  @author Asmus Freytag
- *
- *  @version 001 Dec 25 1996
- *  @version 002 Jun 25 1997
- *  @version 003 Jul 25 1997
- *  @version 004 Aug 25 1997
- *
- * Unicode and the Unicode logo are trademarks of Unicode, Inc.,
- * and are registered in some jurisdictions.
- **/
-
-/**
- * A number of helpful output routines for debugging. Output can be
- * centrally enabled or disabled by calling Debug.set(true/false);
- * All methods are statics;
- */
-
-public class Debug
-{
-  
-    private static final Log LOG = LogFactory.getLog(Debug.class); 
-  
-    // debugging helper
-    public static void out(char [] chars)
-    {
-         out(chars, 0);
-    }
-
-    public static void out(char [] chars, int iStart)
-    {
-        if (!LOG.isDebugEnabled()) return;
-        StringBuilder msg = new StringBuilder();
-
-        for (int i = iStart; i < chars.length; i++)
-        {
-            if (chars[i] >= 0 && chars[i] <= 26)
-            {
-                msg.append("^"+(char)(chars[i]+0x40));
-            }
-            else if (chars[i] <= 255)
-            {
-                msg.append(chars[i]);
-            }
-            else
-            {
-                msg.append("\\u"+Integer.toString(chars[i],16));
-            }
-        }
-        LOG.debug(msg.toString());
-    }
-
-    public static void out(byte [] bytes)
-    {
-        out(bytes, 0);
-    }
-    public static void out(byte [] bytes, int iStart)
-    {
-        if (!LOG.isDebugEnabled()) return;
-        StringBuilder msg = new StringBuilder();
-
-        for (int i = iStart; i < bytes.length; i++)
-        {
-            msg.append(bytes[i]+",");
-        }
-        LOG.debug(msg.toString());
-    }
-
-    public static void out(String str)
-    {
-        if (!LOG.isDebugEnabled()) return;
-
-        LOG.debug(str);
-    }
-
-    public static void out(String msg, int iData)
-    {
-        if (!LOG.isDebugEnabled()) return;
-
-        LOG.debug(msg + iData);
-    }
-    public static void out(String msg, char ch)
-    {
-        if (!LOG.isDebugEnabled()) return;
-
-        LOG.debug(msg + "[U+"+Integer.toString(ch,16)+"]" + ch);
-    }
-    public static void out(String msg, byte bData)
-    {
-        if (!LOG.isDebugEnabled()) return;
-
-        LOG.debug(msg + bData);
-    }
-    public static void out(String msg, String str)
-    {
-        if (!LOG.isDebugEnabled()) return;
-
-        LOG.debug(msg + str);
-    }
-    public static void out(String msg, char [] data)
-    {
-        if (!LOG.isDebugEnabled()) return;
-
-        LOG.debug(msg);
-        out(data);
-    }
-    public static void out(String msg, byte [] data)
-    {
-        if (!LOG.isDebugEnabled()) return;
-
-        LOG.debug(msg);
-        out(data);
-    }
-    public static void out(String msg, char [] data, int iStart)
-    {
-        if (!LOG.isDebugEnabled()) return;
-
-        LOG.debug(msg +"("+iStart+"): ");
-        out(data, iStart);
-    }
-    public static void out(String msg, byte [] data, int iStart)
-    {
-        if (!LOG.isDebugEnabled()) return;
-
-        LOG.debug(msg+"("+iStart+"): ");
-        out(data, iStart);
-    }
-}
diff --git a/src/main/java/com/healthmarketscience/jackcess/impl/scsu/EndOfInputException.java b/src/main/java/com/healthmarketscience/jackcess/impl/scsu/EndOfInputException.java

deleted file mode 100644 (file)

index b3148a7..0000000
--- a/src/main/java/com/healthmarketscience/jackcess/impl/scsu/EndOfInputException.java
+++ /dev/null
@@ -1,49 +0,0 @@
-package com.healthmarketscience.jackcess.impl.scsu;
-
-/**
- * This sample software accompanies Unicode Technical Report #6 and
- * distributed as is by Unicode, Inc., subject to the following:
- *
- * Copyright 1996-1997 Unicode, Inc.. All Rights Reserved.
- *
- * Permission to use, copy, modify, and distribute this software
- * without fee is hereby granted provided that this copyright notice
- * appears in all copies.
- *
- * UNICODE, INC. MAKES NO REPRESENTATIONS OR WARRANTIES ABOUT THE
- * SUITABILITY OF THE SOFTWARE, EITHER EXPRESS OR IMPLIED, INCLUDING
- * BUT NOT LIMITED TO THE IMPLIED WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT.
- * UNICODE, INC., SHALL NOT BE LIABLE FOR ANY ERRORS OR OMISSIONS, AND
- * SHALL NOT BE LIABLE FOR ANY DAMAGES, INCLUDING CONSEQUENTIAL AND
- * INCIDENTAL DAMAGES, SUFFERED BY YOU AS A RESULT OF USING, MODIFYING
- * OR DISTRIBUTING THIS SOFTWARE OR ITS DERIVATIVES.
- *
- *  @author Asmus Freytag
- *
- *  @version 001 Dec 25 1996
- *  @version 002 Jun 25 1997
- *  @version 003 Jul 25 1997
- *  @version 004 Aug 25 1997
- *
- * Unicode and the Unicode logo are trademarks of Unicode, Inc.,
- * and are registered in some jurisdictions.
- **/
-/**
- * The input string or input byte array ended prematurely
- *
- */
-public class EndOfInputException
-    extends java.lang.Exception
-{
-  
-   private static final long serialVersionUID = 1L;
-  
-   public EndOfInputException(){
-    super("The input string or input byte array ended prematurely");
-    }
-
-    public EndOfInputException(String s) {
-       super(s);
-    }
-}
diff --git a/src/main/java/com/healthmarketscience/jackcess/impl/scsu/EndOfOutputException.java b/src/main/java/com/healthmarketscience/jackcess/impl/scsu/EndOfOutputException.java

deleted file mode 100644 (file)

index 94f5be6..0000000
--- a/src/main/java/com/healthmarketscience/jackcess/impl/scsu/EndOfOutputException.java
+++ /dev/null
@@ -1,48 +0,0 @@
-package com.healthmarketscience.jackcess.impl.scsu;
-
-/**
- * This sample software accompanies Unicode Technical Report #6 and
- * distributed as is by Unicode, Inc., subject to the following:
- *
- * Copyright 1996-1997 Unicode, Inc.. All Rights Reserved.
- *
- * Permission to use, copy, modify, and distribute this software
- * without fee is hereby granted provided that this copyright notice
- * appears in all copies.
- *
- * UNICODE, INC. MAKES NO REPRESENTATIONS OR WARRANTIES ABOUT THE
- * SUITABILITY OF THE SOFTWARE, EITHER EXPRESS OR IMPLIED, INCLUDING
- * BUT NOT LIMITED TO THE IMPLIED WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT.
- * UNICODE, INC., SHALL NOT BE LIABLE FOR ANY ERRORS OR OMISSIONS, AND
- * SHALL NOT BE LIABLE FOR ANY DAMAGES, INCLUDING CONSEQUENTIAL AND
- * INCIDENTAL DAMAGES, SUFFERED BY YOU AS A RESULT OF USING, MODIFYING
- * OR DISTRIBUTING THIS SOFTWARE OR ITS DERIVATIVES.
- *
- *  @author Asmus Freytag
- *
- *  @version 001 Dec 25 1996
- *  @version 002 Jun 25 1997
- *  @version 003 Jul 25 1997
- *
- * Unicode and the Unicode logo are trademarks of Unicode, Inc.,
- * and are registered in some jurisdictions.
- **/
-/**
- * The input string or input byte array ended prematurely
- */
-public class EndOfOutputException
-    extends java.lang.Exception
-
-{
-
-   private static final long serialVersionUID = 1L;
-  
-   public EndOfOutputException(){
-    super("The input string or input byte array ended prematurely");
-    }
-
-    public EndOfOutputException(String s) {
-       super(s);
-    }
-}
diff --git a/src/main/java/com/healthmarketscience/jackcess/impl/scsu/Expand.java b/src/main/java/com/healthmarketscience/jackcess/impl/scsu/Expand.java

deleted file mode 100644 (file)

index 378ca2f..0000000
--- a/src/main/java/com/healthmarketscience/jackcess/impl/scsu/Expand.java
+++ /dev/null
@@ -1,431 +0,0 @@
-package com.healthmarketscience.jackcess.impl.scsu;
-
-/*
- * This sample software accompanies Unicode Technical Report #6 and
- * distributed as is by Unicode, Inc., subject to the following:
- *
- * Copyright 1996-1998 Unicode, Inc.. All Rights Reserved.
- *
- * Permission to use, copy, modify, and distribute this software
- * without fee is hereby granted provided that this copyright notice
- * appears in all copies.
- *
- * UNICODE, INC. MAKES NO REPRESENTATIONS OR WARRANTIES ABOUT THE
- * SUITABILITY OF THE SOFTWARE, EITHER EXPRESS OR IMPLIED, INCLUDING
- * BUT NOT LIMITED TO THE IMPLIED WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT.
- * UNICODE, INC., SHALL NOT BE LIABLE FOR ANY ERRORS OR OMISSIONS, AND
- * SHALL NOT BE LIABLE FOR ANY DAMAGES, INCLUDING CONSEQUENTIAL AND
- * INCIDENTAL DAMAGES, SUFFERED BY YOU AS A RESULT OF USING, MODIFYING
- * OR DISTRIBUTING THIS SOFTWARE OR ITS DERIVATIVES.
- *
- *  @author Asmus Freytag
- *
- *  @version 001 Dec 25 1996
- *  @version 002 Jun 25 1997
- *  @version 003 Jul 25 1997
- *  @version 004 Aug 25 1997
- *  @version 005 Sep 30 1998  
- *
- * Unicode and the Unicode logo are trademarks of Unicode, Inc.,
- * and are registered in some jurisdictions.
- **/
-
- /**
-    Reference decoder for the Standard Compression Scheme for Unicode (SCSU)
-
-    <H2>Notes on the Java implementation</H2>
-
-    A limitation of Java is the exclusive use of a signed byte data type.
-    The following work arounds are required:
-
-    Copying a byte to an integer variable and adding 256 for 'negative'
-    bytes gives an integer in the range 0-255.
-
-    Values of char are between 0x0000 and 0xFFFF in Java. Arithmetic on
-    char values is unsigned.
-
-    Extended characters require an int to store them. The sign is not an
-    issue because only 1024*1024 + 65536 extended characters exist.
-
-**/
-public class Expand extends SCSU
-{
-    /** (re-)define (and select) a dynamic window
-    A sliding window position cannot start at any Unicode value,
-    so rather than providing an absolute offset, this function takes
-    an index value which selects among the possible starting values.
-
-    Most scripts in Unicode start on or near a half-block boundary
-    so the default behaviour is to multiply the index by 0x80. Han,
-    Hangul, Surrogates and other scripts between 0x3400 and 0xDFFF
-    show very poor locality--therefore no sliding window can be set
-    there. A jumpOffset is added to the index value to skip that region,
-    and only 167 index values total are required to select all eligible
-    half-blocks.
-
-    Finally, a few scripts straddle half block boundaries. For them, a
-    table of fixed offsets is used, and the index values from 0xF9 to
-    0xFF are used to select these special offsets.
-
-    After (re-)defining a windows location it is selected so it is ready
-    for use.
-
-    Recall that all Windows are of the same length (128 code positions).
-
-    @param iWindow - index of the window to be (re-)defined
-    @param bOffset - index for the new offset value
-    **/
-       // @005 protected <-- private here and elsewhere
-    protected void defineWindow(int iWindow, byte bOffset)
-        throws IllegalInputException
-    {
-        int iOffset = (bOffset < 0 ? bOffset + 256 : bOffset);
-
-        // 0 is a reserved value
-        if (iOffset == 0)
-        {
-            throw new IllegalInputException();
-        }
-        else if (iOffset < gapThreshold)
-        {
-            dynamicOffset[iWindow] = iOffset << 7;
-        }
-        else if (iOffset < reservedStart)
-        {
-            dynamicOffset[iWindow] = (iOffset << 7) + gapOffset;
-        }
-        else if (iOffset < fixedThreshold)
-        {
-            // more reserved values
-            throw new IllegalInputException("iOffset == "+iOffset);
-        }
-        else
-        {
-            dynamicOffset[iWindow] = fixedOffset[iOffset - fixedThreshold];
-        }
-
-        // make the redefined window the active one
-        selectWindow(iWindow);
-    }
-
-    /** (re-)define (and select) a window as an extended dynamic window
-    The surrogate area in Unicode allows access to 2**20 codes beyond the
-    first 64K codes by combining one of 1024 characters from the High
-    Surrogate Area with one of 1024 characters from the Low Surrogate
-    Area (see Unicode 2.0 for the details).
-
-    The tags SDX and UDX set the window such that each subsequent byte in
-    the range 80 to FF represents a surrogate pair. The following diagram
-    shows how the bits in the two bytes following the SDX or UDX, and a
-    subsequent data byte, map onto the bits in the resulting surrogate pair.
-
-     hbyte         lbyte          data
-    nnnwwwww      zzzzzyyy      1xxxxxxx
-
-     high-surrogate     low-surrogate
-    110110wwwwwzzzzz   110111yyyxxxxxxx
-
-    @param chOffset - Since the three top bits of chOffset are not needed to
-    set the location of the extended Window, they are used instead
-    to select the window, thereby reducing the number of needed command codes.
-    The bottom 13 bits of chOffset are used to calculate the offset relative to
-    a 7 bit input data byte to yield the 20 bits expressed by each surrogate pair.
-    **/
-    protected void defineExtendedWindow(char chOffset)
-    {
-        // The top 3 bits of iOffsetHi are the window index
-        int iWindow = chOffset >>> 13;
-
-        // Calculate the new offset
-        dynamicOffset[iWindow] = ((chOffset & 0x1FFF) << 7) + (1 << 16);
-
-        // make the redefined window the active one
-        selectWindow(iWindow);
-    }
-
-    /** string buffer length used by the following functions */
-    protected int iOut = 0;
-
-    /** input cursor used by the following functions */
-    protected int iIn = 0;
-
-    /** expand input that is in Unicode mode
-    @param in input byte array to be expanded
-    @param iCur starting index
-    @param sb string buffer to which to append expanded input
-    @return the index for the lastc byte processed
-    **/
-    protected int expandUnicode(byte []in, int iCur, StringBuilder sb)
-        throws IllegalInputException, EndOfInputException
-    {
-        for( ; iCur < in.length-1; iCur+=2 ) // step by 2:
-        {
-            byte b = in[iCur];
-
-            if (b >= UC0 && b <= UC7)
-            {
-                Debug.out("SelectWindow: ", b);
-                selectWindow(b - UC0);
-                return iCur;
-            }
-            else if (b >= UD0 && b <= UD7)
-            {
-                defineWindow( b - UD0, in[iCur+1]);
-                return iCur + 1;
-            }
-            else if (b == UDX)
-            {
-                if( iCur >= in.length - 2)
-                {
-                    break; // buffer error
-                }
-                defineExtendedWindow(charFromTwoBytes(in[iCur+1], in[iCur+2]));
-                return iCur + 2;
-            }
-            else if (b == UQU)
-            {
-                if( iCur >= in.length - 2)
-                {
-                    break; // error
-                }
-                // Skip command byte and output Unicode character
-                iCur++;
-            }
-
-            // output a Unicode character
-            char ch = charFromTwoBytes(in[iCur], in[iCur+1]);
-            sb.append(ch);
-            iOut++;
-        }
-
-        if( iCur == in.length)
-        {
-            return iCur;
-        }
-
-        // Error condition
-        throw new EndOfInputException();
-    }
-
-    /** assemble a char from two bytes
-    In Java bytes are signed quantities, while chars are unsigned
-    @return the character
-    @param hi most significant byte
-    @param lo least significant byte
-    */
-    public static char charFromTwoBytes(byte hi, byte lo)
-    {
-        char ch = (char)(lo >= 0 ? lo : 256 + lo);
-        return (char)(ch + (char)((hi >= 0 ? hi : 256 + hi)<<8));
-    }
-
-    /** expand portion of the input that is in single byte mode **/
-    @SuppressWarnings("fallthrough")
-    protected String expandSingleByte(byte []in)
-        throws IllegalInputException, EndOfInputException
-    {
-
-        /* Allocate the output buffer. Because of control codes, generally
-        each byte of input results in fewer than one character of
-        output. Using in.length as an intial allocation length should avoid
-        the need to reallocate in mid-stream. The exception to this rule are
-        surrogates. */
-        StringBuilder sb = new StringBuilder(in.length);
-        iOut = 0;
-
-        // Loop until all input is exhausted or an error occurred
-        int iCur;
-        Loop:
-        for( iCur = 0; iCur < in.length; iCur++ )
-        {
-            // DEBUG Debug.out("Expanding: ", iCur);
-
-            // Default behaviour is that ASCII characters are passed through
-            // (staticOffset[0] == 0) and characters with the high bit on are
-            // offset by the current dynamic (or sliding) window (this.iWindow)
-            int iStaticWindow = 0;
-            int iDynamicWindow = getCurrentWindow();
-
-            switch(in[iCur])
-            {
-                // Quote from a static Window
-            case SQ0:
-            case SQ1:
-            case SQ2:
-            case SQ3:
-            case SQ4:
-            case SQ5:
-            case SQ6:
-            case SQ7:
-                Debug.out("SQn:", iStaticWindow);
-                // skip the command byte and check for length
-                if( iCur >= in.length - 1)
-                {
-                    Debug.out("SQn missing argument: ", in, iCur);
-                    break Loop;  // buffer length error
-                }
-                // Select window pair to quote from
-                iDynamicWindow = iStaticWindow = in[iCur] - SQ0;
-                iCur ++;
-
-                // FALL THROUGH
-
-            default:
-                // output as character
-                if(in[iCur] >= 0)
-                {
-                    // use static window
-                    int ch = in[iCur] + staticOffset[iStaticWindow];
-                    sb.append((char)ch);
-                    iOut++;
-                }
-                else
-                {
-                    // use dynamic window
-                    int ch = (in[iCur] + 256); // adjust for signed bytes
-                    ch -= 0x80;                // reduce to range 00..7F
-                    ch += dynamicOffset[iDynamicWindow];
-
-                    //DEBUG
-                    Debug.out("Dynamic: ", (char) ch);
-
-                    if (ch < 1<<16)
-                    {
-                        // in Unicode range, output directly
-                        sb.append((char)ch);
-                        iOut++;
-                    }
-                    else
-                    {
-                        // this is an extension character
-                        Debug.out("Extension character: ", ch);
-
-                        // compute and append the two surrogates:
-                        // translate from 10000..10FFFF to 0..FFFFF
-                        ch -= 0x10000;
-
-                        // high surrogate = top 10 bits added to D800
-                        sb.append((char)(0xD800 + (ch>>10)));
-                        iOut++;
-
-                        // low surrogate = bottom 10 bits added to DC00
-                        sb.append((char)(0xDC00 + (ch & ~0xFC00)));
-                        iOut++;
-                    }
-                }
-                break;
-
-                // define a dynamic window as extended
-            case SDX:
-                iCur += 2;
-                if( iCur >= in.length)
-                {
-                    Debug.out("SDn missing argument: ", in, iCur -1);
-                    break Loop;  // buffer length error
-                }
-                defineExtendedWindow(charFromTwoBytes(in[iCur-1], in[iCur]));
-                break;
-
-                // Position a dynamic Window
-            case SD0:
-            case SD1:
-            case SD2:
-            case SD3:
-            case SD4:
-            case SD5:
-            case SD6:
-            case SD7:
-                iCur ++;
-                if( iCur >= in.length)
-                {
-                    Debug.out("SDn missing argument: ", in, iCur -1);
-                    break Loop;  // buffer length error
-                }
-                defineWindow(in[iCur-1] - SD0, in[iCur]);
-                break;
-
-                // Select a new dynamic Window
-            case SC0:
-            case SC1:
-            case SC2:
-            case SC3:
-            case SC4:
-            case SC5:
-            case SC6:
-            case SC7:
-                selectWindow(in[iCur] - SC0);
-                break;
-            case SCU:
-                // switch to Unicode mode and continue parsing
-                iCur = expandUnicode(in, iCur+1, sb);
-                // DEBUG Debug.out("Expanded Unicode range until: ", iCur);
-                break;
-
-            case SQU:
-                // directly extract one Unicode character
-                iCur += 2;
-                if( iCur >= in.length)
-                {
-                     Debug.out("SQU missing argument: ", in, iCur - 2);
-                     break Loop;  // buffer length error
-                }
-                else
-                {
-                    char ch = charFromTwoBytes(in[iCur-1], in[iCur]);
-
-                    Debug.out("Quoted: ", ch);
-                    sb.append(ch);
-                    iOut++;
-                }
-                break;
-
-             case Srs:
-                throw new IllegalInputException();
-                // break;
-            }
-        }
-
-        if( iCur >= in.length)
-        {
-            //SUCCESS: all input used up
-            sb.setLength(iOut);
-            iIn = iCur;
-            return sb.toString();
-        }
-
-        Debug.out("Length ==" + in.length+" iCur =", iCur);
-        //ERROR: premature end of input
-        throw new EndOfInputException();
-    }
-
-    /** expand a byte array containing compressed Unicode */
-    public String expand (byte []in)
-        throws IllegalInputException, EndOfInputException
-    {
-        String str = expandSingleByte(in);
-        Debug.out("expand output: ", str.toCharArray());
-        return str;
-    }
-
-
-    /** reset is called to start with new input, w/o creating a new
-        instance */
-    @Override
-    public void reset()
-    {
-        iOut = 0;
-        iIn = 0;
-        super.reset();
-    }
-
-    public int charsWritten()
-    {
-        return iOut;
-    }
-
-    public int bytesRead()
-    {
-        return iIn;
-    }
-}
diff --git a/src/main/java/com/healthmarketscience/jackcess/impl/scsu/IllegalInputException.java b/src/main/java/com/healthmarketscience/jackcess/impl/scsu/IllegalInputException.java

deleted file mode 100644 (file)

index b191f56..0000000
--- a/src/main/java/com/healthmarketscience/jackcess/impl/scsu/IllegalInputException.java
+++ /dev/null
@@ -1,48 +0,0 @@
-package com.healthmarketscience.jackcess.impl.scsu;
-
-/**
- * This sample software accompanies Unicode Technical Report #6 and
- * distributed as is by Unicode, Inc., subject to the following:
- *
- * Copyright 1996-1997 Unicode, Inc.. All Rights Reserved.
- *
- * Permission to use, copy, modify, and distribute this software
- * without fee is hereby granted provided that this copyright notice
- * appears in all copies.
- *
- * UNICODE, INC. MAKES NO REPRESENTATIONS OR WARRANTIES ABOUT THE
- * SUITABILITY OF THE SOFTWARE, EITHER EXPRESS OR IMPLIED, INCLUDING
- * BUT NOT LIMITED TO THE IMPLIED WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT.
- * UNICODE, INC., SHALL NOT BE LIABLE FOR ANY ERRORS OR OMISSIONS, AND
- * SHALL NOT BE LIABLE FOR ANY DAMAGES, INCLUDING CONSEQUENTIAL AND
- * INCIDENTAL DAMAGES, SUFFERED BY YOU AS A RESULT OF USING, MODIFYING
- * OR DISTRIBUTING THIS SOFTWARE OR ITS DERIVATIVES.
- *
- *  @author Asmus Freytag
- *
- *  @version 001 Dec 25 1996
- *  @version 002 Jun 25 1997
- *  @version 003 Jul 25 1997
- *  @version 004 Aug 25 1997
- *
- * Unicode and the Unicode logo are trademarks of Unicode, Inc.,
- * and are registered in some jurisdictions.
- **/
-/**
- * The input character array or input byte array contained
- * illegal sequences of bytes or characters
- */
-public class IllegalInputException extends java.lang.Exception
-{
-  
-   private static final long serialVersionUID = 1L;
-  
-   public IllegalInputException(){
-    super("The input character array or input byte array contained illegal sequences of bytes or characters");
-    }
-
-    public IllegalInputException(String s) {
-       super(s);
-    }
-}
diff --git a/src/main/java/com/healthmarketscience/jackcess/impl/scsu/SCSU.java b/src/main/java/com/healthmarketscience/jackcess/impl/scsu/SCSU.java

deleted file mode 100644 (file)

index 7859780..0000000
--- a/src/main/java/com/healthmarketscience/jackcess/impl/scsu/SCSU.java
+++ /dev/null
@@ -1,252 +0,0 @@
-package com.healthmarketscience.jackcess.impl.scsu;
-
-/*
- * This sample software accompanies Unicode Technical Report #6 and
- * distributed as is by Unicode, Inc., subject to the following:
- *
- * Copyright 1996-1998 Unicode, Inc.. All Rights Reserved.
- *
- * Permission to use, copy, modify, and distribute this software
- * without fee is hereby granted provided that this copyright notice
- * appears in all copies.
- *
- * UNICODE, INC. MAKES NO REPRESENTATIONS OR WARRANTIES ABOUT THE
- * SUITABILITY OF THE SOFTWARE, EITHER EXPRESS OR IMPLIED, INCLUDING
- * BUT NOT LIMITED TO THE IMPLIED WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT.
- * UNICODE, INC., SHALL NOT BE LIABLE FOR ANY ERRORS OR OMISSIONS, AND
- * SHALL NOT BE LIABLE FOR ANY DAMAGES, INCLUDING CONSEQUENTIAL AND
- * INCIDENTAL DAMAGES, SUFFERED BY YOU AS A RESULT OF USING, MODIFYING
- * OR DISTRIBUTING THIS SOFTWARE OR ITS DERIVATIVES.
- *
- *  @author Asmus Freytag
- *
- *  @version 001 Dec 25 1996
- *  @version 002 Jun 25 1997
- *  @version 003 Jul 25 1997
- *  @version 004 Aug 25 1997
- *  @version 005 Sep 30 1998
- *
- * Unicode and the Unicode logo are trademarks of Unicode, Inc.,
- * and are registered in some jurisdictions.
- **/
-
- /**
-    Encoding text data in Unicode often requires more storage than using
-    an existing 8-bit character set and limited to the subset of characters
-    actually found in the text. The Unicode Compression Algorithm reduces
-    the necessary storage while retaining the universality of Unicode.
-    A full description of the algorithm can be found in document
-    http://www.unicode.org/unicode/reports/tr6.html
-
-    Summary
-
-    The goal of the Unicode Compression Algorithm is the abilty to
-    * Express all code points in Unicode
-    * Approximate storage size for traditional character sets
-    * Work well for short strings
-    * Provide transparency for Latin-1 data
-    * Support very simple decoders
-    * Support simple as well as sophisticated encoders
-
-    If needed, further compression can be achieved by layering standard
-    file or disk-block based compression algorithms on top.
-
-    <H2>Features</H2>
-
-    Languages using small alphabets would contain runs of characters that
-    are coded close together in Unicode. These runs are interrupted only
-    by punctuation characters, which are themselves coded in proximity to
-    each other in Unicode (usually in the ASCII range).
-
-    Two basic mechanisms in the compression algorithm account for these two
-    cases, sliding windows and static windows. A window is an area of 128
-    consecutive characters in Unicode. In the compressed data stream, each
-    character from a sliding window would be represented as a byte between
-    0x80 and 0xFF, while a byte from 0x20 to 0x7F (as well as CR, LF, and
-    TAB) would always mean an ASCII character (or control).
-
-    <H2>Notes on the Java implementation</H2>
-
-    A limitation of Java is the exclusive use of a signed byte data type.
-    The following work arounds are required:
-
-    Copying a byte to an integer variable and adding 256 for 'negative'
-    bytes gives an integer in the range 0-255.
-
-    Values of char are between 0x0000 and 0xFFFF in Java. Arithmetic on
-    char values is unsigned.
-
-    Extended characters require an int to store them. The sign is not an
-    issue because only 1024*1024 + 65536 extended characters exist.
-
-**/
-public abstract class SCSU
-{
-    /** Single Byte mode command values */
-
-    /** SQ<i>n</i> Quote from Window . <p>
-    If the following byte is less than 0x80, quote from
-    static window <i>n</i>, else quote from dynamic window <i>n</i>.
-    */
-
-    static final byte SQ0 = 0x01; // Quote from window pair 0
-    static final byte SQ1 = 0x02; // Quote from window pair 1
-    static final byte SQ2 = 0x03; // Quote from window pair 2
-    static final byte SQ3 = 0x04; // Quote from window pair 3
-    static final byte SQ4 = 0x05; // Quote from window pair 4
-    static final byte SQ5 = 0x06; // Quote from window pair 5
-    static final byte SQ6 = 0x07; // Quote from window pair 6
-    static final byte SQ7 = 0x08; // Quote from window pair 7
-
-    static final byte SDX = 0x0B; // Define a window as extended
-    static final byte Srs = 0x0C; // reserved
-
-    static final byte SQU = 0x0E; // Quote a single Unicode character
-    static final byte SCU = 0x0F; // Change to Unicode mode
-
-    /** SC<i>n</i> Change to Window <i>n</i>. <p>
-    If the following bytes are less than 0x80, interpret them
-    as command bytes or pass them through, else add the offset
-    for dynamic window <i>n</i>. */
-    static final byte SC0 = 0x10; // Select window 0
-    static final byte SC1 = 0x11; // Select window 1
-    static final byte SC2 = 0x12; // Select window 2
-    static final byte SC3 = 0x13; // Select window 3
-    static final byte SC4 = 0x14; // Select window 4
-    static final byte SC5 = 0x15; // Select window 5
-    static final byte SC6 = 0x16; // Select window 6
-    static final byte SC7 = 0x17; // Select window 7
-    static final byte SD0 = 0x18; // Define and select window 0
-    static final byte SD1 = 0x19; // Define and select window 1
-    static final byte SD2 = 0x1A; // Define and select window 2
-    static final byte SD3 = 0x1B; // Define and select window 3
-    static final byte SD4 = 0x1C; // Define and select window 4
-    static final byte SD5 = 0x1D; // Define and select window 5
-    static final byte SD6 = 0x1E; // Define and select window 6
-    static final byte SD7 = 0x1F; // Define and select window 7
-
-    static final byte UC0 = (byte) 0xE0; // Select window 0
-    static final byte UC1 = (byte) 0xE1; // Select window 1
-    static final byte UC2 = (byte) 0xE2; // Select window 2
-    static final byte UC3 = (byte) 0xE3; // Select window 3
-    static final byte UC4 = (byte) 0xE4; // Select window 4
-    static final byte UC5 = (byte) 0xE5; // Select window 5
-    static final byte UC6 = (byte) 0xE6; // Select window 6
-    static final byte UC7 = (byte) 0xE7; // Select window 7
-    static final byte UD0 = (byte) 0xE8; // Define and select window 0
-    static final byte UD1 = (byte) 0xE9; // Define and select window 1
-    static final byte UD2 = (byte) 0xEA; // Define and select window 2
-    static final byte UD3 = (byte) 0xEB; // Define and select window 3
-    static final byte UD4 = (byte) 0xEC; // Define and select window 4
-    static final byte UD5 = (byte) 0xED; // Define and select window 5
-    static final byte UD6 = (byte) 0xEE; // Define and select window 6
-    static final byte UD7 = (byte) 0xEF; // Define and select window 7
-
-    static final byte UQU = (byte) 0xF0; // Quote a single Unicode character
-    static final byte UDX = (byte) 0xF1; // Define a Window as extended
-    static final byte Urs = (byte) 0xF2; // reserved
-
-    /** constant offsets for the 8 static windows */
-    static final int staticOffset[] =
-    {
-        0x0000, // ASCII for quoted tags
-        0x0080, // Latin - 1 Supplement (for access to punctuation)
-        0x0100, // Latin Extended-A
-        0x0300, // Combining Diacritical Marks
-        0x2000, // General Punctuation
-        0x2080, // Currency Symbols
-        0x2100, // Letterlike Symbols and Number Forms
-        0x3000  // CJK Symbols and punctuation
-    };
-
-    /** initial offsets for the 8 dynamic (sliding) windows */
-    static final int initialDynamicOffset[] =
-    {
-        0x0080, // Latin-1
-        0x00C0, // Latin Extended A   //@005 fixed from 0x0100
-        0x0400, // Cyrillic
-        0x0600, // Arabic
-        0x0900, // Devanagari
-        0x3040, // Hiragana
-        0x30A0, // Katakana
-        0xFF00  // Fullwidth ASCII
-    };
-
-    /** dynamic window offsets, intitialize to default values. */
-    int dynamicOffset[] =
-    {
-        initialDynamicOffset[0],
-        initialDynamicOffset[1],
-        initialDynamicOffset[2],
-        initialDynamicOffset[3],
-        initialDynamicOffset[4],
-        initialDynamicOffset[5],
-        initialDynamicOffset[6],
-        initialDynamicOffset[7]
-    };
-
-    // The following method is common to encoder and decoder
-
-    private int iWindow = 0;    // current active window
-
-    /** select the active dynamic window **/
-    protected void selectWindow(int iWindow)
-    {
-        this.iWindow = iWindow;
-    }
-
-    /** select the active dynamic window **/
-    protected int getCurrentWindow()
-    {
-        return this.iWindow;
-    }
-
-    /**
-       These values are used in defineWindow
-     **/
-
-    /**
-     * Unicode code points from 3400 to E000 are not adressible by
-     * dynamic window, since in these areas no short run alphabets are
-     * found. Therefore add gapOffset to all values from gapThreshold */
-    static final int gapThreshold = 0x68;
-    static final int gapOffset = 0xAC00;
-
-    /* values between reservedStart and fixedThreshold are reserved */
-    static final int reservedStart = 0xA8;
-
-    /* use table of predefined fixed offsets for values from fixedThreshold */
-    static final int fixedThreshold = 0xF9;
-
-    /** Table of fixed predefined Offsets, and byte values that index into  **/
-    static final int fixedOffset[] =
-    {
-        /* 0xF9 */ 0x00C0, // Latin-1 Letters + half of Latin Extended A
-        /* 0xFA */ 0x0250, // IPA extensions
-        /* 0xFB */ 0x0370, // Greek
-        /* 0xFC */ 0x0530, // Armenian
-        /* 0xFD */ 0x3040, // Hiragana
-        /* 0xFE */ 0x30A0, // Katakana
-        /* 0xFF */ 0xFF60  // Halfwidth Katakana
-    };
-
-    /** whether a character is compressible */
-    public static boolean isCompressible(char ch)
-    {
-        return (ch < 0x3400 || ch >= 0xE000);
-    }
-
-    /** reset is only needed to bail out after an exception and
-        restart with new input */
-    public void reset()
-    {
-
-        // reset the dynamic windows
-        for (int i = 0; i < dynamicOffset.length; i++)
-        {
-            dynamicOffset[i] = initialDynamicOffset[i];
-        }
-        this.iWindow = 0;
-    }
-}
diff --git a/src/test/data/V2003/testUnicodeCompV2003.mdb b/src/test/data/V2003/testUnicodeCompV2003.mdb

new file mode 100644 (file)

index 0000000..ee7e12a

Binary files /dev/null and b/src/test/data/V2003/testUnicodeCompV2003.mdb differ
diff --git a/src/test/java/com/healthmarketscience/jackcess/DatabaseTest.java b/src/test/java/com/healthmarketscience/jackcess/DatabaseTest.java

index a25c6e81d9e4f23f861d7c6ca044ea41e5f66a5b..6f101fa38828c047424c6b74183b8689832cb204 100644 (file)
--- a/src/test/java/com/healthmarketscience/jackcess/DatabaseTest.java
+++ b/src/test/java/com/healthmarketscience/jackcess/DatabaseTest.java
@@ -1492,6 +1492,35 @@ public class DatabaseTest extends TestCase
      assertEquals("Row[1:1][{id=37,data=<null>}]", row.toString());
    }
  
+  /** Checks each row's "Unicode" value in table "Table" of the test db against the expected string for its ID. */
+  public void testUnicodeCompression() throws Exception
+  {
+    File dbFile = new File("src/test/data/V2003/testUnicodeCompV2003.mdb");
+    Database db = open(Database.FileFormat.V2003, dbFile);
+    try {
+      StringBuilder sb = new StringBuilder(0xFF); // holds chars 0x01 through 0xFF (255 total)
+      for(int i = 1; i <= 0xFF; ++i) {
+        sb.append((char)i);
+      }
+      String[] expectedStrs = {
+        "only ascii chars",
+        "\u00E4\u00E4kk\u00F6si\u00E4",
+        "\u041C\u0438\u0440",
+        "\u03F0\u03B1\u1F76 \u03C4\u1F79\u03C4' \u1F10\u03B3\u1F7C \u039A\u1F7B\u03F0\u03BB\u03C9\u03C0\u03B1",
+        "\u6F22\u5B57\u4EEE\u540D\u4EA4\u3058\u308A\u6587",
+        "3L9\u001D52\u0002_AB(\u00A5\u0005!!V",
+        "\u00FCmlaut",
+        sb.toString()};
+      for(Row row : db.getTable("Table")) {
+        int id = (Integer)row.get("ID"); // IDs are used as 1-based indexes into expectedStrs
+        String str = (String)row.get("Unicode");
+        assertEquals(expectedStrs[id-1], str);
+      }
+    } finally {
+      db.close(); // release the database file even when an assertion above fails
+    }
+  }
+
    private void checkRawValue(String expected, Object val)
    {
      if(expected != null) {
@@ -1536,7 +1565,7 @@ public class DatabaseTest extends TestCase
    }
  
    static String createNonAsciiString(int len) {
-    return createString(len, '\u00C0');
+    return createString(len, '\u0CC0');
    }
      
    private static String createString(int len, char firstChar) {
diff --git a/src/test/java/com/healthmarketscience/jackcess/impl/scsu/CompressMain.java b/src/test/java/com/healthmarketscience/jackcess/impl/scsu/CompressMain.java

deleted file mode 100644 (file)

index 52b9e86..0000000
--- a/src/test/java/com/healthmarketscience/jackcess/impl/scsu/CompressMain.java
+++ /dev/null
@@ -1,574 +0,0 @@
-package com.healthmarketscience.jackcess.impl.scsu;
-
-import java.io.*;
-import java.util.*;
-
-/**
- * This sample software accompanies Unicode Technical Report #6 and
- * distributed as is by Unicode, Inc., subject to the following:
- *
- * Copyright 1996-1998 Unicode, Inc.. All Rights Reserved.
- *
- * Permission to use, copy, modify, and distribute this software
- * without fee is hereby granted provided that this copyright notice
- * appears in all copies.
- *
- * UNICODE, INC. MAKES NO REPRESENTATIONS OR WARRANTIES ABOUT THE
- * SUITABILITY OF THE SOFTWARE, EITHER EXPRESS OR IMPLIED, INCLUDING
- * BUT NOT LIMITED TO THE IMPLIED WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT.
- * UNICODE, INC., SHALL NOT BE LIABLE FOR ANY ERRORS OR OMISSIONS, AND
- * SHALL NOT BE LIABLE FOR ANY DAMAGES, INCLUDING CONSEQUENTIAL AND
- * INCIDENTAL DAMAGES, SUFFERED BY YOU AS A RESULT OF USING, MODIFYING
- * OR DISTRIBUTING THIS SOFTWARE OR ITS DERIVATIVES.
- *
- *  @author Asmus Freytag
- *
- *  @version 001 Dec 25 1996
- *  @version 002 Jun 25 1997
- *  @version 003 Jul 25 1997
- *  @version 004 Aug 25 1997
- *  @version 005 Sep 30 1998
- *
- * Unicode and the Unicode logo are trademarks of Unicode, Inc.,
- * and are registered in some jurisdictions.
- **/
-
-/**
-       Class CompressMain
-
-       A small commandline driver interface for the compression routines
-       Use the /? to get usage
-*/
-public class CompressMain
-{
-       static void usage()
-       {
-               System.err.println("java CompressMain /?               : this usage information\n");
-               System.err.println("java CompressMain /random              : random test\n");
-               System.err.println("java CompressMain /suite           : suite test\n");
-               System.err.println("java CompressMain /suite <file>    : file test (file data may include \\uXXXX)\n");
-               System.err.println("java CompressMain <string>             : string test (string may include \\uXXXX)\n");
-               System.err.println("java CompressMain /roundtrip <file>: check Unicode file for roundtrip\n");
-               System.err.println("java CompressMain /compress <file> : compresses Unicode files (no \\uXXXX)\n");
-               System.err.println("java CompressMain /expand <file>   : expands into Unicode files\n");
-               System.err.println("java CompressMain /byteswap <files>: swaps byte order of Unicode files\n");
-               System.err.println("java CompressMain /display <files> : like expand, but creates a dump instead\n");
-               System.err.println("java CompressMain /parse <files>   : parses \\uXXXX into binary Unicode\n");
-       }
-
-    static void analyze(String text, int inlength, String result, int outlength)
-    {
-        boolean fSuccess = text.equals(result);
-        Debug.out(fSuccess ? "Round trip OK" : "Round trip FAILED");
-        if (!fSuccess && result != null)
-        {
-            int iLim = Math.min(text.length(), result.length());
-            for (int i = 0; i < iLim; i++)
-            {
-                if (text.charAt(i) != result.charAt(i))
-                {
-                    Debug.out("First Mismatch at  "+ i +"=", result.charAt(i) );
-                    Debug.out("Original character "+ i +"=", text.charAt(i) );
-                    break;
-                }
-            }
-        }
-        else
-        {
-            Debug.out("Compressed: "+inlength+" chars to "+outlength+" bytes.");
-            Debug.out(" Ratio: "+(outlength == 0 ? 0 :(outlength * 50 / inlength))+"%.");
-        }
-    }
-
-    static void test2(String text)
-    {
-        byte bytes[] = null;
-        String result = null;
-        Debug.out("SCSU:\n");
-        Compress compressor = new Compress();
-        try
-        {
-            bytes = compressor.compress(text);
-            Expand display = new Expand();
-            result = display.expand(bytes);
-            Debug.out("Input:  ", text.toCharArray());
-            Debug.out("Result: ", result.toCharArray());
-            Debug.out("");
-            Expand expander = new Expand();
-            result = expander.expand(bytes);
-        }
-        catch (Exception e)
-        {
-            System.out.println(e);
-        }
-        int inlength = compressor.charsRead();
-        int outlength = compressor.bytesWritten();
-        analyze(text, inlength, result, outlength);
-    }
-
-    static void test(String text) throws Exception
-    {
-      test(text, false);
-    }
-
-  static void test(String text, boolean shouldFail)
-      throws Exception
-    {
-        // Create an instance of the compressor
-        Compress compressor = new Compress();
-
-        byte [] bytes = null;
-        String result = null;
-        Exception failure = null;
-        try {
-            // perform compression
-            bytes = compressor.compress(text);
-        }
-        catch(Exception e)
-        {
-            failure = e;
-        }
-
-        if(shouldFail) {
-          if(failure == null) {
-            throw new RuntimeException("Did not fail");
-          }
-          return;
-        }
-
-        if(failure != null) {
-          throw failure;
-        }
-
-        Expand expander = new Expand();
-        // perform expansion
-        result = expander.expand(bytes);
-
-        // analyze the results
-        int inlength = compressor.charsRead();
-        int outlength = compressor.bytesWritten();
-        analyze(text, inlength, result, outlength);
-
-    }
-
-    public static void display(byte [] input)
-    {
-        try
-        {
-            Expand expand = new Expand();
-            String text = expand.expand(input);
-            Debug.out(text.toCharArray());
-        }
-        catch (Exception e)
-        {
-            System.out.println(e);
-        }
-    }
-
-    public static String parse(String input)
-    {
-        StringTokenizer st = new StringTokenizer(input, "\\", true);
-        Debug.out("Input: ", input);
-
-        StringBuffer sb = new StringBuffer();
-
-        while(st.hasMoreTokens())
-        {
-            String token = st.nextToken();
-                    Debug.out("Token: ", token);
-            if (token.charAt(0) == '\\' && token.length() == 1)
-            {
-                if(st.hasMoreTokens())
-                {
-                    token = st.nextToken();
-                }
-                if(token.charAt(0) == 'u')
-                {
-                    Debug.out("Token: "+ token+ " ", sb.toString());
-                    String hexnum;
-                    if (token.length() > 5)
-                    {
-                        hexnum = token.substring(1,5);
-                        token = token.substring(5);
-                    }
-                    else
-                    {
-                        hexnum = token.substring(1);
-                        token = "";
-                    }
-                    sb.append((char)Integer.parseInt(hexnum, 16));
-                }
-            }
-            sb.append(token);
-        }
-        return sb.toString();
-    }
-
-    public static void randomTest(int nTest)
-      throws Exception
-    {
-        Random random = new Random();
-
-        for(int n=0; n < nTest; n++)
-        {
-            int iLen = (int) (20 * random.nextFloat());
-            StringBuffer sb = new StringBuffer(iLen);
-
-            for(int i = 0; i < iLen; i++)
-            {
-                sb.append((char) (0xFFFF * random.nextFloat()));
-            }
-
-            test(sb.toString());
-        }
-    }
-
-    @SuppressWarnings("deprecation")
-    public static void fileTest(String name)
-        throws Exception
-    {
-        DataInputStream dis = new DataInputStream(new FileInputStream(name));
-
-        int iLine = 0;
-
-        while(dis.available() != 0)
-        {
-            String line = dis.readLine();
-            Debug.out("Line "+ iLine++ +" "+line);
-            test(parse(line), false ); //false);// initially no debug info
-        }
-    }
-
-    public static void displayFile(String name)
-            throws IOException
-    {
-        DataInputStream dis = new DataInputStream(new FileInputStream(name));
-
-        byte bytes[] = new byte[dis.available()];
-        dis.read(bytes);
-        display(bytes);
-    }
-
-    public static void decodeTest(String name)
-           throws IOException
-    {
-        DataInputStream dis = new DataInputStream(new FileInputStream(name));
-
-        byte bytes[] = new byte[dis.available()];
-        dis.read(bytes);
-
-        Expand expand = new Expand();
-
-        char [] chars = null;
-        try
-        {
-            String text = expand.expand(bytes);
-            chars = text.toCharArray();
-        }
-        catch (Exception e)
-        {
-            System.out.println(e);
-        }
-        int inlength = expand.bytesRead();
-        int iDot = name.lastIndexOf('.');
-        StringBuffer sb = new StringBuffer(name);
-        sb.setLength(iDot + 1);
-        sb.append("txt");
-        String outName = sb.toString();
-
-        int outlength = expand.charsWritten();
-
-        Debug.out("Expanded "+name+": "+inlength+" bytes to "+outName+" " +outlength+" chars." + " Ratio: "+(outlength == 0 ? 0 :(outlength * 200 / inlength))+"%.");
-
-        if (chars == null)
-            return;
-
-        writeUnicodeFile(outName, chars);
-    }
-
-    /** most of the next 3 functions should not be needed by JDK11 and later */
-    private static int iMSB = 1;
-
-    public static String readUnicodeFile(String name)
-    {
-        try
-        {
-            FileInputStream dis = new FileInputStream(name);
-
-            byte b[] = new byte[2];
-            StringBuffer sb = new StringBuffer();
-            char ch = 0;
-
-            iMSB = 1;
-            int i = 0;
-            for(i = 0; (dis.available() != 0); i++)
-            {
-                b[i%2] = (byte) dis.read();
-
-                if ((i & 1) == 1)
-                {
-                    ch = Expand.charFromTwoBytes(b[(i + iMSB)%2], b[(i + iMSB + 1) % 2]);
-                }
-                else
-                {
-                    continue;
-                }
-                if (i == 1 && ch == '\uFEFF')
-                    continue; // throw away byte order mark
-
-                if (i == 1 && ch == '\uFFFE')
-                {
-                    iMSB ++;  // flip byte order
-                    continue; // throw away byte order mark
-                }
-                sb.append(ch);
-             }
-
-            return sb.toString();
-        }
-        catch (IOException e)
-        {
-            System.err.println(e);
-            return "";
-        }
-    }
-
-    public static void writeUnicodeFile(String outName, char [] chars)
-            throws IOException
-    {
-        DataOutputStream dos = new DataOutputStream(new FileOutputStream(outName));
-        if ((iMSB & 1) == 1)
-        {
-            dos.writeByte(0xFF);
-            dos.writeByte(0xFE);
-        }
-        else
-        {
-            dos.writeByte(0xFE);
-            dos.writeByte(0xFF);
-        }
-        byte b[] = new byte[2];
-        for (int ich = 0; ich < chars.length; ich++)
-        {
-            b[(iMSB + 0)%2] = (byte) (chars[ich] >>> 8);
-            b[(iMSB + 1)%2] = (byte) (chars[ich] & 0xFF);
-            dos.write(b, 0, 2);
-        }
-    }
-
-    static void byteswap(String name)
-        throws IOException
-    {
-        String text = readUnicodeFile(name);
-        char chars[] = text.toCharArray();
-        writeUnicodeFile(name, chars);
-    }
-
-    @SuppressWarnings("deprecation")
-    public static void parseFile(String name)
-        throws IOException
-    {
-        DataInputStream dis = new DataInputStream(new FileInputStream(name));
-
-        byte bytes[] = new byte[dis.available()];
-        dis.read(bytes);
-
-        // simplistic test
-        int bom = (char) bytes[0] + (char) bytes[1];
-        if (bom == 131069)
-        {
-            // FEFF or FFFE detected (either one sums to 131069)
-            Debug.out(name + " is already in Unicode!");
-            return;
-        }
-
-        // definitely assumes an ASCII file at this point
-        String text = new String(bytes, 0);
-
-        char chars[] = parse(text).toCharArray();
-        writeUnicodeFile(name, chars);
-        return;
-    }
-
-    public static void encodeTest(String name)
-        throws Exception
-    {
-        String text = readUnicodeFile(name);
-
-        // Create an instance of the compressor
-        Compress compressor = new Compress();
-
-        byte [] bytes = null;
-
-        // perform compression
-        bytes = compressor.compress(text);
-
-        int inlength = compressor.charsRead();
-        int iDot = name.lastIndexOf('.');
-        StringBuffer sb = new StringBuffer(name);
-        sb.setLength(iDot + 1);
-        sb.append("csu");
-        String outName = sb.toString();
-
-        DataOutputStream dos = new DataOutputStream(new FileOutputStream(outName));
-        dos.write(bytes, 0, bytes.length);
-
-        int outlength = compressor.bytesWritten();
-
-        Debug.out("Compressed "+name+": "+inlength+" chars to "+outName+" " +outlength+" bytes." + " Ratio: "+(outlength == 0 ? 0 :(outlength * 50 / inlength))+"%.");
-    }
-
-    public static void roundtripTest(String name)
-      throws Exception
-    {
-      test(readUnicodeFile(name), false);// no debug info
-    }
-
-    /** The Main function */
-    public static void main(String args[])
-      throws Exception
-    {
-        int iArg = args.length;
-
-        try
-        {
-            if (iArg != 0)
-            {
-                if (args[0].equalsIgnoreCase("/compress"))
-                {
-                    while (--iArg > 0)
-                    {
-                        encodeTest(args[args.length - iArg]);
-                    }
-                }
-                else if (args[0].equalsIgnoreCase("/parse"))
-                {
-                    while (--iArg > 0)
-                    {
-                        parseFile(args[args.length - iArg]);
-                    }
-                }
-                else if (args[0].equalsIgnoreCase("/expand"))
-                {
-                    while (--iArg > 0)
-                    {
-                        decodeTest(args[args.length - iArg]);
-                    }
-                }
-                else if (args[0].equalsIgnoreCase("/display"))
-                {
-                    while (--iArg > 0)
-                    {
-                        displayFile(args[args.length - iArg]);
-                    }
-                }
-                else if (args[0].equalsIgnoreCase("/roundtrip"))
-                {
-                    while (--iArg > 0)
-                    {
-                        roundtripTest(args[args.length - iArg]);
-                    }
-                }
-                else if (args[0].equalsIgnoreCase("/byteswap"))
-                {
-                    while (--iArg > 0)
-                    {
-                        byteswap(args[args.length - iArg]);
-                    }
-                }else if (args[0].equalsIgnoreCase("/random"))
-                {
-                    randomTest(8);
-                }
-                else if (args[0].equalsIgnoreCase("/suite"))
-                {
-                    if (iArg == 1)
-                    {
-                        suiteTest();
-                    }
-                    else
-                    {
-                        while (--iArg > 0)
-                        {
-                            fileTest(args[args.length - iArg]);
-                        }
-                    }
-                }
-                       else if (args[0].equalsIgnoreCase("/?"))
-                       {
-                               usage();
-                       }
-                else
-                {
-                    while (iArg > 0)
-                    {
-                        test2(parse(args[--iArg]));
-                    }
-                }
-            }
-            else
-            {
-                usage();
-            }
-        }
-        catch (IOException e)
-        {
-            System.err.println(e);
-        }
-        try
-        {
-            System.err.println("Done. Press enter to exit");
-            System.in.read();
-        }
-        catch (IOException e)
-        {
-
-        }
-    }
-
-    static void suiteTest()
-      throws Exception
-    {
-        Debug.out("Standard Compression test suite:");
-        test("Hello \u9292 \u9192 World!");
-        test("Hell\u0429o \u9292 \u9192 W\u00e4rld!");
-        test("Hell\u0429o \u9292 \u9292W\u00e4rld!");
-
-        test("\u0648\u06c8"); // catch missing reset
-        test("\u0648\u06c8");
-
-        test("\u4444\uE001"); // lowest quotable
-        test("\u4444\uf2FF"); // highest quotable
-        test("\u4444\uf188\u4444");
-        test("\u4444\uf188\uf288");
-        test("\u4444\uf188abc\0429\uf288");
-        test("\u9292\u2222");
-        test("Hell\u0429\u04230o \u9292 \u9292W\u00e4\u0192rld!");
-        test("Hell\u0429o \u9292 \u9292W\u00e4rld!");
-        test("Hello World!123456");
-        test("Hello W\u0081\u011f\u0082!"); // Latin 1 run
-
-        test("abc\u0301\u0302");  // uses SQn for u301 u302
-        test("abc\u4411d");      // uses SQU
-        test("abc\u4411\u4412d");// uses SCU
-        test("abc\u0401\u0402\u047f\u00a5\u0405"); // uses SQn for ua5
-        test("\u9191\u9191\u3041\u9191\u3041\u3041\u3000"); // SJIS like data
-        test("\u9292\u2222");
-        test("\u9191\u9191\u3041\u9191\u3041\u3041\u3000");
-        test("\u9999\u3051\u300c\u9999\u9999\u3060\u9999\u3065\u3065\u3065\u300c");
-        test("\u3000\u266a\u30ea\u30f3\u30b4\u53ef\u611b\u3044\u3084\u53ef\u611b\u3044\u3084\u30ea\u30f3\u30b4\u3002");
-
-        test(""); // empty input
-        test("\u0000"); // smallest BMP character
-        test("\uFFFF"); // largest BMP character
-
-        test("\ud800\udc00"); // smallest surrogate
-        test("\ud8ff\udcff"); // largest surrogate pair
-
-
-        Debug.out("\nTHESE TESTS ARE SUPPOSED TO FAIL:");
-        test("\ud800 \udc00", true); // unpaired surrogate (1)
-        test("\udc00", true); // unpaired surrogate (2)
-        test("\ud800", true); // unpaired surrogate (3)
-   }
-}
diff --git a/src/test/java/com/healthmarketscience/jackcess/impl/scsu/CompressTest.java b/src/test/java/com/healthmarketscience/jackcess/impl/scsu/CompressTest.java

deleted file mode 100644 (file)

index b9dc13a..0000000
--- a/src/test/java/com/healthmarketscience/jackcess/impl/scsu/CompressTest.java
+++ /dev/null
@@ -1,47 +0,0 @@
-/*
-Copyright (c) 2007 Health Market Science, Inc.
-
-This library is free software; you can redistribute it and/or
-modify it under the terms of the GNU Lesser General Public
-License as published by the Free Software Foundation; either
-version 2.1 of the License, or (at your option) any later version.
-
-This library is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-Lesser General Public License for more details.
-
-You should have received a copy of the GNU Lesser General Public
-License along with this library; if not, write to the Free Software
-Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307
-USA
-
-You can contact Health Market Science at info@healthmarketscience.com
-or at the following address:
-
-Health Market Science
-2700 Horizon Drive
-Suite 200
-King of Prussia, PA 19406
-*/
-
-package com.healthmarketscience.jackcess.impl.scsu;
-
-import junit.framework.TestCase;
-
-/**
- * @author James Ahlborn
- */
-public class CompressTest extends TestCase
-{
-
-  public CompressTest(String name) throws Exception {
-    super(name);
-  }
-
-  public void testCompression() throws Exception
-  {
-    CompressMain.suiteTest();
-  }
-
-}
author	James Ahlborn <jtahlborn@yahoo.com>
	Sat, 15 Nov 2014 04:06:17 +0000 (04:06 +0000)
committer	James Ahlborn <jtahlborn@yahoo.com>
	Sat, 15 Nov 2014 04:06:17 +0000 (04:06 +0000)
pom.xml		patch \| blob \| history
src/changes/changes.xml		patch \| blob \| history
src/main/java/com/healthmarketscience/jackcess/impl/ColumnImpl.java		patch \| blob \| history
src/main/java/com/healthmarketscience/jackcess/impl/scsu/Compress.java	[deleted file]	patch \| blob \| history
src/main/java/com/healthmarketscience/jackcess/impl/scsu/Debug.java	[deleted file]	patch \| blob \| history
src/main/java/com/healthmarketscience/jackcess/impl/scsu/EndOfInputException.java	[deleted file]	patch \| blob \| history
src/main/java/com/healthmarketscience/jackcess/impl/scsu/EndOfOutputException.java	[deleted file]	patch \| blob \| history
src/main/java/com/healthmarketscience/jackcess/impl/scsu/Expand.java	[deleted file]	patch \| blob \| history
src/main/java/com/healthmarketscience/jackcess/impl/scsu/IllegalInputException.java	[deleted file]	patch \| blob \| history
src/main/java/com/healthmarketscience/jackcess/impl/scsu/SCSU.java	[deleted file]	patch \| blob \| history
src/test/data/V2003/testUnicodeCompV2003.mdb	[new file with mode: 0644]	patch \| blob
src/test/java/com/healthmarketscience/jackcess/DatabaseTest.java		patch \| blob \| history
src/test/java/com/healthmarketscience/jackcess/impl/scsu/CompressMain.java	[deleted file]	patch \| blob \| history
src/test/java/com/healthmarketscience/jackcess/impl/scsu/CompressTest.java	[deleted file]	patch \| blob \| history