summary refs log tree commit diff stats
path: root/src/java
diff options
context:
space:
mode:
Diffstat (limited to 'src/java')
-rw-r--r-- src/java/com/healthmarketscience/jackcess/Index.java 301
-rw-r--r-- src/java/com/healthmarketscience/jackcess/IndexCodes.java 710
2 files changed, 560 insertions, 451 deletions
diff --git a/src/java/com/healthmarketscience/jackcess/Index.java b/src/java/com/healthmarketscience/jackcess/Index.java
index 2da1fc2..3bf9c8d 100644
--- a/src/java/com/healthmarketscience/jackcess/Index.java
+++ b/src/java/com/healthmarketscience/jackcess/Index.java
@@ -35,8 +35,6 @@ import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
-import java.util.Iterator;
-import java.util.LinkedList;
import java.util.List;
import java.util.Map;
@@ -1045,110 +1043,96 @@ public abstract class Index implements Comparable<Index> {
}
// now, convert each character to a "code" of one or more bytes
- List<ExtraCodes> unprintableCodes = null;
- List<ExtraCodes> internationalCodes = null;
+ ExtraCodesOutputStream extraCodes = null;
+ ExtraCodesOutputStream unprintableCodes = null;
+ ExtraCodesOutputStream crazyCodes = null;
int charOffset = 0;
for(int i = 0; i < str.length(); ++i) {
+
char c = str.charAt(i);
- Character cKey = c;
+ CharHandler ch = getCharHandler(c);
- byte[] bytes = CODES.get(cKey);
+ int curCharOffset = charOffset;
+ byte[] bytes = ch.getInlineBytes();
if(bytes != null) {
- // simple case, write the codes we found
+ // write the "inline" codes immediately
tmpBout.write(bytes);
+
+ // only increment the charOffset for chars with inline codes
++charOffset;
- continue;
}
- bytes = UNPRINTABLE_CODES.get(cKey);
- if(bytes != null) {
- // we do not write anything to tmpBout
- if(bytes.length > 0) {
- if(unprintableCodes == null) {
- unprintableCodes = new LinkedList<ExtraCodes>();
- }
-
- // keep track of the extra codes for later
- unprintableCodes.add(new ExtraCodes(charOffset, bytes));
- }
-
- // note, we do _not_ increment the charOffset for unprintable chars
+ if(ch.getType() == Type.SIMPLE) {
+ // common case, skip further code handling
continue;
}
- InternationalCodes inatCodes = INTERNATIONAL_CODES.get(cKey);
- if(inatCodes != null) {
-
- // we write the "inline" portion of the international codes
- // immediately, and queue the extra codes for later
- tmpBout.write(inatCodes._inlineCodes);
-
- if(internationalCodes == null) {
- internationalCodes = new LinkedList<ExtraCodes>();
+ bytes = ch.getExtraBytes();
+ byte extraCodeModifier = ch.getExtraByteModifier();
+ if((bytes != null) || (extraCodeModifier != 0)) {
+ if(extraCodes == null) {
+ extraCodes = new ExtraCodesOutputStream(str.length());
}
// keep track of the extra codes for later
- internationalCodes.add(new ExtraCodes(charOffset,
- inatCodes._extraCodes));
+ writeExtraCodes(curCharOffset, bytes, extraCodeModifier, extraCodes);
+ }
- ++charOffset;
- continue;
+ bytes = ch.getUnprintableBytes();
+ if(bytes != null) {
+ if(unprintableCodes == null) {
+ unprintableCodes = new ExtraCodesOutputStream();
+ }
+
+ // keep track of the unprintable codes for later
+ writeUnprintableCodes(curCharOffset, bytes, unprintableCodes);
}
+
+ byte crazyFlag = ch.getCrazyFlag();
+ if(crazyFlag != 0) {
+ if(crazyCodes == null) {
+ crazyCodes = new ExtraCodesOutputStream();
+ }
- // bummer, out of luck
- throw new IOException("unmapped string index value " + c);
+ // keep track of the crazy flags for later
+ crazyCodes.write(crazyFlag);
+ }
}
// write end text flag
tmpBout.write(END_TEXT);
- boolean hasExtraText = ((unprintableCodes != null) ||
- (internationalCodes != null));
- if(hasExtraText) {
+ boolean hasExtraCodes = trimExtraCodes(
+ extraCodes, (byte)0, INTERNATIONAL_EXTRA_PLACEHOLDER);
+ boolean hasUnprintableCodes = (unprintableCodes != null);
+ boolean hasCrazyCodes = (crazyCodes != null);
+ if(hasExtraCodes || hasUnprintableCodes || hasCrazyCodes) {
// we write all the international extra bytes first
- if(internationalCodes != null) {
-
- // we write a placeholder char for each non-international char before
- // the extra chars for the international char
- charOffset = 0;
- Iterator<ExtraCodes> iter = internationalCodes.iterator();
- while(iter.hasNext()) {
- ExtraCodes extraCodes = iter.next();
- while(charOffset < extraCodes._charOffset) {
- tmpBout.write(INTERNATIONAL_EXTRA_PLACEHOLDER);
- ++charOffset;
- }
- tmpBout.write(extraCodes._extraCodes);
- ++charOffset;
- }
+ if(hasExtraCodes) {
+ extraCodes.writeTo(tmpBout);
}
- // then we write all the unprintable extra bytes
- if(unprintableCodes != null) {
+ if(hasCrazyCodes || hasUnprintableCodes) {
+
+ // write 2 more end flags
+ tmpBout.write(END_TEXT);
+ tmpBout.write(END_TEXT);
- // write a single prefix for all unprintable chars
- tmpBout.write(UNPRINTABLE_COMMON_PREFIX);
+ // next come the crazy flags
+ if(hasCrazyCodes) {
+ writeCrazyCodes(crazyCodes, tmpBout);
+ }
+
+ // then we write all the unprintable extra bytes
+ if(hasUnprintableCodes) {
+
+ // write another end flag
+ tmpBout.write(END_TEXT);
- // we write a whacky combo of bytes for each unprintable char which
- // includes a funky offset and extra char itself
- Iterator<ExtraCodes> iter = unprintableCodes.iterator();
- while(iter.hasNext()) {
- ExtraCodes extraCodes = iter.next();
- int offset =
- (UNPRINTABLE_COUNT_START +
- (UNPRINTABLE_COUNT_MULTIPLIER * extraCodes._charOffset))
- | UNPRINTABLE_OFFSET_FLAGS;
-
- // write offset as big-endian short
- tmpBout.write((offset >> 8) & 0xFF);
- tmpBout.write(offset & 0xFF);
-
- tmpBout.write(UNPRINTABLE_MIDFIX);
- tmpBout.write(extraCodes._extraCodes);
+ unprintableCodes.writeTo(tmpBout);
}
}
-
}
// handle descending order by inverting the bytes
@@ -1168,6 +1152,111 @@ public abstract class Index implements Comparable<Index> {
bout.write(END_EXTRA_TEXT);
}
+ private static void writeExtraCodes(
+ int charOffset, byte[] bytes, byte extraCodeModifier,
+ ExtraCodesOutputStream extraCodes)
+ throws IOException
+ {
+ // we fill in a placeholder value for any chars w/out extra codes
+ int numChars = extraCodes.getNumChars();
+ if(numChars < charOffset) {
+ int fillChars = charOffset - numChars;
+ extraCodes.write(fillChars, INTERNATIONAL_EXTRA_PLACEHOLDER);
+ extraCodes.incrementNumChars(fillChars);
+ }
+
+ if(bytes != null) {
+
+ // write the actual extra codes and update the number of chars
+ extraCodes.write(bytes);
+ extraCodes.incrementNumChars(1);
+
+ } else {
+
+ // the extra code modifier is added to the last extra code written. if
+ // there is no previous extra code, it is made the first extra code.
+ int lastIdx = extraCodes.size() - 1;
+ if(lastIdx >= 0) {
+ byte lastByte = extraCodes.get(lastIdx);
+ lastByte += extraCodeModifier;
+ extraCodes.set(lastIdx, lastByte);
+ } else {
+ extraCodes.write(extraCodeModifier);
+ }
+ }
+ }
+
+ private static boolean trimExtraCodes(ExtraCodesOutputStream extraCodes,
+ byte minTrimCode, byte maxTrimCode)
+ throws IOException
+ {
+ if(extraCodes == null) {
+ return false;
+ }
+
+ extraCodes.trimTrailingBytes(minTrimCode, maxTrimCode);
+
+ // anything left?
+ return (extraCodes.size() > 0);
+ }
+
+ private static void writeUnprintableCodes(
+ int charOffset, byte[] bytes, ExtraCodesOutputStream extraCodes)
+ throws IOException
+ {
+ // we write a whacky combo of bytes for each unprintable char which
+ // includes a funky offset and extra char itself
+ int offset =
+ (UNPRINTABLE_COUNT_START +
+ (UNPRINTABLE_COUNT_MULTIPLIER * charOffset))
+ | UNPRINTABLE_OFFSET_FLAGS;
+
+ // write offset as big-endian short
+ extraCodes.write((offset >> 8) & 0xFF);
+ extraCodes.write(offset & 0xFF);
+
+ extraCodes.write(UNPRINTABLE_MIDFIX);
+ extraCodes.write(bytes);
+ }
+
+ private static void writeCrazyCodes(ExtraCodesOutputStream crazyCodes,
+ ByteArrayOutputStream tmpBout)
+ throws IOException
+ {
+ // CRAZY_CODE_2 flags at the end are ignored, so ditch them
+ trimExtraCodes(crazyCodes, CRAZY_CODE_2, CRAZY_CODE_2);
+
+ if(crazyCodes.size() > 0) {
+
+ // the crazy codes get encoded into 6 bit sequences where each code is 2
+ // bits (where the first 2 bits in the byte are a common prefix).
+ byte curByte = CRAZY_CODE_START;
+ int idx = 0;
+ for(int i = 0; i < crazyCodes.size(); ++i) {
+ byte nextByte = crazyCodes.get(i);
+ nextByte <<= ((2 - idx) * 2);
+ curByte |= nextByte;
+
+ ++idx;
+ if(idx == 3) {
+ // write current byte and reset
+ tmpBout.write(curByte);
+ curByte = CRAZY_CODE_START;
+ idx = 0;
+ }
+ }
+
+ // write last byte
+ if(idx > 0) {
+ tmpBout.write(curByte);
+ }
+ }
+
+ // write crazy code suffix (note, we write this even if all the codes are
+ // trimmed)
+ tmpBout.write(CRAZY_CODES_SUFFIX);
+ }
+
/**
* Creates one of the special index entries.
*/
@@ -2289,17 +2378,63 @@ public abstract class Index implements Comparable<Index> {
entries);
}
}
-
+
+
/**
- * Value struct used during Text index entry encoding.
+ * Extension of ByteArrayOutputStream which allows more complex access to
+ * the underlying bytes and also keeps track of an additional char count.
*/
- private static final class ExtraCodes {
- public final int _charOffset;
- public final byte[] _extraCodes;
+ private static final class ExtraCodesOutputStream
+ extends ByteArrayOutputStream
+ {
+ private int numChars;
+
+ private ExtraCodesOutputStream() {
+ super();
+ }
+
+ private ExtraCodesOutputStream(int length) {
+ super(length);
+ }
+
+ public int getNumChars() {
+ return numChars;
+ }
+
+ public byte get(int offset) {
+ return buf[offset];
+ }
+
+ public void set(int offset, byte b) {
+ buf[offset] = b;
+ }
+
+ public void write(int numBytes, byte b) {
+ for(int i = 0; i < numBytes; ++i) {
+ write(b);
+ }
+ }
+
+ public void incrementNumChars(int inc) {
+ numChars += inc;
+ }
+
+ public void trimTrailingBytes(byte minTrimCode, byte maxTrimCode)
+ {
+ int minTrim = ByteUtil.asUnsignedByte(minTrimCode);
+ int maxTrim = ByteUtil.asUnsignedByte(maxTrimCode);
+
+ int idx = count - 1;
+ while(idx >= 0) {
+ int val = ByteUtil.asUnsignedByte(get(idx));
+ if((val >= minTrim) && (val <= maxTrim)) {
+ --idx;
+ } else {
+ break;
+ }
+ }
- private ExtraCodes(int charOffset, byte[] extraCodes) {
- _charOffset = charOffset;
- _extraCodes = extraCodes;
+ count = idx + 1;
}
}
diff --git a/src/java/com/healthmarketscience/jackcess/IndexCodes.java b/src/java/com/healthmarketscience/jackcess/IndexCodes.java
index 88aa37c..3ae736e 100644
--- a/src/java/com/healthmarketscience/jackcess/IndexCodes.java
+++ b/src/java/com/healthmarketscience/jackcess/IndexCodes.java
@@ -27,6 +27,10 @@ King of Prussia, PA 19406
package com.healthmarketscience.jackcess;
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;
@@ -54,7 +58,7 @@ public class IndexCodes {
static final byte DESC_BOOLEAN_TRUE = ASC_BOOLEAN_FALSE;
static final byte DESC_BOOLEAN_FALSE = ASC_BOOLEAN_TRUE;
-
+
// unprintable char is removed from normal text.
// pattern for unprintable chars in the extra bytes:
@@ -63,378 +67,358 @@ public class IndexCodes {
// <code> = char code
static final int UNPRINTABLE_COUNT_START = 7;
static final int UNPRINTABLE_COUNT_MULTIPLIER = 4;
- static final byte[] UNPRINTABLE_COMMON_PREFIX =
- new byte[]{(byte)0x01, (byte)0x01, (byte)0x01};
static final int UNPRINTABLE_OFFSET_FLAGS = 0x8000;
static final byte UNPRINTABLE_MIDFIX = (byte)0x06;
// international char is replaced with ascii char.
// pattern for international chars in the extra bytes:
// [ 02 (for each normal char) ] [ <symbol_code> (for each inat char) ]
- static final byte INTERNATIONAL_EXTRA_PLACEHOLDER = (byte)0x02;
-
- /**
- * Map of character to byte[] that Access uses in indexes (not ASCII)
- * (Character -> byte[]) as codes to order text
- */
- static final Map<Character, byte[]> CODES =
- new HashMap<Character, byte[]>(150);
-
- /**
- * Map of character to byte[] that Access uses in indexes for unprintable
- * characters (not ASCII) (Character -> byte[]), in the extended portion
- */
- static final Map<Character, byte[]> UNPRINTABLE_CODES =
- new HashMap<Character, byte[]>(100);
-
- /**
- * Map of character to byte[] that Access uses in indexes for international
- * characters (not ASCII) (Character -> InternationalCodes), in the extended
- * portion
- */
- static final Map<Character, InternationalCodes> INTERNATIONAL_CODES =
- new HashMap<Character, InternationalCodes>(70);
-
- static {
-
- registerCodes('\u0000', new byte[]{});
- registerCodes('\t', new byte[]{(byte)0x08, (byte)0x03});
- registerCodes('\n', new byte[]{(byte)0x08, (byte)0x04});
- registerCodes('\u000B', new byte[]{(byte)0x08, (byte)0x05});
- registerCodes('\f', new byte[]{(byte)0x08, (byte)0x06});
- registerCodes('\r', new byte[]{(byte)0x08, (byte)0x07});
- registerCodes('\u0020', new byte[]{(byte)0x07});
- registerCodes('\u0021', new byte[]{(byte)0x09});
- registerCodes('\"', new byte[]{(byte)0x0A});
- registerCodes('\u0023', new byte[]{(byte)0x0C});
- registerCodes('\u0024', new byte[]{(byte)0x0E});
- registerCodes('\u0025', new byte[]{(byte)0x10});
- registerCodes('\u0026', new byte[]{(byte)0x12});
- registerCodes('\u0028', new byte[]{(byte)0x14});
- registerCodes('\u0029', new byte[]{(byte)0x16});
- registerCodes('\u002A', new byte[]{(byte)0x18});
- registerCodes('\u002B', new byte[]{(byte)0x2C});
- registerCodes('\u002C', new byte[]{(byte)0x1A});
- registerCodes('\u002E', new byte[]{(byte)0x1C});
- registerCodes('\u002F', new byte[]{(byte)0x1E});
- registerCodes('\u0030', new byte[]{(byte)0x36});
- registerCodes('\u0031', new byte[]{(byte)0x38});
- registerCodes('\u0032', new byte[]{(byte)0x3A});
- registerCodes('\u0033', new byte[]{(byte)0x3C});
- registerCodes('\u0034', new byte[]{(byte)0x3E});
- registerCodes('\u0035', new byte[]{(byte)0x40});
- registerCodes('\u0036', new byte[]{(byte)0x42});
- registerCodes('\u0037', new byte[]{(byte)0x44});
- registerCodes('\u0038', new byte[]{(byte)0x46});
- registerCodes('\u0039', new byte[]{(byte)0x48});
- registerCodes('\u003A', new byte[]{(byte)0x20});
- registerCodes('\u003B', new byte[]{(byte)0x22});
- registerCodes('\u003C', new byte[]{(byte)0x2E});
- registerCodes('\u003D', new byte[]{(byte)0x30});
- registerCodes('\u003E', new byte[]{(byte)0x32});
- registerCodes('\u003F', new byte[]{(byte)0x24});
- registerCodes('\u0040', new byte[]{(byte)0x26});
- registerCodes('\u0041', new byte[]{(byte)0x4A});
- registerCodes('\u0042', new byte[]{(byte)0x4C});
- registerCodes('\u0043', new byte[]{(byte)0x4D});
- registerCodes('\u0044', new byte[]{(byte)0x4F});
- registerCodes('\u0045', new byte[]{(byte)0x51});
- registerCodes('\u0046', new byte[]{(byte)0x53});
- registerCodes('\u0047', new byte[]{(byte)0x55});
- registerCodes('\u0048', new byte[]{(byte)0x57});
- registerCodes('\u0049', new byte[]{(byte)0x59});
- registerCodes('\u004A', new byte[]{(byte)0x5B});
- registerCodes('\u004B', new byte[]{(byte)0x5C});
- registerCodes('\u004C', new byte[]{(byte)0x5E});
- registerCodes('\u004D', new byte[]{(byte)0x60});
- registerCodes('\u004E', new byte[]{(byte)0x62});
- registerCodes('\u004F', new byte[]{(byte)0x64});
- registerCodes('\u0050', new byte[]{(byte)0x66});
- registerCodes('\u0051', new byte[]{(byte)0x68});
- registerCodes('\u0052', new byte[]{(byte)0x69});
- registerCodes('\u0053', new byte[]{(byte)0x6B});
- registerCodes('\u0054', new byte[]{(byte)0x6D});
- registerCodes('\u0055', new byte[]{(byte)0x6F});
- registerCodes('\u0056', new byte[]{(byte)0x71});
- registerCodes('\u0057', new byte[]{(byte)0x73});
- registerCodes('\u0058', new byte[]{(byte)0x75});
- registerCodes('\u0059', new byte[]{(byte)0x76});
- registerCodes('\u005A', new byte[]{(byte)0x78});
- registerCodes('\u005B', new byte[]{(byte)0x27});
- registerCodes('\\', new byte[]{(byte)0x29});
- registerCodes('\u005D', new byte[]{(byte)0x2A});
- registerCodes('\u005E', new byte[]{(byte)0x2B, (byte)0x02});
- registerCodes('\u005F', new byte[]{(byte)0x2B, (byte)0x03});
- registerCodes('\u0060', new byte[]{(byte)0x2B, (byte)0x07});
- registerCodes('\u0061', new byte[]{(byte)0x4A});
- registerCodes('\u0062', new byte[]{(byte)0x4C});
- registerCodes('\u0063', new byte[]{(byte)0x4D});
- registerCodes('\u0064', new byte[]{(byte)0x4F});
- registerCodes('\u0065', new byte[]{(byte)0x51});
- registerCodes('\u0066', new byte[]{(byte)0x53});
- registerCodes('\u0067', new byte[]{(byte)0x55});
- registerCodes('\u0068', new byte[]{(byte)0x57});
- registerCodes('\u0069', new byte[]{(byte)0x59});
- registerCodes('\u006A', new byte[]{(byte)0x5B});
- registerCodes('\u006B', new byte[]{(byte)0x5C});
- registerCodes('\u006C', new byte[]{(byte)0x5E});
- registerCodes('\u006D', new byte[]{(byte)0x60});
- registerCodes('\u006E', new byte[]{(byte)0x62});
- registerCodes('\u006F', new byte[]{(byte)0x64});
- registerCodes('\u0070', new byte[]{(byte)0x66});
- registerCodes('\u0071', new byte[]{(byte)0x68});
- registerCodes('\u0072', new byte[]{(byte)0x69});
- registerCodes('\u0073', new byte[]{(byte)0x6B});
- registerCodes('\u0074', new byte[]{(byte)0x6D});
- registerCodes('\u0075', new byte[]{(byte)0x6F});
- registerCodes('\u0076', new byte[]{(byte)0x71});
- registerCodes('\u0077', new byte[]{(byte)0x73});
- registerCodes('\u0078', new byte[]{(byte)0x75});
- registerCodes('\u0079', new byte[]{(byte)0x76});
- registerCodes('\u007A', new byte[]{(byte)0x78});
- registerCodes('\u007B', new byte[]{(byte)0x2B, (byte)0x09});
- registerCodes('\u007C', new byte[]{(byte)0x2B, (byte)0x0B});
- registerCodes('\u007D', new byte[]{(byte)0x2B, (byte)0x0D});
- registerCodes('\u007E', new byte[]{(byte)0x2B, (byte)0x0F});
- registerCodes('\u00A0', new byte[]{(byte)0x08, (byte)0x02});
- registerCodes('\u00A1', new byte[]{(byte)0x2B, (byte)0x10});
- registerCodes('\u00A2', new byte[]{(byte)0x34, (byte)0xA6});
- registerCodes('\u00A3', new byte[]{(byte)0x34, (byte)0xA7});
- registerCodes('\u00A4', new byte[]{(byte)0x34, (byte)0xA8});
- registerCodes('\u00A5', new byte[]{(byte)0x34, (byte)0xA9});
- registerCodes('\u00A6', new byte[]{(byte)0x2B, (byte)0x11});
- registerCodes('\u00A7', new byte[]{(byte)0x34, (byte)0xAA});
- registerCodes('\u00A8', new byte[]{(byte)0x2B, (byte)0x12});
- registerCodes('\u00A9', new byte[]{(byte)0x34, (byte)0xAB});
- registerCodes('\u00AB', new byte[]{(byte)0x33, (byte)0x05});
- registerCodes('\u00AC', new byte[]{(byte)0x34, (byte)0xAC});
- registerCodes('\u00AE', new byte[]{(byte)0x34, (byte)0xAD});
- registerCodes('\u00AF', new byte[]{(byte)0x2B, (byte)0x13});
- registerCodes('\u00B0', new byte[]{(byte)0x34, (byte)0xAE});
- registerCodes('\u00B1', new byte[]{(byte)0x33, (byte)0x04});
- registerCodes('\u00B2', new byte[]{(byte)0x3A});
- registerCodes('\u00B3', new byte[]{(byte)0x3C});
- registerCodes('\u00B4', new byte[]{(byte)0x2B, (byte)0x14});
- registerCodes('\u00B5', new byte[]{(byte)0x34, (byte)0xAF});
- registerCodes('\u00B6', new byte[]{(byte)0x34, (byte)0xB0});
- registerCodes('\u00B7', new byte[]{(byte)0x34, (byte)0xB1});
- registerCodes('\u00B8', new byte[]{(byte)0x2B, (byte)0x15});
- registerCodes('\u00B9', new byte[]{(byte)0x38});
- registerCodes('\u00BB', new byte[]{(byte)0x33, (byte)0x07});
- registerCodes('\u00BC', new byte[]{(byte)0x37, (byte)0x12});
- registerCodes('\u00BD', new byte[]{(byte)0x37, (byte)0x16});
- registerCodes('\u00BE', new byte[]{(byte)0x37, (byte)0x1A});
- registerCodes('\u00BF', new byte[]{(byte)0x2B, (byte)0x16});
- registerCodes('\u00C6', new byte[]{(byte)0x4A, (byte)0x51});
- registerCodes('\u00D7', new byte[]{(byte)0x33, (byte)0x09});
- registerCodes('\u00DE', new byte[]{(byte)0x6D, (byte)0x57});
- registerCodes('\u00DF', new byte[]{(byte)0x6B, (byte)0x6B});
- registerCodes('\u00E6', new byte[]{(byte)0x4A, (byte)0x51});
- registerCodes('\u00F7', new byte[]{(byte)0x33, (byte)0x0A});
- registerCodes('\u00FE', new byte[]{(byte)0x6D, (byte)0x57});
-
- registerUnprintableCodes('\u0001', new byte[]{(byte)0x03});
- registerUnprintableCodes('\u0002', new byte[]{(byte)0x04});
- registerUnprintableCodes('\u0003', new byte[]{(byte)0x05});
- registerUnprintableCodes('\u0004', new byte[]{(byte)0x06});
- registerUnprintableCodes('\u0005', new byte[]{(byte)0x07});
- registerUnprintableCodes('\u0006', new byte[]{(byte)0x08});
- registerUnprintableCodes('\u0007', new byte[]{(byte)0x09});
- registerUnprintableCodes('\b', new byte[]{(byte)0x0A});
- registerUnprintableCodes('\u000E', new byte[]{(byte)0x0B});
- registerUnprintableCodes('\u000F', new byte[]{(byte)0x0C});
- registerUnprintableCodes('\u0010', new byte[]{(byte)0x0D});
- registerUnprintableCodes('\u0011', new byte[]{(byte)0x0E});
- registerUnprintableCodes('\u0012', new byte[]{(byte)0x0F});
- registerUnprintableCodes('\u0013', new byte[]{(byte)0x10});
- registerUnprintableCodes('\u0014', new byte[]{(byte)0x11});
- registerUnprintableCodes('\u0015', new byte[]{(byte)0x12});
- registerUnprintableCodes('\u0016', new byte[]{(byte)0x13});
- registerUnprintableCodes('\u0017', new byte[]{(byte)0x14});
- registerUnprintableCodes('\u0018', new byte[]{(byte)0x15});
- registerUnprintableCodes('\u0019', new byte[]{(byte)0x16});
- registerUnprintableCodes('\u001A', new byte[]{(byte)0x17});
- registerUnprintableCodes('\u001B', new byte[]{(byte)0x18});
- registerUnprintableCodes('\u001C', new byte[]{(byte)0x19});
- registerUnprintableCodes('\u001D', new byte[]{(byte)0x1A});
- registerUnprintableCodes('\u001E', new byte[]{(byte)0x1B});
- registerUnprintableCodes('\u001F', new byte[]{(byte)0x1C});
- registerUnprintableCodes('\'', new byte[]{(byte)0x80});
- registerUnprintableCodes('\u002D', new byte[]{(byte)0x82});
- registerUnprintableCodes('\u007F', new byte[]{(byte)0x1D});
- registerUnprintableCodes('\u0080', new byte[]{(byte)0x1E});
- registerUnprintableCodes('\u0081', new byte[]{(byte)0x1F});
- registerUnprintableCodes('\u0082', new byte[]{(byte)0x20});
- registerUnprintableCodes('\u0083', new byte[]{(byte)0x21});
- registerUnprintableCodes('\u0084', new byte[]{(byte)0x22});
- registerUnprintableCodes('\u0085', new byte[]{(byte)0x23});
- registerUnprintableCodes('\u0086', new byte[]{(byte)0x24});
- registerUnprintableCodes('\u0087', new byte[]{(byte)0x25});
- registerUnprintableCodes('\u0088', new byte[]{(byte)0x26});
- registerUnprintableCodes('\u0089', new byte[]{(byte)0x27});
- registerUnprintableCodes('\u008A', new byte[]{(byte)0x28});
- registerUnprintableCodes('\u008B', new byte[]{(byte)0x29});
- registerUnprintableCodes('\u008C', new byte[]{(byte)0x2A});
- registerUnprintableCodes('\u008D', new byte[]{(byte)0x2B});
- registerUnprintableCodes('\u008E', new byte[]{(byte)0x2C});
- registerUnprintableCodes('\u008F', new byte[]{(byte)0x2D});
- registerUnprintableCodes('\u0090', new byte[]{(byte)0x2E});
- registerUnprintableCodes('\u0091', new byte[]{(byte)0x2F});
- registerUnprintableCodes('\u0092', new byte[]{(byte)0x30});
- registerUnprintableCodes('\u0093', new byte[]{(byte)0x31});
- registerUnprintableCodes('\u0094', new byte[]{(byte)0x32});
- registerUnprintableCodes('\u0095', new byte[]{(byte)0x33});
- registerUnprintableCodes('\u0096', new byte[]{(byte)0x34});
- registerUnprintableCodes('\u0097', new byte[]{(byte)0x35});
- registerUnprintableCodes('\u0098', new byte[]{(byte)0x36});
- registerUnprintableCodes('\u0099', new byte[]{(byte)0x37});
- registerUnprintableCodes('\u009A', new byte[]{(byte)0x38});
- registerUnprintableCodes('\u009B', new byte[]{(byte)0x39});
- registerUnprintableCodes('\u009C', new byte[]{(byte)0x3A});
- registerUnprintableCodes('\u009D', new byte[]{(byte)0x3B});
- registerUnprintableCodes('\u009E', new byte[]{(byte)0x3C});
- registerUnprintableCodes('\u009F', new byte[]{(byte)0x3D});
- registerUnprintableCodes('\u00AD', new byte[]{(byte)0x83});
-
- registerInternationalCodes('\u00AA', new byte[]{(byte)0x4A},
- new byte[]{(byte)0x03});
- registerInternationalCodes('\u00BA', new byte[]{(byte)0x64},
- new byte[]{(byte)0x03});
- registerInternationalCodes('\u00C0', new byte[]{(byte)0x4A},
- new byte[]{(byte)0x0F});
- registerInternationalCodes('\u00C1', new byte[]{(byte)0x4A},
- new byte[]{(byte)0x0E});
- registerInternationalCodes('\u00C2', new byte[]{(byte)0x4A},
- new byte[]{(byte)0x12});
- registerInternationalCodes('\u00C3', new byte[]{(byte)0x4A},
- new byte[]{(byte)0x19});
- registerInternationalCodes('\u00C4', new byte[]{(byte)0x4A},
- new byte[]{(byte)0x13});
- registerInternationalCodes('\u00C5', new byte[]{(byte)0x4A},
- new byte[]{(byte)0x1A});
- registerInternationalCodes('\u00C7', new byte[]{(byte)0x4D},
- new byte[]{(byte)0x1C});
- registerInternationalCodes('\u00C8', new byte[]{(byte)0x51},
- new byte[]{(byte)0x0F});
- registerInternationalCodes('\u00C9', new byte[]{(byte)0x51},
- new byte[]{(byte)0x0E});
- registerInternationalCodes('\u00CA', new byte[]{(byte)0x51},
- new byte[]{(byte)0x12});
- registerInternationalCodes('\u00CB', new byte[]{(byte)0x51},
- new byte[]{(byte)0x13});
- registerInternationalCodes('\u00CC', new byte[]{(byte)0x59},
- new byte[]{(byte)0x0F});
- registerInternationalCodes('\u00CD', new byte[]{(byte)0x59},
- new byte[]{(byte)0x0E});
- registerInternationalCodes('\u00CE', new byte[]{(byte)0x59},
- new byte[]{(byte)0x12});
- registerInternationalCodes('\u00CF', new byte[]{(byte)0x59},
- new byte[]{(byte)0x13});
- registerInternationalCodes('\u00D0', new byte[]{(byte)0x4F},
- new byte[]{(byte)0x68});
- registerInternationalCodes('\u00D1', new byte[]{(byte)0x62},
- new byte[]{(byte)0x19});
- registerInternationalCodes('\u00D2', new byte[]{(byte)0x64},
- new byte[]{(byte)0x0F});
- registerInternationalCodes('\u00D3', new byte[]{(byte)0x64},
- new byte[]{(byte)0x0E});
- registerInternationalCodes('\u00D4', new byte[]{(byte)0x64},
- new byte[]{(byte)0x12});
- registerInternationalCodes('\u00D5', new byte[]{(byte)0x64},
- new byte[]{(byte)0x19});
- registerInternationalCodes('\u00D6', new byte[]{(byte)0x64},
- new byte[]{(byte)0x13});
- registerInternationalCodes('\u00D8', new byte[]{(byte)0x64},
- new byte[]{(byte)0x21});
- registerInternationalCodes('\u00D9', new byte[]{(byte)0x6F},
- new byte[]{(byte)0x0F});
- registerInternationalCodes('\u00DA', new byte[]{(byte)0x6F},
- new byte[]{(byte)0x0E});
- registerInternationalCodes('\u00DB', new byte[]{(byte)0x6F},
- new byte[]{(byte)0x12});
- registerInternationalCodes('\u00DC', new byte[]{(byte)0x6F},
- new byte[]{(byte)0x13});
- registerInternationalCodes('\u00DD', new byte[]{(byte)0x76},
- new byte[]{(byte)0x0E});
- registerInternationalCodes('\u00E0', new byte[]{(byte)0x4A},
- new byte[]{(byte)0x0F});
- registerInternationalCodes('\u00E1', new byte[]{(byte)0x4A},
- new byte[]{(byte)0x0E});
- registerInternationalCodes('\u00E2', new byte[]{(byte)0x4A},
- new byte[]{(byte)0x12});
- registerInternationalCodes('\u00E3', new byte[]{(byte)0x4A},
- new byte[]{(byte)0x19});
- registerInternationalCodes('\u00E4', new byte[]{(byte)0x4A},
- new byte[]{(byte)0x13});
- registerInternationalCodes('\u00E5', new byte[]{(byte)0x4A},
- new byte[]{(byte)0x1A});
- registerInternationalCodes('\u00E7', new byte[]{(byte)0x4D},
- new byte[]{(byte)0x1C});
- registerInternationalCodes('\u00E8', new byte[]{(byte)0x51},
- new byte[]{(byte)0x0F});
- registerInternationalCodes('\u00E9', new byte[]{(byte)0x51},
- new byte[]{(byte)0x0E});
- registerInternationalCodes('\u00EA', new byte[]{(byte)0x51},
- new byte[]{(byte)0x12});
- registerInternationalCodes('\u00EB', new byte[]{(byte)0x51},
- new byte[]{(byte)0x13});
- registerInternationalCodes('\u00EC', new byte[]{(byte)0x59},
- new byte[]{(byte)0x0F});
- registerInternationalCodes('\u00ED', new byte[]{(byte)0x59},
- new byte[]{(byte)0x0E});
- registerInternationalCodes('\u00EE', new byte[]{(byte)0x59},
- new byte[]{(byte)0x12});
- registerInternationalCodes('\u00EF', new byte[]{(byte)0x59},
- new byte[]{(byte)0x13});
- registerInternationalCodes('\u00F0', new byte[]{(byte)0x4F},
- new byte[]{(byte)0x68});
- registerInternationalCodes('\u00F1', new byte[]{(byte)0x62},
- new byte[]{(byte)0x19});
- registerInternationalCodes('\u00F2', new byte[]{(byte)0x64},
- new byte[]{(byte)0x0F});
- registerInternationalCodes('\u00F3', new byte[]{(byte)0x64},
- new byte[]{(byte)0x0E});
- registerInternationalCodes('\u00F4', new byte[]{(byte)0x64},
- new byte[]{(byte)0x12});
- registerInternationalCodes('\u00F5', new byte[]{(byte)0x64},
- new byte[]{(byte)0x19});
- registerInternationalCodes('\u00F6', new byte[]{(byte)0x64},
- new byte[]{(byte)0x13});
- registerInternationalCodes('\u00F8', new byte[]{(byte)0x64},
- new byte[]{(byte)0x21});
- registerInternationalCodes('\u00F9', new byte[]{(byte)0x6F},
- new byte[]{(byte)0x0F});
- registerInternationalCodes('\u00FA', new byte[]{(byte)0x6F},
- new byte[]{(byte)0x0E});
- registerInternationalCodes('\u00FB', new byte[]{(byte)0x6F},
- new byte[]{(byte)0x12});
- registerInternationalCodes('\u00FC', new byte[]{(byte)0x6F},
- new byte[]{(byte)0x13});
- registerInternationalCodes('\u00FD', new byte[]{(byte)0x76},
- new byte[]{(byte)0x0E});
- registerInternationalCodes('\u00FF', new byte[]{(byte)0x76},
- new byte[]{(byte)0x13});
-
+ static final byte INTERNATIONAL_EXTRA_PLACEHOLDER = (byte)0x02;
+
+ // see Index.writeCrazyCodes for details on writing crazy codes
+ static final byte CRAZY_CODE_START = (byte)0x80;
+ static final byte CRAZY_CODE_1 = (byte)0x02;
+ static final byte CRAZY_CODE_2 = (byte)0x03;
+ static final byte[] CRAZY_CODES_SUFFIX =
+ new byte[]{(byte)0xFF, (byte)0x02, (byte)0x80, (byte)0xFF, (byte)0x80};
+
+ // stash the codes in some resource files
+ private static final String CODES_FILE =
+ "com/healthmarketscience/jackcess/index_codes.txt";
+ private static final String EXT_CODES_FILE =
+ "com/healthmarketscience/jackcess/index_codes_ext.txt";
+
+ enum Type {
+ SIMPLE("S") {
+ @Override public CharHandler parseCodes(String[] codeStrings) {
+ return parseSimpleCodes(codeStrings);
+ }
+ },
+ INTERNATIONAL("I") {
+ @Override public CharHandler parseCodes(String[] codeStrings) {
+ return parseInternationalCodes(codeStrings);
+ }
+ },
+ UNPRINTABLE("U") {
+ @Override public CharHandler parseCodes(String[] codeStrings) {
+ return parseUnprintableCodes(codeStrings);
+ }
+ },
+ UNPRINTABLE_EXT("P") {
+ @Override public CharHandler parseCodes(String[] codeStrings) {
+ return parseUnprintableExtCodes(codeStrings);
+ }
+ },
+ INTERNATIONAL_EXT("Z") {
+ @Override public CharHandler parseCodes(String[] codeStrings) {
+ return parseInternationalExtCodes(codeStrings);
+ }
+ },
+ IGNORED("X") {
+ @Override public CharHandler parseCodes(String[] codeStrings) {
+ return IGNORED_CHAR_HANDLER;
+ }
+ };
+
+ private final String _prefixCode;
+
+ private Type(String prefixCode) {
+ _prefixCode = prefixCode;
+ }
+
+ public String getPrefixCode() {
+ return _prefixCode;
+ }
+
+ public abstract CharHandler parseCodes(String[] codeStrings);
}
-
- private IndexCodes() {
+ abstract static class CharHandler {
+ public abstract Type getType();
+ public byte[] getInlineBytes() {
+ return null;
+ }
+ public byte[] getExtraBytes() {
+ return null;
+ }
+ public byte[] getUnprintableBytes() {
+ return null;
+ }
+ public byte getExtraByteModifier() {
+ return 0;
+ }
+ public byte getCrazyFlag() {
+ return 0;
+ }
}
- private static void registerCodes(char c, byte[] codes) {
- CODES.put(c, codes);
+ private static final class SimpleCharHandler extends CharHandler {
+ private byte[] _bytes;
+ private SimpleCharHandler(byte[] bytes) {
+ _bytes = bytes;
+ }
+ @Override public Type getType() {
+ return Type.SIMPLE;
+ }
+ @Override public byte[] getInlineBytes() {
+ return _bytes;
+ }
}
-
- private static void registerUnprintableCodes(char c, byte[] codes) {
- UNPRINTABLE_CODES.put(c, codes);
+
+ /**
+  * Handler of type {@link Type#INTERNATIONAL}: supplies both inline bytes
+  * and extra bytes.  Instances are immutable (the byte arrays are shared,
+  * not copied, so callers must not modify them).
+  */
+ private static final class InternationalCharHandler extends CharHandler {
+   private final byte[] _bytes;
+   private final byte[] _extraBytes;
+
+   private InternationalCharHandler(byte[] bytes, byte[] extraBytes) {
+     _bytes = bytes;
+     _extraBytes = extraBytes;
+   }
+
+   @Override public Type getType() {
+     return Type.INTERNATIONAL;
+   }
+
+   @Override public byte[] getInlineBytes() {
+     return _bytes;
+   }
+
+   @Override public byte[] getExtraBytes() {
+     return _extraBytes;
+   }
+ }
+
+ /**
+  * Handler of type {@link Type#UNPRINTABLE}: supplies unprintable bytes
+  * only (no inline bytes).  Instances are immutable (the byte array is
+  * shared, not copied, so callers must not modify it).
+  */
+ private static final class UnprintableCharHandler extends CharHandler {
+   private final byte[] _unprintBytes;
+
+   private UnprintableCharHandler(byte[] unprintBytes) {
+     _unprintBytes = unprintBytes;
+   }
+
+   @Override public Type getType() {
+     return Type.UNPRINTABLE;
+   }
+
+   @Override public byte[] getUnprintableBytes() {
+     return _unprintBytes;
+   }
+ }
+
+ /**
+  * Handler of type {@link Type#UNPRINTABLE_EXT}: supplies a single extra
+  * byte modifier.  Instances are immutable.
+  */
+ private static final class UnprintableExtCharHandler extends CharHandler {
+   private final byte _extraByteMod;
+
+   // takes a primitive byte: a boxed Byte would only add a needless
+   // box/unbox round trip and a possible NullPointerException
+   private UnprintableExtCharHandler(byte extraByteMod) {
+     _extraByteMod = extraByteMod;
+   }
+
+   @Override public Type getType() {
+     return Type.UNPRINTABLE_EXT;
+   }
+
+   @Override public byte getExtraByteModifier() {
+     return _extraByteMod;
+   }
+ }
+
+ /**
+  * Handler of type {@link Type#INTERNATIONAL_EXT}: supplies inline bytes,
+  * extra bytes and a crazy flag.  Instances are immutable (the byte arrays
+  * are shared, not copied, so callers must not modify them).
+  */
+ private static final class InternationalExtCharHandler extends CharHandler {
+   private final byte[] _bytes;
+   private final byte[] _extraBytes;
+   private final byte _crazyFlag;
+
+   private InternationalExtCharHandler(byte[] bytes, byte[] extraBytes,
+                                       byte crazyFlag) {
+     _bytes = bytes;
+     _extraBytes = extraBytes;
+     _crazyFlag = crazyFlag;
+   }
+
+   @Override public Type getType() {
+     return Type.INTERNATIONAL_EXT;
+   }
+
+   @Override public byte[] getInlineBytes() {
+     return _bytes;
+   }
+
+   @Override public byte[] getExtraBytes() {
+     return _extraBytes;
+   }
+
+   @Override public byte getCrazyFlag() {
+     return _crazyFlag;
+   }
+ }
+
+ /** Shared handler which reports {@link Type#IGNORED} and otherwise keeps
+     every {@link CharHandler} default. */
+ static final CharHandler IGNORED_CHAR_HANDLER = new CharHandler() {
+     @Override
+     public Type getType() {
+       return Type.IGNORED;
+     }
+   };
+
+ /** Shared handler installed for surrogate chars.  It reports
+     {@link Type#IGNORED}, but asking it for inline bytes fails with an
+     IllegalStateException. */
+ static final CharHandler SURROGATE_CHAR_HANDLER = new CharHandler() {
+     @Override
+     public Type getType() {
+       return Type.IGNORED;
+     }
+
+     @Override
+     public byte[] getInlineBytes() {
+       String msg = "Surrogate pair chars are not handled";
+       throw new IllegalStateException(msg);
+     }
+   };
+
+ /** bounds (inclusive) of the char range served by the base codes table */
+ private static final char FIRST_CHAR = Character.MIN_VALUE;
+ private static final char LAST_CHAR = (char)0xFF;
+ /** bounds (inclusive) of the char range served by the ext codes table */
+ private static final char FIRST_EXT_CHAR = (char)(LAST_CHAR + 1);
+ private static final char LAST_EXT_CHAR = Character.MAX_VALUE;
+
+ /**
+  * Holder for the handlers of the first 256 chars.  The table lives in a
+  * nested class so that it is lazily loaded.
+  */
+ private static final class Codes {
+   private static final CharHandler[] _values =
+     loadCodes(CODES_FILE, FIRST_CHAR, LAST_CHAR);
 }
- private static void registerInternationalCodes(
- char c, byte[] inlineCodes, byte[] extraCodes) {
- INTERNATIONAL_CODES.put(c,
- new InternationalCodes(inlineCodes, extraCodes));
+ /**
+  * Holder for the handlers of the remaining chars in BMP 0.  The table
+  * lives in a nested class so that it is lazily loaded.
+  */
+ private static final class ExtCodes {
+   private static final CharHandler[] _values =
+     loadCodes(EXT_CODES_FILE, FIRST_EXT_CHAR, LAST_EXT_CHAR);
 }
+ /** Private constructor: no instances can be created outside this class. */
+ private IndexCodes() {
+   // nothing to initialize
+ }
+
+ /**
+  * Looks up the handler for the given char: chars up to LAST_CHAR come
+  * from the base table, all others from the extended table.
+  *
+  * @param c the char to look up
+  * @return the handler registered for the given char
+  */
+ static CharHandler getCharHandler(char c) {
+   if(c > LAST_CHAR) {
+     // chars are unsigned 16-bit values, so plain subtraction gives the
+     // (non-negative) offset into the extended table
+     return ExtCodes._values[c - FIRST_EXT_CHAR];
+   }
+   return Codes._values[c];
+ }
+
+ /**
+  * Loads the handlers for every char in the range [firstChar, lastChar]
+  * from the given classpath resource.  The resource is read one line per
+  * char, in ascending char order; surrogate chars consume no line and are
+  * given the SURROGATE_CHAR_HANDLER instead.
+  *
+  * @param codesFilePath path of the codes resource, resolved through the
+  *        current thread's context class loader
+  * @param firstChar first char (inclusive) covered by the resource
+  * @param lastChar last char (inclusive) covered by the resource
+  * @return the handlers, indexed by (char - firstChar)
+  * @throws RuntimeException if the resource is missing, cannot be read,
+  *         or ends before all chars have been covered
+  */
+ private static CharHandler[] loadCodes(String codesFilePath,
+                                        char firstChar, char lastChar)
+ {
+   int start = asUnsignedChar(firstChar);
+   int end = asUnsignedChar(lastChar);
+   CharHandler[] values = new CharHandler[(end - start) + 1];
+
+   Map<String,Type> prefixMap = new HashMap<String,Type>();
+   for(Type type : Type.values()) {
+     prefixMap.put(type.getPrefixCode(), type);
+   }
+
+   BufferedReader reader = null;
+   try {
+
+     // getResourceAsStream() signals a missing resource by returning null;
+     // report that explicitly instead of failing with a bare
+     // NullPointerException inside the reader
+     java.io.InputStream codesStream =
+       Thread.currentThread().getContextClassLoader()
+       .getResourceAsStream(codesFilePath);
+     if(codesStream == null) {
+       throw new IOException("resource not found on classpath");
+     }
+
+     reader = new BufferedReader(
+         new InputStreamReader(codesStream, "US-ASCII"));
+
+     for(int i = start; i <= end; ++i) {
+       char c = (char)i;
+       CharHandler ch = null;
+       if(Character.isHighSurrogate(c) || Character.isLowSurrogate(c)) {
+         // surrogate chars are not included in the codes files
+         ch = SURROGATE_CHAR_HANDLER;
+       } else {
+         String codeLine = reader.readLine();
+         if(codeLine == null) {
+           // readLine() returns null at end of stream: the file is short
+           throw new IOException(
+               "unexpected end of file, missing codes for char 0x" +
+               Integer.toHexString(i));
+         }
+         ch = parseCodes(prefixMap, codeLine);
+       }
+       values[(i - start)] = ch;
+     }
+
+   } catch(IOException e) {
+     throw new RuntimeException("failed loading index codes file " +
+                                codesFilePath, e);
+   } finally {
+     if (reader != null) {
+       try {
+         reader.close();
+       } catch (IOException ignored) {
+         // nothing useful can be done about a failed close of a
+         // read-only resource
+       }
+     }
+   }
+
+   return values;
+ }
+
+ /**
+  * Parses one line of a codes file into a handler.  The first char of the
+  * line is the type prefix; everything from the third char on is a comma
+  * separated list of code strings, handed to the matching Type for
+  * parsing.
+  *
+  * @param prefixMap the known types, keyed by prefix code
+  * @param codeLine the line to parse
+  * @return the handler described by the line
+  * @throws IllegalStateException if the line is missing or empty, or if
+  *         its prefix does not match any known type
+  */
+ private static CharHandler parseCodes(Map<String,Type> prefixMap,
+                                       String codeLine)
+ {
+   if((codeLine == null) || (codeLine.length() == 0)) {
+     throw new IllegalStateException("Missing code line: " + codeLine);
+   }
+
+   String prefix = codeLine.substring(0, 1);
+   Type type = prefixMap.get(prefix);
+   if(type == null) {
+     throw new IllegalStateException("Unknown code prefix in line '" +
+                                     codeLine + "'");
+   }
+
+   // skip the prefix and the separator char which follows it
+   String suffix = ((codeLine.length() > 1) ? codeLine.substring(2) : "");
+   // limit of -1 keeps trailing empty strings so the count checks work
+   return type.parseCodes(suffix.split(",", -1));
+ }
+
+ /**
+  * Builds a simple handler from exactly one (non-empty) code string.
+  *
+  * @throws IllegalStateException if the number of code strings is not 1
+  */
+ private static CharHandler parseSimpleCodes(String[] codeStrings) {
+   if(codeStrings.length == 1) {
+     byte[] inlineBytes = codesToBytes(codeStrings[0], true);
+     return new SimpleCharHandler(inlineBytes);
+   }
+   throw new IllegalStateException("Unexpected code strings " +
+                                   Arrays.asList(codeStrings));
+ }
+
+ /**
+  * Builds an international handler from exactly two (non-empty) code
+  * strings: the inline bytes followed by the extra bytes.
+  *
+  * @throws IllegalStateException if the number of code strings is not 2
+  */
+ private static CharHandler parseInternationalCodes(String[] codeStrings) {
+   if(codeStrings.length == 2) {
+     byte[] inlineBytes = codesToBytes(codeStrings[0], true);
+     byte[] extraBytes = codesToBytes(codeStrings[1], true);
+     return new InternationalCharHandler(inlineBytes, extraBytes);
+   }
+   throw new IllegalStateException("Unexpected code strings " +
+                                   Arrays.asList(codeStrings));
+ }
+
+ /**
+  * Builds an unprintable handler from exactly one (non-empty) code string.
+  *
+  * @throws IllegalStateException if the number of code strings is not 1
+  */
+ private static CharHandler parseUnprintableCodes(String[] codeStrings) {
+   if(codeStrings.length == 1) {
+     byte[] unprintBytes = codesToBytes(codeStrings[0], true);
+     return new UnprintableCharHandler(unprintBytes);
+   }
+   throw new IllegalStateException("Unexpected code strings " +
+                                   Arrays.asList(codeStrings));
+ }
+
+ /**
+  * Builds an extended unprintable handler from exactly one code string
+  * which must decode to exactly one byte (the extra byte modifier).
+  *
+  * @throws IllegalStateException if there is not exactly one code string
+  *         or it does not decode to exactly one byte
+  */
+ private static CharHandler parseUnprintableExtCodes(String[] codeStrings) {
+   byte[] modBytes = null;
+   if(codeStrings.length == 1) {
+     // required codes are never decoded to null, so null below can only
+     // mean that the count check failed
+     modBytes = codesToBytes(codeStrings[0], true);
+   }
+   if((modBytes == null) || (modBytes.length != 1)) {
+     throw new IllegalStateException("Unexpected code strings " +
+                                     Arrays.asList(codeStrings));
+   }
+   return new UnprintableExtCharHandler(modBytes[0]);
+ }
+
+ /**
+  * Builds an extended international handler from exactly three code
+  * strings: the (required) inline bytes, the (optional) extra bytes and
+  * the crazy flag selector, where "1" selects CRAZY_CODE_1 and anything
+  * else selects CRAZY_CODE_2.
+  *
+  * @throws IllegalStateException if the number of code strings is not 3
+  */
+ private static CharHandler parseInternationalExtCodes(String[] codeStrings) {
+   if(codeStrings.length != 3) {
+     throw new IllegalStateException("Unexpected code strings " +
+                                     Arrays.asList(codeStrings));
+   }
+
+   byte crazyFlag;
+   if("1".equals(codeStrings[2])) {
+     crazyFlag = CRAZY_CODE_1;
+   } else {
+     crazyFlag = CRAZY_CODE_2;
+   }
+
+   byte[] inlineBytes = codesToBytes(codeStrings[0], true);
+   byte[] extraBytes = codesToBytes(codeStrings[1], false);
+   return new InternationalExtCharHandler(inlineBytes, extraBytes,
+                                          crazyFlag);
+ }
+
+ /**
+  * Decodes a string of hex digit pairs (e.g. "0aff") into bytes.
+  *
+  * @param codes the hex string, two digits per byte
+  * @param required whether an empty string is an error
+  * @return the decoded bytes, or {@code null} if codes is empty and not
+  *         required
+  * @throws IllegalStateException if codes is empty but required, or has
+  *         an odd number of digits
+  * @throws NumberFormatException if a digit pair is not valid hex
+  */
+ private static byte[] codesToBytes(String codes, boolean required)
+ {
+   if(codes.length() == 0) {
+     if(required) {
+       throw new IllegalStateException("empty code bytes");
+     }
+     return null;
+   }
+   if((codes.length() % 2) != 0) {
+     // a dangling digit would otherwise be dropped without any notice
+     throw new IllegalStateException(
+         "odd number of hex digits in code bytes '" + codes + "'");
+   }
+   byte[] bytes = new byte[codes.length() / 2];
+   for(int i = 0; i < bytes.length; ++i) {
+     int charIdx = i*2;
+     bytes[i] = (byte)(Integer.parseInt(codes.substring(charIdx, charIdx + 2),
+                                        16));
+   }
+   return bytes;
+ }
+
+ /**
+  * @return the given char as an int in the range [0, 0xFFFF]
+  */
+ private static int asUnsignedChar(char c) {
+   // char is an unsigned 16-bit type, so widening never sign-extends
+   return (int)c;
+ }
+
static boolean isNullEntry(byte startEntryFlag) {
return((startEntryFlag == ASC_NULL_FLAG) ||
(startEntryFlag == DESC_NULL_FLAG));
@@ -448,14 +432,4 @@ public class IndexCodes {
return(isAscending ? ASC_START_FLAG : DESC_START_FLAG);
}
- static final class InternationalCodes {
- public final byte[] _inlineCodes;
- public final byte[] _extraCodes;
-
- private InternationalCodes(byte[] inlineCodes, byte[] extraCodes) {
- _inlineCodes = inlineCodes;
- _extraCodes = extraCodes;
- }
- }
-
}