Browse Source

initial index support for all characters in BMP 0

git-svn-id: https://svn.code.sf.net/p/jackcess/code/jackcess/trunk@425 f203690c-595d-4dc9-a70b-905162fa7fd2
tags/jackcess-1.1.21
James Ahlborn 14 years ago
parent
commit
b9f8f8502c

+ 218
- 83
src/java/com/healthmarketscience/jackcess/Index.java View File

@@ -35,8 +35,6 @@ import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;

@@ -1045,110 +1043,96 @@ public abstract class Index implements Comparable<Index> {
}
// now, convert each character to a "code" of one or more bytes
List<ExtraCodes> unprintableCodes = null;
List<ExtraCodes> internationalCodes = null;
ExtraCodesOutputStream extraCodes = null;
ExtraCodesOutputStream unprintableCodes = null;
ExtraCodesOutputStream crazyCodes = null;
int charOffset = 0;
for(int i = 0; i < str.length(); ++i) {

char c = str.charAt(i);
Character cKey = c;
CharHandler ch = getCharHandler(c);

byte[] bytes = CODES.get(cKey);
int curCharOffset = charOffset;
byte[] bytes = ch.getInlineBytes();
if(bytes != null) {
// simple case, write the codes we found
// write the "inline" codes immediately
tmpBout.write(bytes);

// only increment the charOffset for chars with inline codes
++charOffset;
continue;
}

bytes = UNPRINTABLE_CODES.get(cKey);
if(bytes != null) {
// we do not write anything to tmpBout
if(bytes.length > 0) {
if(unprintableCodes == null) {
unprintableCodes = new LinkedList<ExtraCodes>();
}
// keep track of the extra codes for later
unprintableCodes.add(new ExtraCodes(charOffset, bytes));
}

// note, we do _not_ increment the charOffset for unprintable chars
if(ch.getType() == Type.SIMPLE) {
// common case, skip further code handling
continue;
}

InternationalCodes inatCodes = INTERNATIONAL_CODES.get(cKey);
if(inatCodes != null) {

// we write the "inline" portion of the international codes
// immediately, and queue the extra codes for later
tmpBout.write(inatCodes._inlineCodes);

if(internationalCodes == null) {
internationalCodes = new LinkedList<ExtraCodes>();
bytes = ch.getExtraBytes();
byte extraCodeModifier = ch.getExtraByteModifier();
if((bytes != null) || (extraCodeModifier != 0)) {
if(extraCodes == null) {
extraCodes = new ExtraCodesOutputStream(str.length());
}

// keep track of the extra codes for later
internationalCodes.add(new ExtraCodes(charOffset,
inatCodes._extraCodes));
writeExtraCodes(curCharOffset, bytes, extraCodeModifier, extraCodes);
}

++charOffset;
continue;
bytes = ch.getUnprintableBytes();
if(bytes != null) {
if(unprintableCodes == null) {
unprintableCodes = new ExtraCodesOutputStream();
}
// keep track of the unprintable codes for later
writeUnprintableCodes(curCharOffset, bytes, unprintableCodes);
}
byte crazyFlag = ch.getCrazyFlag();
if(crazyFlag != 0) {
if(crazyCodes == null) {
crazyCodes = new ExtraCodesOutputStream();
}

// bummer, out of luck
throw new IOException("unmapped string index value " + c);
// keep track of the crazy flags for later
crazyCodes.write(crazyFlag);
}
}

// write end text flag
tmpBout.write(END_TEXT);

boolean hasExtraText = ((unprintableCodes != null) ||
(internationalCodes != null));
if(hasExtraText) {
boolean hasExtraCodes = trimExtraCodes(
extraCodes, (byte)0, INTERNATIONAL_EXTRA_PLACEHOLDER);
boolean hasUnprintableCodes = (unprintableCodes != null);
boolean hasCrazyCodes = (crazyCodes != null);
if(hasExtraCodes || hasUnprintableCodes || hasCrazyCodes) {

// we write all the international extra bytes first
if(internationalCodes != null) {

// we write a placeholder char for each non-international char before
// the extra chars for the international char
charOffset = 0;
Iterator<ExtraCodes> iter = internationalCodes.iterator();
while(iter.hasNext()) {
ExtraCodes extraCodes = iter.next();
while(charOffset < extraCodes._charOffset) {
tmpBout.write(INTERNATIONAL_EXTRA_PLACEHOLDER);
++charOffset;
}
tmpBout.write(extraCodes._extraCodes);
++charOffset;
}
if(hasExtraCodes) {
extraCodes.writeTo(tmpBout);
}

// then we write all the unprintable extra bytes
if(unprintableCodes != null) {
if(hasCrazyCodes || hasUnprintableCodes) {

// write 2 more end flags
tmpBout.write(END_TEXT);
tmpBout.write(END_TEXT);

// write a single prefix for all unprintable chars
tmpBout.write(UNPRINTABLE_COMMON_PREFIX);
// next come the crazy flags
if(hasCrazyCodes) {
writeCrazyCodes(crazyCodes, tmpBout);
}

// then we write all the unprintable extra bytes
if(hasUnprintableCodes) {

// write another end flag
tmpBout.write(END_TEXT);
// we write a whacky combo of bytes for each unprintable char which
// includes a funky offset and extra char itself
Iterator<ExtraCodes> iter = unprintableCodes.iterator();
while(iter.hasNext()) {
ExtraCodes extraCodes = iter.next();
int offset =
(UNPRINTABLE_COUNT_START +
(UNPRINTABLE_COUNT_MULTIPLIER * extraCodes._charOffset))
| UNPRINTABLE_OFFSET_FLAGS;

// write offset as big-endian short
tmpBout.write((offset >> 8) & 0xFF);
tmpBout.write(offset & 0xFF);
tmpBout.write(UNPRINTABLE_MIDFIX);
tmpBout.write(extraCodes._extraCodes);
unprintableCodes.writeTo(tmpBout);
}
}

}

// handle descending order by inverting the bytes
@@ -1168,6 +1152,111 @@ public abstract class Index implements Comparable<Index> {
bout.write(END_EXTRA_TEXT);
}

/**
 * Records the extra codes (or extra code modifier) for the char at the
 * given offset in the given extra codes stream.
 *
 * @param charOffset offset of the current char
 * @param bytes extra code bytes for the char, or {@code null} if only a
 *              modifier applies
 * @param extraCodeModifier value applied when {@code bytes} is
 *                          {@code null}
 * @param extraCodes stream accumulating the extra codes
 */
private static void writeExtraCodes(
    int charOffset, byte[] bytes, byte extraCodeModifier,
    ExtraCodesOutputStream extraCodes)
  throws IOException
{
  // pad with a placeholder for every earlier char which has not been
  // accounted for in the stream yet
  int missingChars = charOffset - extraCodes.getNumChars();
  if(missingChars > 0) {
    extraCodes.write(missingChars, INTERNATIONAL_EXTRA_PLACEHOLDER);
    extraCodes.incrementNumChars(missingChars);
  }

  if(bytes == null) {

    // only a modifier: fold it into the last byte written so far, or, if
    // nothing has been written yet, make it the first byte.  note, the char
    // count is not updated in either case
    int lastPos = extraCodes.size() - 1;
    if(lastPos < 0) {
      extraCodes.write(extraCodeModifier);
    } else {
      byte modified = (byte)(extraCodes.get(lastPos) + extraCodeModifier);
      extraCodes.set(lastPos, modified);
    }
    return;
  }

  // real extra codes: append them and count the char
  extraCodes.write(bytes);
  extraCodes.incrementNumChars(1);
}

/**
 * Strips trailing bytes in the range [minTrimCode, maxTrimCode] from the
 * given stream (if any).
 *
 * @param extraCodes stream to trim, may be {@code null}
 * @param minTrimCode lower bound of the byte values to trim
 * @param maxTrimCode upper bound of the byte values to trim
 * @return {@code true} if the stream is non-{@code null} and still contains
 *         bytes after trimming, {@code false} otherwise
 */
private static boolean trimExtraCodes(ExtraCodesOutputStream extraCodes,
                                      byte minTrimCode, byte maxTrimCode)
  throws IOException
{
  if(extraCodes != null) {
    extraCodes.trimTrailingBytes(minTrimCode, maxTrimCode);

    // report whether anything survived the trim
    return (extraCodes.size() > 0);
  }

  // no stream, nothing to write
  return false;
}

/**
 * Appends the encoded form of one unprintable char to the given stream: a
 * two byte offset value (high byte first), the unprintable midfix byte and
 * finally the given code bytes.
 *
 * @param charOffset offset of the char, used to compute the offset value
 * @param bytes code bytes of the unprintable char
 * @param extraCodes stream accumulating the unprintable codes
 */
private static void writeUnprintableCodes(
    int charOffset, byte[] bytes, ExtraCodesOutputStream extraCodes)
  throws IOException
{
  // the offset value is derived from the char offset and then flagged
  int scaledOffset = UNPRINTABLE_COUNT_START +
    (UNPRINTABLE_COUNT_MULTIPLIER * charOffset);
  int flaggedOffset = scaledOffset | UNPRINTABLE_OFFSET_FLAGS;

  // offset goes out as a big-endian short
  int highByte = (flaggedOffset >> 8) & 0xFF;
  int lowByte = flaggedOffset & 0xFF;
  extraCodes.write(highByte);
  extraCodes.write(lowByte);

  extraCodes.write(UNPRINTABLE_MIDFIX);
  extraCodes.write(bytes);
}

/**
 * Writes the accumulated crazy flags to the given output, followed by the
 * crazy codes suffix.
 *
 * @param crazyCodes stream holding one crazy flag per byte (trailing
 *                   CRAZY_CODE_2 flags are trimmed from it by this call)
 * @param tmpBout output which receives the packed flags and the suffix
 */
private static void writeCrazyCodes(ExtraCodesOutputStream crazyCodes,
                                    ByteArrayOutputStream tmpBout)
  throws IOException
{
  // CRAZY_CODE_2 flags at the end are ignored, so ditch them
  trimExtraCodes(crazyCodes, CRAZY_CODE_2, CRAZY_CODE_2);

  int numCodes = crazyCodes.size();
  if(numCodes > 0) {

    // up to 3 codes are packed into each output byte.  every output byte
    // starts out as CRAZY_CODE_START and each code is shifted into its own
    // 2 bit slot (first code leftmost: shift 4, then 2, then 0).
    byte packed = CRAZY_CODE_START;
    int slot = 0;
    for(int i = 0; i < numCodes; ++i) {
      int shift = 4 - (2 * slot);
      packed |= (byte)(crazyCodes.get(i) << shift);

      if(++slot == 3) {
        // byte is full, flush it and start a new one
        tmpBout.write(packed);
        packed = CRAZY_CODE_START;
        slot = 0;
      }
    }

    // flush a partially filled last byte
    if(slot > 0) {
      tmpBout.write(packed);
    }
  }

  // write crazy code suffix (note, we write this even if all the codes are
  // trimmed)
  tmpBout.write(CRAZY_CODES_SUFFIX);
}

/**
* Creates one of the special index entries.
*/
@@ -2289,17 +2378,63 @@ public abstract class Index implements Comparable<Index> {
entries);
}
}


/**
* Value struct used during Text index entry encoding.
* Extension of ByteArrayOutputStream which allows more complex access to
* the underlying bytes, as well as keeps track of an additional char count.
*/
private static final class ExtraCodes {
public final int _charOffset;
public final byte[] _extraCodes;
private static final class ExtraCodesOutputStream
extends ByteArrayOutputStream
{
// char counter kept alongside the bytes; it is only ever changed through
// incrementNumChars(), writing bytes does not touch it
private int numChars;

// creates a stream with the default ByteArrayOutputStream buffer size
private ExtraCodesOutputStream() {
super();
}

// creates a stream whose buffer initially has the given capacity
private ExtraCodesOutputStream(int length) {
super(length);
}

// returns the current value of the char counter
public int getNumChars() {
return numChars;
}
// returns the byte at the given position of the underlying buffer.  note,
// the offset is not checked against the number of valid bytes (count)
public byte get(int offset) {
return buf[offset];
}

// overwrites the byte at the given position of the underlying buffer.
// note, the offset is not checked against the number of valid bytes (count)
public void set(int offset, byte b) {
buf[offset] = b;
}

// writes the given byte numBytes times (nothing is written if numBytes is
// not positive)
public void write(int numBytes, byte b) {
for(int i = 0; i < numBytes; ++i) {
write(b);
}
}

// adds the given amount to the char counter
public void incrementNumChars(int inc) {
numChars += inc;
}

// drops all trailing bytes whose value lies in [minTrimCode, maxTrimCode]
// (inclusive) by shrinking count; the buffer contents are left untouched.
// values are compared after conversion through ByteUtil.asUnsignedByte
// (presumably the unsigned 0..255 value -- confirm against ByteUtil)
public void trimTrailingBytes(byte minTrimCode, byte maxTrimCode)
{
int minTrim = ByteUtil.asUnsignedByte(minTrimCode);
int maxTrim = ByteUtil.asUnsignedByte(maxTrimCode);

// walk backwards until the first byte outside of the trim range
int idx = count - 1;
while(idx >= 0) {
int val = ByteUtil.asUnsignedByte(get(idx));
if((val >= minTrim) && (val <= maxTrim)) {
--idx;
} else {
break;
}
}

// NOTE(review): the next three lines look like leftovers of the removed
// ExtraCodes constructor shown by the diff view, not part of this method
// -- confirm against the real file
private ExtraCodes(int charOffset, byte[] extraCodes) {
_charOffset = charOffset;
_extraCodes = extraCodes;
// idx is the index of the last byte to keep (-1 if everything was trimmed)
count = idx + 1;
}
}


+ 342
- 368
src/java/com/healthmarketscience/jackcess/IndexCodes.java View File

@@ -27,6 +27,10 @@ King of Prussia, PA 19406

package com.healthmarketscience.jackcess;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;

@@ -54,7 +58,7 @@ public class IndexCodes {
static final byte DESC_BOOLEAN_TRUE = ASC_BOOLEAN_FALSE;
static final byte DESC_BOOLEAN_FALSE = ASC_BOOLEAN_TRUE;

// unprintable char is removed from normal text.
// pattern for unprintable chars in the extra bytes:
@@ -63,378 +67,358 @@ public class IndexCodes {
// <code> = char code
static final int UNPRINTABLE_COUNT_START = 7;
static final int UNPRINTABLE_COUNT_MULTIPLIER = 4;
static final byte[] UNPRINTABLE_COMMON_PREFIX =
new byte[]{(byte)0x01, (byte)0x01, (byte)0x01};
static final int UNPRINTABLE_OFFSET_FLAGS = 0x8000;
static final byte UNPRINTABLE_MIDFIX = (byte)0x06;

// international char is replaced with ascii char.
// pattern for international chars in the extra bytes:
// [ 02 (for each normal char) ] [ <symbol_code> (for each inat char) ]
static final byte INTERNATIONAL_EXTRA_PLACEHOLDER = (byte)0x02;
/**
* Map of character to byte[] that Access uses in indexes (not ASCII)
* (Character -> byte[]) as codes to order text
*/
static final Map<Character, byte[]> CODES =
new HashMap<Character, byte[]>(150);

/**
* Map of character to byte[] that Access uses in indexes for unprintable
* characters (not ASCII) (Character -> byte[]), in the extended portion
*/
static final Map<Character, byte[]> UNPRINTABLE_CODES =
new HashMap<Character, byte[]>(100);
/**
* Map of character to byte[] that Access uses in indexes for international
* characters (not ASCII) (Character -> InternationalCodes), in the extended
* portion
*/
static final Map<Character, InternationalCodes> INTERNATIONAL_CODES =
new HashMap<Character, InternationalCodes>(70);
static {
registerCodes('\u0000', new byte[]{});
registerCodes('\t', new byte[]{(byte)0x08, (byte)0x03});
registerCodes('\n', new byte[]{(byte)0x08, (byte)0x04});
registerCodes('\u000B', new byte[]{(byte)0x08, (byte)0x05});
registerCodes('\f', new byte[]{(byte)0x08, (byte)0x06});
registerCodes('\r', new byte[]{(byte)0x08, (byte)0x07});
registerCodes('\u0020', new byte[]{(byte)0x07});
registerCodes('\u0021', new byte[]{(byte)0x09});
registerCodes('\"', new byte[]{(byte)0x0A});
registerCodes('\u0023', new byte[]{(byte)0x0C});
registerCodes('\u0024', new byte[]{(byte)0x0E});
registerCodes('\u0025', new byte[]{(byte)0x10});
registerCodes('\u0026', new byte[]{(byte)0x12});
registerCodes('\u0028', new byte[]{(byte)0x14});
registerCodes('\u0029', new byte[]{(byte)0x16});
registerCodes('\u002A', new byte[]{(byte)0x18});
registerCodes('\u002B', new byte[]{(byte)0x2C});
registerCodes('\u002C', new byte[]{(byte)0x1A});
registerCodes('\u002E', new byte[]{(byte)0x1C});
registerCodes('\u002F', new byte[]{(byte)0x1E});
registerCodes('\u0030', new byte[]{(byte)0x36});
registerCodes('\u0031', new byte[]{(byte)0x38});
registerCodes('\u0032', new byte[]{(byte)0x3A});
registerCodes('\u0033', new byte[]{(byte)0x3C});
registerCodes('\u0034', new byte[]{(byte)0x3E});
registerCodes('\u0035', new byte[]{(byte)0x40});
registerCodes('\u0036', new byte[]{(byte)0x42});
registerCodes('\u0037', new byte[]{(byte)0x44});
registerCodes('\u0038', new byte[]{(byte)0x46});
registerCodes('\u0039', new byte[]{(byte)0x48});
registerCodes('\u003A', new byte[]{(byte)0x20});
registerCodes('\u003B', new byte[]{(byte)0x22});
registerCodes('\u003C', new byte[]{(byte)0x2E});
registerCodes('\u003D', new byte[]{(byte)0x30});
registerCodes('\u003E', new byte[]{(byte)0x32});
registerCodes('\u003F', new byte[]{(byte)0x24});
registerCodes('\u0040', new byte[]{(byte)0x26});
registerCodes('\u0041', new byte[]{(byte)0x4A});
registerCodes('\u0042', new byte[]{(byte)0x4C});
registerCodes('\u0043', new byte[]{(byte)0x4D});
registerCodes('\u0044', new byte[]{(byte)0x4F});
registerCodes('\u0045', new byte[]{(byte)0x51});
registerCodes('\u0046', new byte[]{(byte)0x53});
registerCodes('\u0047', new byte[]{(byte)0x55});
registerCodes('\u0048', new byte[]{(byte)0x57});
registerCodes('\u0049', new byte[]{(byte)0x59});
registerCodes('\u004A', new byte[]{(byte)0x5B});
registerCodes('\u004B', new byte[]{(byte)0x5C});
registerCodes('\u004C', new byte[]{(byte)0x5E});
registerCodes('\u004D', new byte[]{(byte)0x60});
registerCodes('\u004E', new byte[]{(byte)0x62});
registerCodes('\u004F', new byte[]{(byte)0x64});
registerCodes('\u0050', new byte[]{(byte)0x66});
registerCodes('\u0051', new byte[]{(byte)0x68});
registerCodes('\u0052', new byte[]{(byte)0x69});
registerCodes('\u0053', new byte[]{(byte)0x6B});
registerCodes('\u0054', new byte[]{(byte)0x6D});
registerCodes('\u0055', new byte[]{(byte)0x6F});
registerCodes('\u0056', new byte[]{(byte)0x71});
registerCodes('\u0057', new byte[]{(byte)0x73});
registerCodes('\u0058', new byte[]{(byte)0x75});
registerCodes('\u0059', new byte[]{(byte)0x76});
registerCodes('\u005A', new byte[]{(byte)0x78});
registerCodes('\u005B', new byte[]{(byte)0x27});
registerCodes('\\', new byte[]{(byte)0x29});
registerCodes('\u005D', new byte[]{(byte)0x2A});
registerCodes('\u005E', new byte[]{(byte)0x2B, (byte)0x02});
registerCodes('\u005F', new byte[]{(byte)0x2B, (byte)0x03});
registerCodes('\u0060', new byte[]{(byte)0x2B, (byte)0x07});
registerCodes('\u0061', new byte[]{(byte)0x4A});
registerCodes('\u0062', new byte[]{(byte)0x4C});
registerCodes('\u0063', new byte[]{(byte)0x4D});
registerCodes('\u0064', new byte[]{(byte)0x4F});
registerCodes('\u0065', new byte[]{(byte)0x51});
registerCodes('\u0066', new byte[]{(byte)0x53});
registerCodes('\u0067', new byte[]{(byte)0x55});
registerCodes('\u0068', new byte[]{(byte)0x57});
registerCodes('\u0069', new byte[]{(byte)0x59});
registerCodes('\u006A', new byte[]{(byte)0x5B});
registerCodes('\u006B', new byte[]{(byte)0x5C});
registerCodes('\u006C', new byte[]{(byte)0x5E});
registerCodes('\u006D', new byte[]{(byte)0x60});
registerCodes('\u006E', new byte[]{(byte)0x62});
registerCodes('\u006F', new byte[]{(byte)0x64});
registerCodes('\u0070', new byte[]{(byte)0x66});
registerCodes('\u0071', new byte[]{(byte)0x68});
registerCodes('\u0072', new byte[]{(byte)0x69});
registerCodes('\u0073', new byte[]{(byte)0x6B});
registerCodes('\u0074', new byte[]{(byte)0x6D});
registerCodes('\u0075', new byte[]{(byte)0x6F});
registerCodes('\u0076', new byte[]{(byte)0x71});
registerCodes('\u0077', new byte[]{(byte)0x73});
registerCodes('\u0078', new byte[]{(byte)0x75});
registerCodes('\u0079', new byte[]{(byte)0x76});
registerCodes('\u007A', new byte[]{(byte)0x78});
registerCodes('\u007B', new byte[]{(byte)0x2B, (byte)0x09});
registerCodes('\u007C', new byte[]{(byte)0x2B, (byte)0x0B});
registerCodes('\u007D', new byte[]{(byte)0x2B, (byte)0x0D});
registerCodes('\u007E', new byte[]{(byte)0x2B, (byte)0x0F});
registerCodes('\u00A0', new byte[]{(byte)0x08, (byte)0x02});
registerCodes('\u00A1', new byte[]{(byte)0x2B, (byte)0x10});
registerCodes('\u00A2', new byte[]{(byte)0x34, (byte)0xA6});
registerCodes('\u00A3', new byte[]{(byte)0x34, (byte)0xA7});
registerCodes('\u00A4', new byte[]{(byte)0x34, (byte)0xA8});
registerCodes('\u00A5', new byte[]{(byte)0x34, (byte)0xA9});
registerCodes('\u00A6', new byte[]{(byte)0x2B, (byte)0x11});
registerCodes('\u00A7', new byte[]{(byte)0x34, (byte)0xAA});
registerCodes('\u00A8', new byte[]{(byte)0x2B, (byte)0x12});
registerCodes('\u00A9', new byte[]{(byte)0x34, (byte)0xAB});
registerCodes('\u00AB', new byte[]{(byte)0x33, (byte)0x05});
registerCodes('\u00AC', new byte[]{(byte)0x34, (byte)0xAC});
registerCodes('\u00AE', new byte[]{(byte)0x34, (byte)0xAD});
registerCodes('\u00AF', new byte[]{(byte)0x2B, (byte)0x13});
registerCodes('\u00B0', new byte[]{(byte)0x34, (byte)0xAE});
registerCodes('\u00B1', new byte[]{(byte)0x33, (byte)0x04});
registerCodes('\u00B2', new byte[]{(byte)0x3A});
registerCodes('\u00B3', new byte[]{(byte)0x3C});
registerCodes('\u00B4', new byte[]{(byte)0x2B, (byte)0x14});
registerCodes('\u00B5', new byte[]{(byte)0x34, (byte)0xAF});
registerCodes('\u00B6', new byte[]{(byte)0x34, (byte)0xB0});
registerCodes('\u00B7', new byte[]{(byte)0x34, (byte)0xB1});
registerCodes('\u00B8', new byte[]{(byte)0x2B, (byte)0x15});
registerCodes('\u00B9', new byte[]{(byte)0x38});
registerCodes('\u00BB', new byte[]{(byte)0x33, (byte)0x07});
registerCodes('\u00BC', new byte[]{(byte)0x37, (byte)0x12});
registerCodes('\u00BD', new byte[]{(byte)0x37, (byte)0x16});
registerCodes('\u00BE', new byte[]{(byte)0x37, (byte)0x1A});
registerCodes('\u00BF', new byte[]{(byte)0x2B, (byte)0x16});
registerCodes('\u00C6', new byte[]{(byte)0x4A, (byte)0x51});
registerCodes('\u00D7', new byte[]{(byte)0x33, (byte)0x09});
registerCodes('\u00DE', new byte[]{(byte)0x6D, (byte)0x57});
registerCodes('\u00DF', new byte[]{(byte)0x6B, (byte)0x6B});
registerCodes('\u00E6', new byte[]{(byte)0x4A, (byte)0x51});
registerCodes('\u00F7', new byte[]{(byte)0x33, (byte)0x0A});
registerCodes('\u00FE', new byte[]{(byte)0x6D, (byte)0x57});

registerUnprintableCodes('\u0001', new byte[]{(byte)0x03});
registerUnprintableCodes('\u0002', new byte[]{(byte)0x04});
registerUnprintableCodes('\u0003', new byte[]{(byte)0x05});
registerUnprintableCodes('\u0004', new byte[]{(byte)0x06});
registerUnprintableCodes('\u0005', new byte[]{(byte)0x07});
registerUnprintableCodes('\u0006', new byte[]{(byte)0x08});
registerUnprintableCodes('\u0007', new byte[]{(byte)0x09});
registerUnprintableCodes('\b', new byte[]{(byte)0x0A});
registerUnprintableCodes('\u000E', new byte[]{(byte)0x0B});
registerUnprintableCodes('\u000F', new byte[]{(byte)0x0C});
registerUnprintableCodes('\u0010', new byte[]{(byte)0x0D});
registerUnprintableCodes('\u0011', new byte[]{(byte)0x0E});
registerUnprintableCodes('\u0012', new byte[]{(byte)0x0F});
registerUnprintableCodes('\u0013', new byte[]{(byte)0x10});
registerUnprintableCodes('\u0014', new byte[]{(byte)0x11});
registerUnprintableCodes('\u0015', new byte[]{(byte)0x12});
registerUnprintableCodes('\u0016', new byte[]{(byte)0x13});
registerUnprintableCodes('\u0017', new byte[]{(byte)0x14});
registerUnprintableCodes('\u0018', new byte[]{(byte)0x15});
registerUnprintableCodes('\u0019', new byte[]{(byte)0x16});
registerUnprintableCodes('\u001A', new byte[]{(byte)0x17});
registerUnprintableCodes('\u001B', new byte[]{(byte)0x18});
registerUnprintableCodes('\u001C', new byte[]{(byte)0x19});
registerUnprintableCodes('\u001D', new byte[]{(byte)0x1A});
registerUnprintableCodes('\u001E', new byte[]{(byte)0x1B});
registerUnprintableCodes('\u001F', new byte[]{(byte)0x1C});
registerUnprintableCodes('\'', new byte[]{(byte)0x80});
registerUnprintableCodes('\u002D', new byte[]{(byte)0x82});
registerUnprintableCodes('\u007F', new byte[]{(byte)0x1D});
registerUnprintableCodes('\u0080', new byte[]{(byte)0x1E});
registerUnprintableCodes('\u0081', new byte[]{(byte)0x1F});
registerUnprintableCodes('\u0082', new byte[]{(byte)0x20});
registerUnprintableCodes('\u0083', new byte[]{(byte)0x21});
registerUnprintableCodes('\u0084', new byte[]{(byte)0x22});
registerUnprintableCodes('\u0085', new byte[]{(byte)0x23});
registerUnprintableCodes('\u0086', new byte[]{(byte)0x24});
registerUnprintableCodes('\u0087', new byte[]{(byte)0x25});
registerUnprintableCodes('\u0088', new byte[]{(byte)0x26});
registerUnprintableCodes('\u0089', new byte[]{(byte)0x27});
registerUnprintableCodes('\u008A', new byte[]{(byte)0x28});
registerUnprintableCodes('\u008B', new byte[]{(byte)0x29});
registerUnprintableCodes('\u008C', new byte[]{(byte)0x2A});
registerUnprintableCodes('\u008D', new byte[]{(byte)0x2B});
registerUnprintableCodes('\u008E', new byte[]{(byte)0x2C});
registerUnprintableCodes('\u008F', new byte[]{(byte)0x2D});
registerUnprintableCodes('\u0090', new byte[]{(byte)0x2E});
registerUnprintableCodes('\u0091', new byte[]{(byte)0x2F});
registerUnprintableCodes('\u0092', new byte[]{(byte)0x30});
registerUnprintableCodes('\u0093', new byte[]{(byte)0x31});
registerUnprintableCodes('\u0094', new byte[]{(byte)0x32});
registerUnprintableCodes('\u0095', new byte[]{(byte)0x33});
registerUnprintableCodes('\u0096', new byte[]{(byte)0x34});
registerUnprintableCodes('\u0097', new byte[]{(byte)0x35});
registerUnprintableCodes('\u0098', new byte[]{(byte)0x36});
registerUnprintableCodes('\u0099', new byte[]{(byte)0x37});
registerUnprintableCodes('\u009A', new byte[]{(byte)0x38});
registerUnprintableCodes('\u009B', new byte[]{(byte)0x39});
registerUnprintableCodes('\u009C', new byte[]{(byte)0x3A});
registerUnprintableCodes('\u009D', new byte[]{(byte)0x3B});
registerUnprintableCodes('\u009E', new byte[]{(byte)0x3C});
registerUnprintableCodes('\u009F', new byte[]{(byte)0x3D});
registerUnprintableCodes('\u00AD', new byte[]{(byte)0x83});

registerInternationalCodes('\u00AA', new byte[]{(byte)0x4A},
new byte[]{(byte)0x03});
registerInternationalCodes('\u00BA', new byte[]{(byte)0x64},
new byte[]{(byte)0x03});
registerInternationalCodes('\u00C0', new byte[]{(byte)0x4A},
new byte[]{(byte)0x0F});
registerInternationalCodes('\u00C1', new byte[]{(byte)0x4A},
new byte[]{(byte)0x0E});
registerInternationalCodes('\u00C2', new byte[]{(byte)0x4A},
new byte[]{(byte)0x12});
registerInternationalCodes('\u00C3', new byte[]{(byte)0x4A},
new byte[]{(byte)0x19});
registerInternationalCodes('\u00C4', new byte[]{(byte)0x4A},
new byte[]{(byte)0x13});
registerInternationalCodes('\u00C5', new byte[]{(byte)0x4A},
new byte[]{(byte)0x1A});
registerInternationalCodes('\u00C7', new byte[]{(byte)0x4D},
new byte[]{(byte)0x1C});
registerInternationalCodes('\u00C8', new byte[]{(byte)0x51},
new byte[]{(byte)0x0F});
registerInternationalCodes('\u00C9', new byte[]{(byte)0x51},
new byte[]{(byte)0x0E});
registerInternationalCodes('\u00CA', new byte[]{(byte)0x51},
new byte[]{(byte)0x12});
registerInternationalCodes('\u00CB', new byte[]{(byte)0x51},
new byte[]{(byte)0x13});
registerInternationalCodes('\u00CC', new byte[]{(byte)0x59},
new byte[]{(byte)0x0F});
registerInternationalCodes('\u00CD', new byte[]{(byte)0x59},
new byte[]{(byte)0x0E});
registerInternationalCodes('\u00CE', new byte[]{(byte)0x59},
new byte[]{(byte)0x12});
registerInternationalCodes('\u00CF', new byte[]{(byte)0x59},
new byte[]{(byte)0x13});
registerInternationalCodes('\u00D0', new byte[]{(byte)0x4F},
new byte[]{(byte)0x68});
registerInternationalCodes('\u00D1', new byte[]{(byte)0x62},
new byte[]{(byte)0x19});
registerInternationalCodes('\u00D2', new byte[]{(byte)0x64},
new byte[]{(byte)0x0F});
registerInternationalCodes('\u00D3', new byte[]{(byte)0x64},
new byte[]{(byte)0x0E});
registerInternationalCodes('\u00D4', new byte[]{(byte)0x64},
new byte[]{(byte)0x12});
registerInternationalCodes('\u00D5', new byte[]{(byte)0x64},
new byte[]{(byte)0x19});
registerInternationalCodes('\u00D6', new byte[]{(byte)0x64},
new byte[]{(byte)0x13});
registerInternationalCodes('\u00D8', new byte[]{(byte)0x64},
new byte[]{(byte)0x21});
registerInternationalCodes('\u00D9', new byte[]{(byte)0x6F},
new byte[]{(byte)0x0F});
registerInternationalCodes('\u00DA', new byte[]{(byte)0x6F},
new byte[]{(byte)0x0E});
registerInternationalCodes('\u00DB', new byte[]{(byte)0x6F},
new byte[]{(byte)0x12});
registerInternationalCodes('\u00DC', new byte[]{(byte)0x6F},
new byte[]{(byte)0x13});
registerInternationalCodes('\u00DD', new byte[]{(byte)0x76},
new byte[]{(byte)0x0E});
registerInternationalCodes('\u00E0', new byte[]{(byte)0x4A},
new byte[]{(byte)0x0F});
registerInternationalCodes('\u00E1', new byte[]{(byte)0x4A},
new byte[]{(byte)0x0E});
registerInternationalCodes('\u00E2', new byte[]{(byte)0x4A},
new byte[]{(byte)0x12});
registerInternationalCodes('\u00E3', new byte[]{(byte)0x4A},
new byte[]{(byte)0x19});
registerInternationalCodes('\u00E4', new byte[]{(byte)0x4A},
new byte[]{(byte)0x13});
registerInternationalCodes('\u00E5', new byte[]{(byte)0x4A},
new byte[]{(byte)0x1A});
registerInternationalCodes('\u00E7', new byte[]{(byte)0x4D},
new byte[]{(byte)0x1C});
registerInternationalCodes('\u00E8', new byte[]{(byte)0x51},
new byte[]{(byte)0x0F});
registerInternationalCodes('\u00E9', new byte[]{(byte)0x51},
new byte[]{(byte)0x0E});
registerInternationalCodes('\u00EA', new byte[]{(byte)0x51},
new byte[]{(byte)0x12});
registerInternationalCodes('\u00EB', new byte[]{(byte)0x51},
new byte[]{(byte)0x13});
registerInternationalCodes('\u00EC', new byte[]{(byte)0x59},
new byte[]{(byte)0x0F});
registerInternationalCodes('\u00ED', new byte[]{(byte)0x59},
new byte[]{(byte)0x0E});
registerInternationalCodes('\u00EE', new byte[]{(byte)0x59},
new byte[]{(byte)0x12});
registerInternationalCodes('\u00EF', new byte[]{(byte)0x59},
new byte[]{(byte)0x13});
registerInternationalCodes('\u00F0', new byte[]{(byte)0x4F},
new byte[]{(byte)0x68});
registerInternationalCodes('\u00F1', new byte[]{(byte)0x62},
new byte[]{(byte)0x19});
registerInternationalCodes('\u00F2', new byte[]{(byte)0x64},
new byte[]{(byte)0x0F});
registerInternationalCodes('\u00F3', new byte[]{(byte)0x64},
new byte[]{(byte)0x0E});
registerInternationalCodes('\u00F4', new byte[]{(byte)0x64},
new byte[]{(byte)0x12});
registerInternationalCodes('\u00F5', new byte[]{(byte)0x64},
new byte[]{(byte)0x19});
registerInternationalCodes('\u00F6', new byte[]{(byte)0x64},
new byte[]{(byte)0x13});
registerInternationalCodes('\u00F8', new byte[]{(byte)0x64},
new byte[]{(byte)0x21});
registerInternationalCodes('\u00F9', new byte[]{(byte)0x6F},
new byte[]{(byte)0x0F});
registerInternationalCodes('\u00FA', new byte[]{(byte)0x6F},
new byte[]{(byte)0x0E});
registerInternationalCodes('\u00FB', new byte[]{(byte)0x6F},
new byte[]{(byte)0x12});
registerInternationalCodes('\u00FC', new byte[]{(byte)0x6F},
new byte[]{(byte)0x13});
registerInternationalCodes('\u00FD', new byte[]{(byte)0x76},
new byte[]{(byte)0x0E});
registerInternationalCodes('\u00FF', new byte[]{(byte)0x76},
new byte[]{(byte)0x13});
static final byte INTERNATIONAL_EXTRA_PLACEHOLDER = (byte)0x02;

// see Index.writeCrazyCodes for details on writing crazy codes

/** initial value of every packed crazy code byte */
static final byte CRAZY_CODE_START = (byte)0x80;
static final byte CRAZY_CODE_1 = (byte)0x02;
/** crazy flag value which is dropped when it trails the other flags */
static final byte CRAZY_CODE_2 = (byte)0x03;
/** bytes written after the (possibly empty) packed crazy codes */
static final byte[] CRAZY_CODES_SUFFIX = {
  (byte)0xFF, (byte)0x02, (byte)0x80, (byte)0xFF, (byte)0x80
};

// stash the codes in some resource files

/** resource path of the codes file for the first 256 chars */
private static final String CODES_FILE =
  "com/healthmarketscience/jackcess/index_codes.txt";
/** resource path of the codes file for the remaining chars */
private static final String EXT_CODES_FILE =
  "com/healthmarketscience/jackcess/index_codes_ext.txt";

/**
 * The kinds of char handlers.  Every type has a one letter prefix code and
 * knows how to turn an array of code strings into the matching CharHandler.
 */
enum Type {
  /** prefix "S", parsed via parseSimpleCodes */
  SIMPLE("S") {
    @Override
    public CharHandler parseCodes(String[] codeStrings) {
      return parseSimpleCodes(codeStrings);
    }
  },
  /** prefix "I", parsed via parseInternationalCodes */
  INTERNATIONAL("I") {
    @Override
    public CharHandler parseCodes(String[] codeStrings) {
      return parseInternationalCodes(codeStrings);
    }
  },
  /** prefix "U", parsed via parseUnprintableCodes */
  UNPRINTABLE("U") {
    @Override
    public CharHandler parseCodes(String[] codeStrings) {
      return parseUnprintableCodes(codeStrings);
    }
  },
  /** prefix "P", parsed via parseUnprintableExtCodes */
  UNPRINTABLE_EXT("P") {
    @Override
    public CharHandler parseCodes(String[] codeStrings) {
      return parseUnprintableExtCodes(codeStrings);
    }
  },
  /** prefix "Z", parsed via parseInternationalExtCodes */
  INTERNATIONAL_EXT("Z") {
    @Override
    public CharHandler parseCodes(String[] codeStrings) {
      return parseInternationalExtCodes(codeStrings);
    }
  },
  /** prefix "X", the code strings are not looked at */
  IGNORED("X") {
    @Override
    public CharHandler parseCodes(String[] codeStrings) {
      return IGNORED_CHAR_HANDLER;
    }
  };

  /** one letter code identifying this type */
  private final String _prefixCode;

  Type(String code) {
    _prefixCode = code;
  }

  /**
   * @return the one letter prefix code of this type
   */
  public String getPrefixCode() {
    return _prefixCode;
  }

  /**
   * Creates the handler of this type for the given code strings.
   *
   * @param codeStrings the code strings for a single char
   * @return the handler for the char
   */
  public abstract CharHandler parseCodes(String[] codeStrings);
}

private IndexCodes() {
/**
 * Base class for the per-char index code handlers.  Apart from the type,
 * every getter has a "nothing to contribute" default ({@code null} for
 * byte arrays, {@code 0} for single bytes) which subclasses override as
 * needed.
 */
abstract static class CharHandler {

  /**
   * @return the type of this handler
   */
  public abstract Type getType();

  /**
   * @return the inline bytes for the char, {@code null} by default
   */
  public byte[] getInlineBytes() {
    return null;
  }

  /**
   * @return the extra bytes for the char, {@code null} by default
   */
  public byte[] getExtraBytes() {
    return null;
  }

  /**
   * @return the unprintable bytes for the char, {@code null} by default
   */
  public byte[] getUnprintableBytes() {
    return null;
  }

  /**
   * @return the extra byte modifier for the char, {@code 0} by default
   */
  public byte getExtraByteModifier() {
    return 0;
  }

  /**
   * @return the crazy flag for the char, {@code 0} by default
   */
  public byte getCrazyFlag() {
    return 0;
  }
}

private static void registerCodes(char c, byte[] codes) {
CODES.put(c, codes);
/**
 * CharHandler of type SIMPLE, which only provides inline bytes.
 */
private static final class SimpleCharHandler extends CharHandler {
  /** inline bytes, assigned once at construction (array is not copied) */
  private final byte[] _bytes;

  private SimpleCharHandler(byte[] bytes) {
    _bytes = bytes;
  }

  @Override public Type getType() {
    return Type.SIMPLE;
  }

  @Override public byte[] getInlineBytes() {
    return _bytes;
  }
}
private static void registerUnprintableCodes(char c, byte[] codes) {
UNPRINTABLE_CODES.put(c, codes);

/**
 * CharHandler of type INTERNATIONAL, which provides inline bytes as well as
 * extra bytes.
 */
private static final class InternationalCharHandler extends CharHandler {
  /** inline bytes, assigned once at construction (array is not copied) */
  private final byte[] _bytes;
  /** extra bytes, assigned once at construction (array is not copied) */
  private final byte[] _extraBytes;

  private InternationalCharHandler(byte[] bytes, byte[] extraBytes) {
    _bytes = bytes;
    _extraBytes = extraBytes;
  }

  @Override public Type getType() {
    return Type.INTERNATIONAL;
  }

  @Override public byte[] getInlineBytes() {
    return _bytes;
  }

  @Override public byte[] getExtraBytes() {
    return _extraBytes;
  }
}

/**
 * CharHandler of type UNPRINTABLE, which only provides unprintable bytes
 * (no inline bytes).
 */
private static final class UnprintableCharHandler extends CharHandler {
  /** unprintable bytes, assigned once at construction (not copied) */
  private final byte[] _unprintBytes;

  private UnprintableCharHandler(byte[] unprintBytes) {
    _unprintBytes = unprintBytes;
  }

  @Override public Type getType() {
    return Type.UNPRINTABLE;
  }

  @Override public byte[] getUnprintableBytes() {
    return _unprintBytes;
  }
}

/**
 * CharHandler of type UNPRINTABLE_EXT, which only provides an extra byte
 * modifier (no inline, extra or unprintable bytes).
 */
private static final class UnprintableExtCharHandler extends CharHandler {
  /** extra byte modifier, assigned once at construction */
  private final byte _extraByteMod;

  /**
   * @param extraByteMod the modifier value; must not be {@code null} since
   *                     it is unboxed immediately
   */
  private UnprintableExtCharHandler(Byte extraByteMod) {
    _extraByteMod = extraByteMod;
  }

  @Override public Type getType() {
    return Type.UNPRINTABLE_EXT;
  }

  @Override public byte getExtraByteModifier() {
    return _extraByteMod;
  }
}

/**
 * CharHandler of type INTERNATIONAL_EXT, which provides inline bytes, extra
 * bytes and a crazy flag.
 */
private static final class InternationalExtCharHandler extends CharHandler {
  /** inline bytes, assigned once at construction (array is not copied) */
  private final byte[] _bytes;
  /** extra bytes, assigned once at construction (array is not copied) */
  private final byte[] _extraBytes;
  /** crazy flag, assigned once at construction */
  private final byte _crazyFlag;

  private InternationalExtCharHandler(byte[] bytes, byte[] extraBytes,
                                      byte crazyFlag) {
    _bytes = bytes;
    _extraBytes = extraBytes;
    _crazyFlag = crazyFlag;
  }

  @Override public Type getType() {
    return Type.INTERNATIONAL_EXT;
  }

  @Override public byte[] getInlineBytes() {
    return _bytes;
  }

  @Override public byte[] getExtraBytes() {
    return _extraBytes;
  }

  @Override public byte getCrazyFlag() {
    return _crazyFlag;
  }
}

/**
 * Shared handler of type IGNORED.  It overrides nothing but the type, so
 * all of its code getters return the CharHandler defaults.
 */
static final CharHandler IGNORED_CHAR_HANDLER = new CharHandler() {
  @Override
  public Type getType() {
    return Type.IGNORED;
  }
};

/**
 * Shared handler for surrogate chars.  It reports the IGNORED type, but
 * fails as soon as the inline bytes are requested.
 */
static final CharHandler SURROGATE_CHAR_HANDLER = new CharHandler() {
  @Override
  public Type getType() {
    return Type.IGNORED;
  }

  /**
   * @throws IllegalStateException always
   */
  @Override
  public byte[] getInlineBytes() {
    throw new IllegalStateException(
        "Surrogate pair chars are not handled");
  }
};

/** first char covered by the main codes file */
private static final char FIRST_CHAR = Character.MIN_VALUE;
/** last char covered by the main codes file */
private static final char LAST_CHAR = '\u00FF';
/** first char covered by the extended codes file */
private static final char FIRST_EXT_CHAR = LAST_CHAR + 1;
/** last char covered by the extended codes file */
private static final char LAST_EXT_CHAR = Character.MAX_VALUE;

/**
 * Holder for the handlers of the first 256 chars.  A nested class is used
 * so that the handlers are lazy loaded.
 */
private static final class Codes
{
  /** handlers indexed by char value, read from CODES_FILE */
  private static final CharHandler[] _values =
    loadCodes(CODES_FILE, FIRST_CHAR, LAST_CHAR);
}
private static void registerInternationalCodes(
char c, byte[] inlineCodes, byte[] extraCodes) {
INTERNATIONAL_CODES.put(c,
new InternationalCodes(inlineCodes, extraCodes));
/**
 * Holder for the handlers of the rest of the chars in BMP 0
 * (FIRST_EXT_CHAR through LAST_EXT_CHAR).  The handlers are kept in a
 * nested class so that they are loaded lazily, when this holder is first
 * touched.
 */
private static final class ExtCodes
{
  private static final CharHandler[] _values;

  static {
    _values = loadCodes(EXT_CODES_FILE, FIRST_EXT_CHAR, LAST_EXT_CHAR);
  }
}
// private constructor: instances cannot be created from outside this class
private IndexCodes() {
}

/**
 * Returns the CharHandler for the given character.
 *
 * @param c the character to look up
 * @return the handler from the {@code Codes} table for chars up to
 *         LAST_CHAR, otherwise the handler from the {@code ExtCodes} table
 */
static CharHandler getCharHandler(char c)
{
  if(c > LAST_CHAR) {
    // the extended table is indexed relative to its first char
    return ExtCodes._values[
        asUnsignedChar(c) - asUnsignedChar(FIRST_EXT_CHAR)];
  }

  // the basic table is indexed directly by the char value
  return Codes._values[c];
}

/**
 * Loads the CharHandlers for the chars in the range [firstChar, lastChar]
 * from the given codes resource.  The resource is read as US-ASCII, one
 * line per char in ascending char order.  Surrogate chars have no line in
 * the resource and are mapped to SURROGATE_CHAR_HANDLER.
 *
 * @param codesFilePath path of the codes resource, resolved through the
 *                      current thread's context class loader
 * @param firstChar first char (inclusive) described by the resource
 * @param lastChar last char (inclusive) described by the resource
 * @return the handlers, where index 0 corresponds to firstChar
 * @throws RuntimeException if the resource cannot be found, cannot be read,
 *         or ends before every char in the range has been given a handler
 */
private static CharHandler[] loadCodes(String codesFilePath,
                                       char firstChar, char lastChar)
{
  int start = asUnsignedChar(firstChar);
  int end = asUnsignedChar(lastChar);
  CharHandler[] values = new CharHandler[(end - start) + 1];

  // map each type's prefix code to the type which parses that kind of line
  Map<String,Type> prefixMap = new HashMap<String,Type>();
  for(Type type : Type.values()) {
    prefixMap.put(type.getPrefixCode(), type);
  }

  // getResourceAsStream returns null (it does not throw) for a missing
  // resource, so check explicitly in order to give a useful error
  java.io.InputStream in = Thread.currentThread().getContextClassLoader()
    .getResourceAsStream(codesFilePath);
  if(in == null) {
    throw new RuntimeException("failed loading index codes file " +
                               codesFilePath + ", resource not found");
  }

  BufferedReader reader = null;
  try {

    reader = new BufferedReader(new InputStreamReader(in, "US-ASCII"));
    for(int i = start; i <= end; ++i) {
      char c = (char)i;
      CharHandler ch = null;
      if(Character.isHighSurrogate(c) || Character.isLowSurrogate(c)) {
        // surrogate chars are not included in the codes files
        ch = SURROGATE_CHAR_HANDLER;
      } else {
        String codeLine = reader.readLine();
        if(codeLine == null) {
          // wrapped below with the name of the offending file
          throw new IOException("unexpected end of file, no codes for char 0x" +
                                Integer.toHexString(i));
        }
        ch = parseCodes(prefixMap, codeLine);
      }
      values[(i - start)] = ch;
    }

  } catch(IOException e) {
    throw new RuntimeException("failed loading index codes file " +
                               codesFilePath, e);
  } finally {
    try {
      if(reader != null) {
        // closing the reader also closes the underlying stream
        reader.close();
      } else {
        // the reader was never created, close the raw stream instead
        in.close();
      }
    } catch(IOException ex) {
      // ignored
    }
  }

  return values;
}

/**
 * Parses one line of a codes file into a CharHandler.  The first character
 * of the line is the type prefix.  If the line is longer than one
 * character, the second character is skipped (without being checked) and
 * the remainder is split on commas into the code strings handed to the
 * type's own parser.
 *
 * @param prefixMap map from prefix code to the Type which parses such lines
 * @param codeLine the line to parse
 * @return the handler created by the matching Type
 * @throws IllegalStateException if the line is empty or its prefix is not
 *         present in prefixMap
 */
private static CharHandler parseCodes(Map<String,Type> prefixMap,
                                      String codeLine)
{
  if(codeLine.length() == 0) {
    throw new IllegalStateException("Unexpected empty code line");
  }

  String prefix = codeLine.substring(0, 1);
  Type type = prefixMap.get(prefix);
  if(type == null) {
    throw new IllegalStateException("Unexpected code prefix '" + prefix +
                                    "' in code line '" + codeLine + "'");
  }

  String suffix = ((codeLine.length() > 1) ? codeLine.substring(2) : "");
  // limit of -1 keeps trailing empty strings, so the number of code
  // strings always reflects the number of commas
  return type.parseCodes(suffix.split(",", -1));
}

/**
 * Builds a SimpleCharHandler from the code strings of a codes file line.
 *
 * @param codeStrings must hold exactly one non-empty hex string
 * @throws IllegalStateException if the number of code strings is not one
 *         or the code string is empty
 */
private static CharHandler parseSimpleCodes(String[] codeStrings)
{
  if(codeStrings.length == 1) {
    byte[] codeBytes = codesToBytes(codeStrings[0], true);
    return new SimpleCharHandler(codeBytes);
  }

  throw new IllegalStateException("Unexpected code strings " +
                                  Arrays.asList(codeStrings));
}

/**
 * Builds an InternationalCharHandler from the code strings of a codes file
 * line.
 *
 * @param codeStrings must hold exactly two non-empty hex strings, which are
 *                    passed to the handler in the same order
 * @throws IllegalStateException if the number of code strings is not two
 *         or either code string is empty
 */
private static CharHandler parseInternationalCodes(String[] codeStrings)
{
  if(codeStrings.length == 2) {
    byte[] firstBytes = codesToBytes(codeStrings[0], true);
    byte[] secondBytes = codesToBytes(codeStrings[1], true);
    return new InternationalCharHandler(firstBytes, secondBytes);
  }

  throw new IllegalStateException("Unexpected code strings " +
                                  Arrays.asList(codeStrings));
}

/**
 * Builds an UnprintableCharHandler from the code strings of a codes file
 * line.
 *
 * @param codeStrings must hold exactly one non-empty hex string
 * @throws IllegalStateException if the number of code strings is not one
 *         or the code string is empty
 */
private static CharHandler parseUnprintableCodes(String[] codeStrings)
{
  if(codeStrings.length == 1) {
    byte[] unprintBytes = codesToBytes(codeStrings[0], true);
    return new UnprintableCharHandler(unprintBytes);
  }

  throw new IllegalStateException("Unexpected code strings " +
                                  Arrays.asList(codeStrings));
}

/**
 * Builds an UnprintableExtCharHandler from the code strings of a codes file
 * line.
 *
 * @param codeStrings must hold exactly one hex string which decodes to
 *                    exactly one byte
 * @throws IllegalStateException if the number of code strings is not one,
 *         the code string is empty, or it does not decode to a single byte
 */
private static CharHandler parseUnprintableExtCodes(String[] codeStrings)
{
  if(codeStrings.length == 1) {
    byte[] decoded = codesToBytes(codeStrings[0], true);
    if(decoded.length == 1) {
      return new UnprintableExtCharHandler(decoded[0]);
    }
  }

  // reached for both a wrong number of strings and a wrong number of bytes
  throw new IllegalStateException("Unexpected code strings " +
                                  Arrays.asList(codeStrings));
}

/**
 * Builds an InternationalExtCharHandler from the code strings of a codes
 * file line.
 *
 * @param codeStrings must hold exactly three strings: a required hex
 *                    string, an optional (possibly empty) hex string, and a
 *                    flag selector where "1" selects CRAZY_CODE_1 and
 *                    anything else selects CRAZY_CODE_2
 * @throws IllegalStateException if the number of code strings is not three
 *         or the first code string is empty
 */
private static CharHandler parseInternationalExtCodes(String[] codeStrings)
{
  if(codeStrings.length != 3) {
    throw new IllegalStateException("Unexpected code strings " +
                                    Arrays.asList(codeStrings));
  }

  byte crazyFlag;
  if("1".equals(codeStrings[2])) {
    crazyFlag = CRAZY_CODE_1;
  } else {
    crazyFlag = CRAZY_CODE_2;
  }

  byte[] inlineBytes = codesToBytes(codeStrings[0], true);
  // the second code string may be empty, in which case this is null
  byte[] extraBytes = codesToBytes(codeStrings[1], false);
  return new InternationalExtCharHandler(inlineBytes, extraBytes, crazyFlag);
}

/**
 * Converts a string of hex digit pairs (e.g. "2B0F") into the
 * corresponding bytes.
 *
 * @param codes the hex string, two digits per byte
 * @param required whether an empty string is an error
 * @return the decoded bytes, or {@code null} if the string is empty and not
 *         required
 * @throws IllegalStateException if the string is empty but required, or if
 *         it has an odd number of characters (which would otherwise
 *         silently drop the trailing digit)
 * @throws NumberFormatException if a pair is not a valid hex value
 */
private static byte[] codesToBytes(String codes, boolean required)
{
  int numChars = codes.length();
  if(numChars == 0) {
    if(required) {
      throw new IllegalStateException("empty code bytes");
    }
    return null;
  }
  if((numChars % 2) != 0) {
    throw new IllegalStateException(
        "odd number of hex digits in code bytes '" + codes + "'");
  }

  byte[] bytes = new byte[numChars / 2];
  for(int i = 0; i < bytes.length; ++i) {
    int charIdx = i*2;
    // parse as an int first, values above 0x7F do not fit in a signed byte
    bytes[i] = (byte)(Integer.parseInt(codes.substring(charIdx, charIdx + 2),
                                       16));
  }
  return bytes;
}

/**
 * Returns the value of the given char as a non-negative int in the range
 * 0 to 0xFFFF.
 */
private static int asUnsignedChar(char c)
{
  // char is an unsigned 16-bit type, so widening it to int never
  // sign-extends and yields the same value as masking with 0xFFFF
  int unsignedValue = c;
  return unsignedValue;
}

static boolean isNullEntry(byte startEntryFlag) {
return((startEntryFlag == ASC_NULL_FLAG) ||
(startEntryFlag == DESC_NULL_FLAG));
@@ -448,14 +432,4 @@ public class IndexCodes {
return(isAscending ? ASC_START_FLAG : DESC_START_FLAG);
}
/**
 * Simple holder pairing the "inline" codes with the "extra" codes for a
 * character.  Both arrays are stored as given (not copied).
 */
static final class InternationalCodes {

  /** the inline codes given at construction */
  public final byte[] _inlineCodes;
  /** the extra codes given at construction */
  public final byte[] _extraCodes;

  private InternationalCodes(byte[] inlineCodes, byte[] extraCodes) {
    this._inlineCodes = inlineCodes;
    this._extraCodes = extraCodes;
  }
}
}

+ 256
- 0
src/resources/com/healthmarketscience/jackcess/index_codes.txt View File

@@ -0,0 +1,256 @@
X
U,03
U,04
U,05
U,06
U,07
U,08
U,09
U,0A
S,0803
S,0804
S,0805
S,0806
S,0807
U,0B
U,0C
U,0D
U,0E
U,0F
U,10
U,11
U,12
U,13
U,14
U,15
U,16
U,17
U,18
U,19
U,1A
U,1B
U,1C
S,07
S,09
S,0A
S,0C
S,0E
S,10
S,12
U,80
S,14
S,16
S,18
S,2C
S,1A
U,82
S,1C
S,1E
S,36
S,38
S,3A
S,3C
S,3E
S,40
S,42
S,44
S,46
S,48
S,20
S,22
S,2E
S,30
S,32
S,24
S,26
S,4A
S,4C
S,4D
S,4F
S,51
S,53
S,55
S,57
S,59
S,5B
S,5C
S,5E
S,60
S,62
S,64
S,66
S,68
S,69
S,6B
S,6D
S,6F
S,71
S,73
S,75
S,76
S,78
S,27
S,29
S,2A
S,2B02
S,2B03
S,2B07
S,4A
S,4C
S,4D
S,4F
S,51
S,53
S,55
S,57
S,59
S,5B
S,5C
S,5E
S,60
S,62
S,64
S,66
S,68
S,69
S,6B
S,6D
S,6F
S,71
S,73
S,75
S,76
S,78
S,2B09
S,2B0B
S,2B0D
S,2B0F
U,1D
U,1E
U,1F
U,20
U,21
U,22
U,23
U,24
U,25
U,26
U,27
U,28
U,29
U,2A
U,2B
U,2C
U,2D
U,2E
U,2F
U,30
U,31
U,32
U,33
U,34
U,35
U,36
U,37
U,38
U,39
U,3A
U,3B
U,3C
U,3D
S,0802
S,2B10
S,34A6
S,34A7
S,34A8
S,34A9
S,2B11
S,34AA
S,2B12
S,34AB
I,4A,03
S,3305
S,34AC
U,83
S,34AD
S,2B13
S,34AE
S,3304
S,3A
S,3C
S,2B14
S,34AF
S,34B0
S,34B1
S,2B15
S,38
I,64,03
S,3307
S,3712
S,3716
S,371A
S,2B16
I,4A,0F
I,4A,0E
I,4A,12
I,4A,19
I,4A,13
I,4A,1A
S,4A51
I,4D,1C
I,51,0F
I,51,0E
I,51,12
I,51,13
I,59,0F
I,59,0E
I,59,12
I,59,13
I,4F,68
I,62,19
I,64,0F
I,64,0E
I,64,12
I,64,19
I,64,13
S,3309
I,64,21
I,6F,0F
I,6F,0E
I,6F,12
I,6F,13
I,76,0E
S,6D57
S,6B6B
I,4A,0F
I,4A,0E
I,4A,12
I,4A,19
I,4A,13
I,4A,1A
S,4A51
I,4D,1C
I,51,0F
I,51,0E
I,51,12
I,51,13
I,59,0F
I,59,0E
I,59,12
I,59,13
I,4F,68
I,62,19
I,64,0F
I,64,0E
I,64,12
I,64,19
I,64,13
S,330A
I,64,21
I,6F,0F
I,6F,0E
I,6F,12
I,6F,13
I,76,0E
S,6D57
I,76,13

+ 63232
- 0
src/resources/com/healthmarketscience/jackcess/index_codes_ext.txt
File diff suppressed because it is too large
View File


Loading…
Cancel
Save