summaryrefslogtreecommitdiffstats
path: root/src/java/com/healthmarketscience/jackcess/Index.java
diff options
context:
space:
mode:
Diffstat (limited to 'src/java/com/healthmarketscience/jackcess/Index.java')
-rw-r--r--src/java/com/healthmarketscience/jackcess/Index.java301
1 files changed, 218 insertions, 83 deletions
diff --git a/src/java/com/healthmarketscience/jackcess/Index.java b/src/java/com/healthmarketscience/jackcess/Index.java
index 2da1fc2..3bf9c8d 100644
--- a/src/java/com/healthmarketscience/jackcess/Index.java
+++ b/src/java/com/healthmarketscience/jackcess/Index.java
@@ -35,8 +35,6 @@ import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
-import java.util.Iterator;
-import java.util.LinkedList;
import java.util.List;
import java.util.Map;
@@ -1045,110 +1043,96 @@ public abstract class Index implements Comparable<Index> {
}
// now, convert each character to a "code" of one or more bytes
- List<ExtraCodes> unprintableCodes = null;
- List<ExtraCodes> internationalCodes = null;
+ ExtraCodesOutputStream extraCodes = null;
+ ExtraCodesOutputStream unprintableCodes = null;
+ ExtraCodesOutputStream crazyCodes = null;
int charOffset = 0;
for(int i = 0; i < str.length(); ++i) {
+
char c = str.charAt(i);
- Character cKey = c;
+ CharHandler ch = getCharHandler(c);
- byte[] bytes = CODES.get(cKey);
+ int curCharOffset = charOffset;
+ byte[] bytes = ch.getInlineBytes();
if(bytes != null) {
- // simple case, write the codes we found
+ // write the "inline" codes immediately
tmpBout.write(bytes);
+
+ // only increment the charOffset for chars with inline codes
++charOffset;
- continue;
}
- bytes = UNPRINTABLE_CODES.get(cKey);
- if(bytes != null) {
- // we do not write anything to tmpBout
- if(bytes.length > 0) {
- if(unprintableCodes == null) {
- unprintableCodes = new LinkedList<ExtraCodes>();
- }
-
- // keep track of the extra codes for later
- unprintableCodes.add(new ExtraCodes(charOffset, bytes));
- }
-
- // note, we do _not_ increment the charOffset for unprintable chars
+ if(ch.getType() == Type.SIMPLE) {
+ // common case, skip further code handling
continue;
}
- InternationalCodes inatCodes = INTERNATIONAL_CODES.get(cKey);
- if(inatCodes != null) {
-
- // we write the "inline" portion of the international codes
- // immediately, and queue the extra codes for later
- tmpBout.write(inatCodes._inlineCodes);
-
- if(internationalCodes == null) {
- internationalCodes = new LinkedList<ExtraCodes>();
+ bytes = ch.getExtraBytes();
+ byte extraCodeModifier = ch.getExtraByteModifier();
+ if((bytes != null) || (extraCodeModifier != 0)) {
+ if(extraCodes == null) {
+ extraCodes = new ExtraCodesOutputStream(str.length());
}
// keep track of the extra codes for later
- internationalCodes.add(new ExtraCodes(charOffset,
- inatCodes._extraCodes));
+ writeExtraCodes(curCharOffset, bytes, extraCodeModifier, extraCodes);
+ }
- ++charOffset;
- continue;
+ bytes = ch.getUnprintableBytes();
+ if(bytes != null) {
+ if(unprintableCodes == null) {
+ unprintableCodes = new ExtraCodesOutputStream();
+ }
+
+ // keep track of the unprintable codes for later
+ writeUnprintableCodes(curCharOffset, bytes, unprintableCodes);
}
+
+ byte crazyFlag = ch.getCrazyFlag();
+ if(crazyFlag != 0) {
+ if(crazyCodes == null) {
+ crazyCodes = new ExtraCodesOutputStream();
+ }
- // bummer, out of luck
- throw new IOException("unmapped string index value " + c);
+ // keep track of the crazy flags for later
+ crazyCodes.write(crazyFlag);
+ }
}
// write end text flag
tmpBout.write(END_TEXT);
- boolean hasExtraText = ((unprintableCodes != null) ||
- (internationalCodes != null));
- if(hasExtraText) {
+ boolean hasExtraCodes = trimExtraCodes(
+ extraCodes, (byte)0, INTERNATIONAL_EXTRA_PLACEHOLDER);
+ boolean hasUnprintableCodes = (unprintableCodes != null);
+ boolean hasCrazyCodes = (crazyCodes != null);
+ if(hasExtraCodes || hasUnprintableCodes || hasCrazyCodes) {
// we write all the international extra bytes first
- if(internationalCodes != null) {
-
- // we write a placeholder char for each non-international char before
- // the extra chars for the international char
- charOffset = 0;
- Iterator<ExtraCodes> iter = internationalCodes.iterator();
- while(iter.hasNext()) {
- ExtraCodes extraCodes = iter.next();
- while(charOffset < extraCodes._charOffset) {
- tmpBout.write(INTERNATIONAL_EXTRA_PLACEHOLDER);
- ++charOffset;
- }
- tmpBout.write(extraCodes._extraCodes);
- ++charOffset;
- }
+ if(hasExtraCodes) {
+ extraCodes.writeTo(tmpBout);
}
- // then we write all the unprintable extra bytes
- if(unprintableCodes != null) {
+ if(hasCrazyCodes || hasUnprintableCodes) {
+
+ // write 2 more end flags
+ tmpBout.write(END_TEXT);
+ tmpBout.write(END_TEXT);
- // write a single prefix for all unprintable chars
- tmpBout.write(UNPRINTABLE_COMMON_PREFIX);
+ // next come the crazy flags
+ if(hasCrazyCodes) {
+ writeCrazyCodes(crazyCodes, tmpBout);
+ }
+
+ // then we write all the unprintable extra bytes
+ if(hasUnprintableCodes) {
+
+ // write another end flag
+ tmpBout.write(END_TEXT);
- // we write a whacky combo of bytes for each unprintable char which
- // includes a funky offset and extra char itself
- Iterator<ExtraCodes> iter = unprintableCodes.iterator();
- while(iter.hasNext()) {
- ExtraCodes extraCodes = iter.next();
- int offset =
- (UNPRINTABLE_COUNT_START +
- (UNPRINTABLE_COUNT_MULTIPLIER * extraCodes._charOffset))
- | UNPRINTABLE_OFFSET_FLAGS;
-
- // write offset as big-endian short
- tmpBout.write((offset >> 8) & 0xFF);
- tmpBout.write(offset & 0xFF);
-
- tmpBout.write(UNPRINTABLE_MIDFIX);
- tmpBout.write(extraCodes._extraCodes);
+ unprintableCodes.writeTo(tmpBout);
}
}
-
}
// handle descending order by inverting the bytes
@@ -1168,6 +1152,111 @@ public abstract class Index implements Comparable<Index> {
bout.write(END_EXTRA_TEXT);
}
+ private static void writeExtraCodes(
+ int charOffset, byte[] bytes, byte extraCodeModifier,
+ ExtraCodesOutputStream extraCodes)
+ throws IOException
+ {
+ // we fill in a placeholder value for any chars w/out extra codes
+ int numChars = extraCodes.getNumChars();
+ if(numChars < charOffset) {
+ int fillChars = charOffset - numChars;
+ extraCodes.write(fillChars, INTERNATIONAL_EXTRA_PLACEHOLDER);
+ extraCodes.incrementNumChars(fillChars);
+ }
+
+ if(bytes != null) {
+
+ // write the actual extra codes and update the number of chars
+ extraCodes.write(bytes);
+ extraCodes.incrementNumChars(1);
+
+ } else {
+
+ // the extra code modifier is added to the last extra code written. if
+ // there is no previous extra code, it is made the first extra code.
+ int lastIdx = extraCodes.size() - 1;
+ if(lastIdx >= 0) {
+ byte lastByte = extraCodes.get(lastIdx);
+ lastByte += extraCodeModifier;
+ extraCodes.set(lastIdx, lastByte);
+ } else {
+ extraCodes.write(extraCodeModifier);
+ }
+ }
+ }
+
+ private static boolean trimExtraCodes(ExtraCodesOutputStream extraCodes,
+ byte minTrimCode, byte maxTrimCode)
+ throws IOException
+ {
+ if(extraCodes == null) {
+ return false;
+ }
+
+ extraCodes.trimTrailingBytes(minTrimCode, maxTrimCode);
+
+ // anything left?
+ return (extraCodes.size() > 0);
+ }
+
+ private static void writeUnprintableCodes(
+ int charOffset, byte[] bytes, ExtraCodesOutputStream extraCodes)
+ throws IOException
+ {
+ // we write a whacky combo of bytes for each unprintable char which
+ // includes a funky offset and extra char itself
+ int offset =
+ (UNPRINTABLE_COUNT_START +
+ (UNPRINTABLE_COUNT_MULTIPLIER * charOffset))
+ | UNPRINTABLE_OFFSET_FLAGS;
+
+ // write offset as big-endian short
+ extraCodes.write((offset >> 8) & 0xFF);
+ extraCodes.write(offset & 0xFF);
+
+ extraCodes.write(UNPRINTABLE_MIDFIX);
+ extraCodes.write(bytes);
+ }
+
+ private static void writeCrazyCodes(ExtraCodesOutputStream crazyCodes,
+ ByteArrayOutputStream tmpBout)
+ throws IOException
+ {
+ // CRAZY_CODE_2 flags at the end are ignored, so ditch them
+ trimExtraCodes(crazyCodes, CRAZY_CODE_2, CRAZY_CODE_2);
+
+ if(crazyCodes.size() > 0) {
+
+ // the crazy codes get encoded into 6 bit sequences where each code is 2
+ // bits (where the first 2 bits in the byte are a common prefix).
+ byte curByte = CRAZY_CODE_START;
+ int idx = 0;
+ for(int i = 0; i < crazyCodes.size(); ++i) {
+ byte nextByte = crazyCodes.get(i);
+ nextByte <<= ((2 - idx) * 2);
+ curByte |= nextByte;
+
+ ++idx;
+ if(idx == 3) {
+ // write current byte and reset
+ tmpBout.write(curByte);
+ curByte = CRAZY_CODE_START;
+ idx = 0;
+ }
+ }
+
+ // write last byte
+ if(idx > 0) {
+ tmpBout.write(curByte);
+ }
+ }
+
+ // write crazy code suffix (note, we write this even if all the codes are
+ // trmmed
+ tmpBout.write(CRAZY_CODES_SUFFIX);
+ }
+
/**
* Creates one of the special index entries.
*/
@@ -2289,17 +2378,63 @@ public abstract class Index implements Comparable<Index> {
entries);
}
}
-
+
+
/**
- * Value struct used during Text index entry encoding.
+ * Extension of ByteArrayOutputStream which allows more complex access to
+ * the underlying bytes, as well as keeps track of an additional char count.
*/
- private static final class ExtraCodes {
- public final int _charOffset;
- public final byte[] _extraCodes;
+ private static final class ExtraCodesOutputStream
+ extends ByteArrayOutputStream
+ {
+ private int numChars;
+
+ private ExtraCodesOutputStream() {
+ super();
+ }
+
+ private ExtraCodesOutputStream(int length) {
+ super(length);
+ }
+
+ public int getNumChars() {
+ return numChars;
+ }
+
+ public byte get(int offset) {
+ return buf[offset];
+ }
+
+ public void set(int offset, byte b) {
+ buf[offset] = b;
+ }
+
+ public void write(int numBytes, byte b) {
+ for(int i = 0; i < numBytes; ++i) {
+ write(b);
+ }
+ }
+
+ public void incrementNumChars(int inc) {
+ numChars += inc;
+ }
+
+ public void trimTrailingBytes(byte minTrimCode, byte maxTrimCode)
+ {
+ int minTrim = ByteUtil.asUnsignedByte(minTrimCode);
+ int maxTrim = ByteUtil.asUnsignedByte(maxTrimCode);
+
+ int idx = count - 1;
+ while(idx >= 0) {
+ int val = ByteUtil.asUnsignedByte(get(idx));
+ if((val >= minTrim) && (val <= maxTrim)) {
+ --idx;
+ } else {
+ break;
+ }
+ }
- private ExtraCodes(int charOffset, byte[] extraCodes) {
- _charOffset = charOffset;
- _extraCodes = extraCodes;
+ count = idx + 1;
}
}