From aaf7449a84bf6533485afd612f586b1794d2821f Mon Sep 17 00:00:00 2001 From: James Ahlborn Date: Sun, 11 Aug 2019 18:44:45 +0000 Subject: reword extra code handling for gen 97 indexes git-svn-id: https://svn.code.sf.net/p/jackcess/code/jackcess/branches/a97_indexes@1310 f203690c-595d-4dc9-a70b-905162fa7fd2 --- .../jackcess/impl/ByteUtil.java | 9 +- .../jackcess/impl/General97IndexCodes.java | 189 +++++++++++++++++++-- .../jackcess/impl/GeneralLegacyIndexCodes.java | 142 ++++++---------- .../jackcess/index_codes_gen_97.txt | 148 ++++++++-------- 4 files changed, 309 insertions(+), 179 deletions(-) (limited to 'src') diff --git a/src/main/java/com/healthmarketscience/jackcess/impl/ByteUtil.java b/src/main/java/com/healthmarketscience/jackcess/impl/ByteUtil.java index a989f2e..97f3d50 100644 --- a/src/main/java/com/healthmarketscience/jackcess/impl/ByteUtil.java +++ b/src/main/java/com/healthmarketscience/jackcess/impl/ByteUtil.java @@ -710,7 +710,10 @@ public final class ByteUtil { } protected void ensureNewCapacity(int numBytes) { - int newLength = _length + numBytes; + ensureCapacity(_length + numBytes); + } + + protected void ensureCapacity(int newLength) { if(newLength > _bytes.length) { byte[] temp = new byte[newLength * 2]; System.arraycopy(_bytes, 0, temp, 0, _length); @@ -744,6 +747,10 @@ public final class ByteUtil { _bytes[offset] = b; } + public void setBits(int offset, byte b) { + _bytes[offset] |= b; + } + public void writeFill(int length, byte b) { ensureNewCapacity(length); int oldLength = _length; diff --git a/src/main/java/com/healthmarketscience/jackcess/impl/General97IndexCodes.java b/src/main/java/com/healthmarketscience/jackcess/impl/General97IndexCodes.java index 432a0a9..54c0b7c 100644 --- a/src/main/java/com/healthmarketscience/jackcess/impl/General97IndexCodes.java +++ b/src/main/java/com/healthmarketscience/jackcess/impl/General97IndexCodes.java @@ -36,6 +36,13 @@ public class General97IndexCodes extends GeneralLegacyIndexCodes private 
static final String EXT_MAPPINGS_FILE = DatabaseImpl.RESOURCE_PATH + "index_mappings_ext_gen_97.txt"; + // we only have a small range of extended chars which can be mapped back into + // the valid chars + private static final char FIRST_MAP_CHAR = 338; + private static final char LAST_MAP_CHAR = 8482; + + private static final byte EXT_CODES_BOUNDS_NIBBLE = (byte)0x00; + private static final class Codes { /** handlers for the first 256 chars. use nested class to lazy load the @@ -46,14 +53,15 @@ public class General97IndexCodes extends GeneralLegacyIndexCodes private static final class ExtMappings { - /** mappings for the rest of the chars in BMP 0. use nested class to lazy - load the handlers. since these codes are for single byte encodings, - you would think you wou;dn't need any ext codes. however, some chars - in the extended range have corollaries in the single byte range. this - array holds the mappings from the ext range to the single byte range. - chars without mappings go to 0. */ + /** mappings for a small subset of the rest of the chars in BMP 0. use + nested class to lazy load the handlers. since these codes are for + single byte encodings, you would think you wouldn't need any ext + codes. however, some chars in the extended range have corollaries in + the single byte range. this array holds the mappings from the ext + range to the single byte range. chars without mappings go to 0 + (ignored). */ private static final short[] _values = loadMappings( - EXT_MAPPINGS_FILE, FIRST_EXT_CHAR, LAST_EXT_CHAR); + EXT_MAPPINGS_FILE, FIRST_MAP_CHAR, LAST_MAP_CHAR); } static final General97IndexCodes GEN_97_INSTANCE = new General97IndexCodes(); @@ -70,20 +78,113 @@ public class General97IndexCodes extends GeneralLegacyIndexCodes return Codes._values[c]; } + if((c < FIRST_MAP_CHAR) || (c > LAST_MAP_CHAR)) { + // outside the mapped range, ignored + return IGNORED_CHAR_HANDLER; + } + // some ext chars are equivalent to single byte chars. 
most chars have no // equivalent, and they map to 0 (which is an "ignored" char, so it all // works out) - int extOffset = asUnsignedChar(c) - asUnsignedChar(FIRST_EXT_CHAR); + int extOffset = asUnsignedChar(c) - asUnsignedChar(FIRST_MAP_CHAR); return Codes._values[ExtMappings._values[extOffset]]; } + /** + * Converts a 97 index value for a text column into the entry value (which + * is based on a variety of nifty codes). + */ @Override void writeNonNullIndexTextValue( Object value, ByteStream bout, boolean isAscending) throws IOException { - // use simplified format for 97 encoding - writeNonNull97IndexTextValue(value, bout, isAscending); + // first, convert to string + String str = ColumnImpl.toCharSequence(value).toString(); + + // all text columns (including memos) are only indexed up to the max + // number of chars in a VARCHAR column + if(str.length() > MAX_TEXT_INDEX_CHAR_LENGTH) { + str = str.substring(0, MAX_TEXT_INDEX_CHAR_LENGTH); + } + + // record previous entry length so we can do any post-processing + // necessary for this entry (handling descending) + int prevLength = bout.getLength(); + + // now, convert each character to a "code" of one or more bytes + NibbleStream extraCodes = null; + int sigCharCount = 0; + for(int i = 0; i < str.length(); ++i) { + + char c = str.charAt(i); + CharHandler ch = getCharHandler(c); + + byte[] bytes = ch.getInlineBytes(); + if(bytes != null) { + // write the "inline" codes immediately + bout.write(bytes); + } + + if(ch.getType() == Type.SIMPLE) { + // common case, skip further code handling + continue; + } + + if(ch.isSignificantChar()) { + ++sigCharCount; + // significant chars never have extra bytes + continue; + } + + bytes = ch.getExtraBytes(); + if(bytes != null) { + if(extraCodes == null) { + extraCodes = new NibbleStream(str.length()); + extraCodes.writeNibble(EXT_CODES_BOUNDS_NIBBLE); + } + + // keep track of the extra code for later + writeExtraCodes(sigCharCount, bytes, extraCodes); + sigCharCount = 0; + } + 
} + + // FIXME, how to handle extra codes for non ascending? + if(extraCodes != null) { + + extraCodes.writeNibble(EXT_CODES_BOUNDS_NIBBLE); + extraCodes.writeTo(bout); + + } else { + + // handle descending order by inverting the bytes + if(!isAscending) { + + // we actually write the end byte before flipping the bytes, and write + // another one after flipping + bout.write(END_EXTRA_TEXT); + + // flip the bytes that we have written thus far for this text value + IndexData.flipBytes(bout.getBytes(), prevLength, + (bout.getLength() - prevLength)); + } + + // write end extra text + bout.write(END_EXTRA_TEXT); + } + } + + private static void writeExtraCodes(int numSigChars, byte[] bytes, + NibbleStream extraCodes) + { + // need to fill in placeholder nibbles for any "significant" chars + if(numSigChars > 0) { + extraCodes.writeFillNibbles(numSigChars, INTERNATIONAL_EXTRA_PLACEHOLDER); + } + + // there should only ever be a single "extra" byte + extraCodes.writeNibble(bytes[0]); } static short[] loadMappings(String mappingsFilePath, @@ -124,4 +225,72 @@ public class General97IndexCodes extends GeneralLegacyIndexCodes return values; } + + /** + * Extension of ByteStream which enables writing individual nibbles. 
+ */ + protected static final class NibbleStream extends ByteStream + { + private int _nibbleLen; + + protected NibbleStream(int length) { + super(length); + } + + private boolean nextIsHi() { + return (_nibbleLen % 2) == 0; + } + + private static int asLowNibble(int b) { + return (b & 0x0F); + } + + private static int asHiNibble(int b) { + return ((b << 4) & 0xF0); + } + + private void writeLowNibble(int b) { + int byteOff = _nibbleLen / 2; + setBits(byteOff, (byte)asLowNibble(b)); + } + + public void writeNibble(int b) { + + if(nextIsHi()) { + write(asHiNibble(b)); + } else { + writeLowNibble(b); + } + + ++_nibbleLen; + } + + public void writeFillNibbles(int length, byte b) { + + int newNibbleLen = _nibbleLen + length; + ensureCapacity((newNibbleLen + 1) / 2); + + if(!nextIsHi()) { + writeLowNibble(b); + --length; + } + + if(length > 1) { + byte doubleB = (byte)(asHiNibble(b) | asLowNibble(b)); + + do { + write(doubleB); + length -= 2; + } while(length > 1); + } + + if(length == 1) { + write(asHiNibble(b)); + } + + _nibbleLen = newNibbleLen; + } + + } + } diff --git a/src/main/java/com/healthmarketscience/jackcess/impl/GeneralLegacyIndexCodes.java b/src/main/java/com/healthmarketscience/jackcess/impl/GeneralLegacyIndexCodes.java index 218f4d5..d8d763d 100644 --- a/src/main/java/com/healthmarketscience/jackcess/impl/GeneralLegacyIndexCodes.java +++ b/src/main/java/com/healthmarketscience/jackcess/impl/GeneralLegacyIndexCodes.java @@ -98,6 +98,11 @@ public class GeneralLegacyIndexCodes { return parseInternationalExtCodes(codeStrings); } }, + SIGNIFICANT("G") { + @Override public CharHandler parseCodes(String[] codeStrings) { + return parseSignificantCodes(codeStrings); + } + }, IGNORED("X") { @Override public CharHandler parseCodes(String[] codeStrings) { return IGNORED_CHAR_HANDLER; @@ -138,13 +143,16 @@ public class GeneralLegacyIndexCodes { public byte getCrazyFlag() { return 0; } + public boolean isSignificantChar() { + return false; + } } /** * CharHandler for 
Type.SIMPLE */ private static final class SimpleCharHandler extends CharHandler { - private byte[] _bytes; + private final byte[] _bytes; private SimpleCharHandler(byte[] bytes) { _bytes = bytes; } @@ -160,8 +168,8 @@ public class GeneralLegacyIndexCodes { * CharHandler for Type.INTERNATIONAL */ private static final class InternationalCharHandler extends CharHandler { - private byte[] _bytes; - private byte[] _extraBytes; + private final byte[] _bytes; + private final byte[] _extraBytes; private InternationalCharHandler(byte[] bytes, byte[] extraBytes) { _bytes = bytes; _extraBytes = extraBytes; @@ -181,7 +189,7 @@ public class GeneralLegacyIndexCodes { * CharHandler for Type.UNPRINTABLE */ private static final class UnprintableCharHandler extends CharHandler { - private byte[] _unprintBytes; + private final byte[] _unprintBytes; private UnprintableCharHandler(byte[] unprintBytes) { _unprintBytes = unprintBytes; } @@ -197,7 +205,7 @@ public class GeneralLegacyIndexCodes { * CharHandler for Type.UNPRINTABLE_EXT */ private static final class UnprintableExtCharHandler extends CharHandler { - private byte _extraByteMod; + private final byte _extraByteMod; private UnprintableExtCharHandler(Byte extraByteMod) { _extraByteMod = extraByteMod; } @@ -213,9 +221,9 @@ public class GeneralLegacyIndexCodes { * CharHandler for Type.INTERNATIONAL_EXT */ private static final class InternationalExtCharHandler extends CharHandler { - private byte[] _bytes; - private byte[] _extraBytes; - private byte _crazyFlag; + private final byte[] _bytes; + private final byte[] _extraBytes; + private final byte _crazyFlag; private InternationalExtCharHandler(byte[] bytes, byte[] extraBytes, byte crazyFlag) { _bytes = bytes; @@ -236,6 +244,25 @@ public class GeneralLegacyIndexCodes { } } + /** + * CharHandler for Type.SIGNIFICANT + */ + private static final class SignificantCharHandler extends CharHandler { + private final byte[] _bytes; + private SignificantCharHandler(byte[] bytes) { + _bytes = 
bytes; + } + @Override public Type getType() { + return Type.SIGNIFICANT; + } + @Override public byte[] getInlineBytes() { + return _bytes; + } + @Override public boolean isSignificantChar() { + return true; + } + } + /** shared CharHandler instance for Type.IGNORED */ static final CharHandler IGNORED_CHAR_HANDLER = new CharHandler() { @Override public Type getType() { @@ -429,6 +456,18 @@ public class GeneralLegacyIndexCodes { crazyFlag); } + /** + * Returns a SignificantCharHandler parsed from the given index code strings. + */ + private static CharHandler parseSignificantCodes(String[] codeStrings) + { + if(codeStrings.length != 1) { + throw new IllegalStateException("Unexpected code strings " + + Arrays.asList(codeStrings)); + } + return new SignificantCharHandler(codesToBytes(codeStrings[0], true)); + } + /** * Converts a string of hex encoded bytes to a byte[], optionally throwing * an exception if no codes are given. @@ -481,7 +520,7 @@ public class GeneralLegacyIndexCodes { str = str.substring(0, MAX_TEXT_INDEX_CHAR_LENGTH); } - // record pprevious entry length so we can do any post-processing + // record previous entry length so we can do any post-processing // necessary for this entry (handling descending) int prevLength = bout.getLength(); @@ -602,91 +641,6 @@ public class GeneralLegacyIndexCodes { bout.write(END_EXTRA_TEXT); } - /** - * Converts a 97 index value for a text column into the entry value (which - * is based on a variety of nifty codes). - */ - void writeNonNull97IndexTextValue( - Object value, ByteStream bout, boolean isAscending) - throws IOException - { - // NOTE, this should probably be in Gen97TextColumnDescriptor but it was - // easier to add here than make everything private non-private. 
- - // first, convert to string - String str = ColumnImpl.toCharSequence(value).toString(); - - // all text columns (including memos) are only indexed up to the max - // number of chars in a VARCHAR column - if(str.length() > MAX_TEXT_INDEX_CHAR_LENGTH) { - str = str.substring(0, MAX_TEXT_INDEX_CHAR_LENGTH); - } - - // record pprevious entry length so we can do any post-processing - // necessary for this entry (handling descending) - int prevLength = bout.getLength(); - - // now, convert each character to a "code" of one or more bytes - ExtraCodesStream extraCodes = null; - int charOffset = 0; - for(int i = 0; i < str.length(); ++i) { - - char c = str.charAt(i); - CharHandler ch = getCharHandler(c); - - int curCharOffset = charOffset; - byte[] bytes = ch.getInlineBytes(); - if(bytes != null) { - // write the "inline" codes immediately - bout.write(bytes); - - // only increment the charOffset for chars with inline codes - ++charOffset; - } - - if(ch.getType() == Type.SIMPLE) { - // common case, skip further code handling - continue; - } - - bytes = ch.getExtraBytes(); - byte extraCodeModifier = ch.getExtraByteModifier(); - if((bytes != null) || (extraCodeModifier != 0)) { - if(extraCodes == null) { - extraCodes = new ExtraCodesStream(str.length()); - } - - // keep track of the extra codes for later - writeExtraCodes(curCharOffset, bytes, extraCodeModifier, extraCodes); - } - } - - // FIXME, how to handle extra codes for non ascending? 
- boolean hasExtraCodes = trimExtraCodes( - extraCodes, (byte)0, INTERNATIONAL_EXTRA_PLACEHOLDER); - if(hasExtraCodes) { - - extraCodes.writeTo(bout); - - } else { - - // handle descending order by inverting the bytes - if(!isAscending) { - - // we actually write the end byte before flipping the bytes, and write - // another one after flipping - bout.write(END_EXTRA_TEXT); - - // flip the bytes that we have written thus far for this text value - IndexData.flipBytes(bout.getBytes(), prevLength, - (bout.getLength() - prevLength)); - } - - // write end extra text - bout.write(END_EXTRA_TEXT); - } - } - /** * Encodes the given extra code info in the given stream. */ diff --git a/src/main/resources/com/healthmarketscience/jackcess/index_codes_gen_97.txt b/src/main/resources/com/healthmarketscience/jackcess/index_codes_gen_97.txt index 1604014..9723c05 100644 --- a/src/main/resources/com/healthmarketscience/jackcess/index_codes_gen_97.txt +++ b/src/main/resources/com/healthmarketscience/jackcess/index_codes_gen_97.txt @@ -63,31 +63,31 @@ S24 S25 S26 S27 -S60 +G60 S61 -S62 +G62 S64 -S66 +G66 S67 S68 S69 -S6A +G6A S6B S6C S6D S6F -S70 -S72 +G70 +G72 S73 S74 S75 -S76 +G76 S77 -S78 +G78 S7A S7B S7C -S7D +G7D S7E S28 S29 @@ -95,31 +95,31 @@ S2A S2B S2C S2D -S60 +G60 S61 -S62 +G62 S64 -S66 +G66 S67 S68 S69 -S6A +G6A S6B S6C S6D S6F -S70 -S72 +G70 +G72 S73 S74 S75 -S76 +G76 S77 -S78 +G78 S7A S7B S7C -S7D +G7D S7E S2E S2F @@ -136,7 +136,7 @@ S34 S35 S36 S37 -I76,A0 +I76,0A S18 S7266 S10 @@ -152,12 +152,12 @@ S1E S1E S39 S3A -I76,A0 +I76,0A S18 S7266 S10 S10 -I7D,60 +I7D,06 S11 S3B S3C @@ -190,67 +190,67 @@ S50 S51 S52 S53 -I60,30 -I60,40 -I60,50 -I60,70 -I60,60 -I60,80 +I60,03 +I60,04 +I60,05 +I60,07 +I60,06 +I60,08 S6066 -I62,90 -I66,30 -I66,40 -I66,50 -I66,60 -I6A,30 -I6A,40 -I6A,50 -I6A,60 +I62,09 +I66,03 +I66,04 +I66,05 +I66,06 +I6A,03 +I6A,04 +I6A,05 +I6A,06 S65 -I70,70 -I72,30 -I72,40 -I72,50 -I72,70 -I72,60 +I70,07 +I72,03 +I72,04 +I72,05 +I72,07 +I72,06 S54 S81 -I78,30 
-I78,40 -I78,50 -I78,60 -I7D,40 +I78,03 +I78,04 +I78,05 +I78,06 +I7D,04 S7F S7676 -I60,30 -I60,40 -I60,50 -I60,70 -I60,60 -I60,80 +I60,03 +I60,04 +I60,05 +I60,07 +I60,06 +I60,08 S6066 -I62,90 -I66,30 -I66,40 -I66,50 -I66,60 -I6A,30 -I6A,40 -I6A,50 -I6A,60 +I62,09 +I66,03 +I66,04 +I66,05 +I66,06 +I6A,03 +I6A,04 +I6A,05 +I6A,06 S65 -I70,70 -I72,30 -I72,40 -I72,50 -I72,70 -I72,60 +I70,07 +I72,03 +I72,04 +I72,05 +I72,07 +I72,06 S55 S81 -I78,30 -I78,40 -I78,50 -I78,60 -I7D,40 +I78,03 +I78,04 +I78,05 +I78,06 +I7D,04 S7F -I7D,60 +I7D,06 -- cgit v1.2.3