From: James Ahlborn Date: Tue, 14 May 2024 01:42:22 +0000 (+0000) Subject: Add support for surrogate pairs in text indexes (e.g. emoticons), fixes #157 X-Git-Tag: jackcess-4.0.7~3 X-Git-Url: https://source.dussan.org/?a=commitdiff_plain;h=1c8a93bf9e0bac22ba14f3f50c1299e82f79b1e2;p=jackcess.git Add support for surrogate pairs in text indexes (e.g. emoticons), fixes #157 git-svn-id: https://svn.code.sf.net/p/jackcess/code/jackcess/trunk@1413 f203690c-595d-4dc9-a70b-905162fa7fd2 --- diff --git a/src/changes/changes.xml b/src/changes/changes.xml index 3297bea..d6f54b0 100644 --- a/src/changes/changes.xml +++ b/src/changes/changes.xml @@ -4,6 +4,11 @@ Tim McCune + + + Add support for surrogate pairs in text indexes (e.g. emoticons). + + Make NumberFormatter locale aware, using the currently configured diff --git a/src/main/java/com/healthmarketscience/jackcess/impl/General97IndexCodes.java b/src/main/java/com/healthmarketscience/jackcess/impl/General97IndexCodes.java index 77ebb67..a988a03 100644 --- a/src/main/java/com/healthmarketscience/jackcess/impl/General97IndexCodes.java +++ b/src/main/java/com/healthmarketscience/jackcess/impl/General97IndexCodes.java @@ -114,7 +114,7 @@ public class General97IndexCodes extends GeneralLegacyIndexCodes char c = str.charAt(i); CharHandler ch = getCharHandler(c); - byte[] bytes = ch.getInlineBytes(); + byte[] bytes = ch.getInlineBytes(c); if(bytes != null) { // write the "inline" codes immediately bout.write(bytes); diff --git a/src/main/java/com/healthmarketscience/jackcess/impl/GeneralLegacyIndexCodes.java b/src/main/java/com/healthmarketscience/jackcess/impl/GeneralLegacyIndexCodes.java index 0d151e0..df3a781 100644 --- a/src/main/java/com/healthmarketscience/jackcess/impl/GeneralLegacyIndexCodes.java +++ b/src/main/java/com/healthmarketscience/jackcess/impl/GeneralLegacyIndexCodes.java @@ -103,6 +103,12 @@ public class GeneralLegacyIndexCodes { return parseSignificantCodes(codeStrings); } }, + SURROGATE("Q") { + @Override 
public CharHandler parseCodes(String[] codeStrings) { + // these are not parsed from the codes files + throw new UnsupportedOperationException(); + } + }, IGNORED("X") { @Override public CharHandler parseCodes(String[] codeStrings) { return IGNORED_CHAR_HANDLER; @@ -128,7 +134,7 @@ public class GeneralLegacyIndexCodes { */ abstract static class CharHandler { public abstract Type getType(); - public byte[] getInlineBytes() { + public byte[] getInlineBytes(char c) { return null; } public byte[] getExtraBytes() { @@ -159,7 +165,7 @@ public class GeneralLegacyIndexCodes { @Override public Type getType() { return Type.SIMPLE; } - @Override public byte[] getInlineBytes() { + @Override public byte[] getInlineBytes(char c) { return _bytes; } } @@ -177,7 +183,7 @@ public class GeneralLegacyIndexCodes { @Override public Type getType() { return Type.INTERNATIONAL; } - @Override public byte[] getInlineBytes() { + @Override public byte[] getInlineBytes(char c) { return _bytes; } @Override public byte[] getExtraBytes() { @@ -233,7 +239,7 @@ public class GeneralLegacyIndexCodes { @Override public Type getType() { return Type.INTERNATIONAL_EXT; } - @Override public byte[] getInlineBytes() { + @Override public byte[] getInlineBytes(char c) { return _bytes; } @Override public byte[] getExtraBytes() { @@ -255,7 +261,7 @@ public class GeneralLegacyIndexCodes { @Override public Type getType() { return Type.SIGNIFICANT; } - @Override public byte[] getInlineBytes() { + @Override public byte[] getInlineBytes(char c) { return _bytes; } @Override public boolean isSignificantChar() { @@ -270,17 +276,63 @@ public class GeneralLegacyIndexCodes { } }; - /** alternate shared CharHandler instance for "surrogate" chars (which we do - not handle) */ - static final CharHandler SURROGATE_CHAR_HANDLER = new CharHandler() { + /** the surrogate char bufs are computed on the fly. 
re-use a buffer for + those */ + private static final ThreadLocal SURROGATE_CHAR_BUF = + ThreadLocal.withInitial(() -> new byte[2]); + private static final byte[] SURROGATE_EXTRA_BYTES = {0x3f}; + + private static abstract class SurrogateCharHandler extends CharHandler { @Override public Type getType() { - return Type.IGNORED; + return Type.SURROGATE; } - @Override public byte[] getInlineBytes() { - throw new IllegalStateException( - "Surrogate pair chars are not handled"); + @Override public byte[] getExtraBytes() { + return SURROGATE_EXTRA_BYTES; } - }; + protected static byte[] toInlineBytes(int idxC) { + byte[] bytes = SURROGATE_CHAR_BUF.get(); + bytes[0] = (byte)((idxC >>> 8) & 0xFF); + bytes[1] = (byte)(idxC & 0xFF); + return bytes; + } + } + + /** shared CharHandler instance for "high surrogate" chars (which are + computed) */ + static final CharHandler HIGH_SURROGATE_CHAR_HANDLER = + new SurrogateCharHandler() { + @Override public byte[] getInlineBytes(char c) { + // the high surrogate bytes seem to be computed from a fixed offset + int idxC = asUnsignedChar(c) - 10238; + return toInlineBytes(idxC); + } + }; + + /** shared CharHandler instance for "low surrogate" chars (which are + computed) */ + static final CharHandler LOW_SURROGATE_CHAR_HANDLER = + new SurrogateCharHandler() { + @Override public byte[] getInlineBytes(char c) { + // the low surrogate bytes are computed with a specific value based on + // its location in a 1024 character block. 
+ int charOffset = (asUnsignedChar(c) - 0xdc00) % 1024; + + int idxOffset = 0; + if(charOffset < 8) { + idxOffset = 9992; + } else if(charOffset < (8 + 254)) { + idxOffset = 9990; + } else if(charOffset < (8 + 254 + 254)) { + idxOffset = 9988; + } else if(charOffset < (8 + 254 + 254 + 254)) { + idxOffset = 9986; + } else { + idxOffset = 9984; + } + int idxC = asUnsignedChar(c) - idxOffset; + return toInlineBytes(idxC); + } + }; static final char FIRST_CHAR = (char)0x0000; static final char LAST_CHAR = (char)0x00FF; @@ -349,9 +401,12 @@ public class GeneralLegacyIndexCodes { for(int i = start; i <= end; ++i) { char c = (char)i; CharHandler ch = null; - if(Character.isHighSurrogate(c) || Character.isLowSurrogate(c)) { + if(Character.isHighSurrogate(c)) { // surrogate chars are not included in the codes files - ch = SURROGATE_CHAR_HANDLER; + ch = HIGH_SURROGATE_CHAR_HANDLER; + } else if(Character.isLowSurrogate(c)) { + // surrogate chars are not included in the codes files + ch = LOW_SURROGATE_CHAR_HANDLER; } else { String codeLine = reader.readLine(); ch = parseCodes(prefixMap, codeLine); @@ -529,7 +584,7 @@ public class GeneralLegacyIndexCodes { CharHandler ch = getCharHandler(c); int curCharOffset = charOffset; - byte[] bytes = ch.getInlineBytes(); + byte[] bytes = ch.getInlineBytes(c); if(bytes != null) { // write the "inline" codes immediately bout.write(bytes); @@ -667,7 +722,6 @@ public class GeneralLegacyIndexCodes { private static void writeExtraCodes( int charOffset, byte[] bytes, byte extraCodeModifier, ExtraCodesStream extraCodes) - throws IOException { // we fill in a placeholder value for any chars w/out extra codes int numChars = extraCodes.getNumChars(); @@ -712,7 +766,6 @@ public class GeneralLegacyIndexCodes { */ private static boolean trimExtraCodes(ByteStream extraCodes, byte minTrimCode, byte maxTrimCode) - throws IOException { if(extraCodes == null) { return false; @@ -730,7 +783,6 @@ public class GeneralLegacyIndexCodes { private static void 
writeUnprintableCodes( int charOffset, byte[] bytes, ByteStream unprintableCodes, ExtraCodesStream extraCodes) - throws IOException { // the offset seems to be calculated based on the number of bytes in the // "extra codes" part of the entry (even if there are no extra codes bytes @@ -764,7 +816,6 @@ public class GeneralLegacyIndexCodes { * Encode the given crazy code bytes into the given byte stream. */ private static void writeCrazyCodes(ByteStream crazyCodes, ByteStream bout) - throws IOException { // CRAZY_CODE_2 flags at the end are ignored, so ditch them trimExtraCodes(crazyCodes, CRAZY_CODE_2, CRAZY_CODE_2); diff --git a/src/test/data/V2010/testEmoticonsV2010.accdb b/src/test/data/V2010/testEmoticonsV2010.accdb new file mode 100755 index 0000000..8b70b95 Binary files /dev/null and b/src/test/data/V2010/testEmoticonsV2010.accdb differ diff --git a/src/test/java/com/healthmarketscience/jackcess/TestUtil.java b/src/test/java/com/healthmarketscience/jackcess/TestUtil.java index fae1fa0..7626ee4 100644 --- a/src/test/java/com/healthmarketscience/jackcess/TestUtil.java +++ b/src/test/java/com/healthmarketscience/jackcess/TestUtil.java @@ -361,16 +361,25 @@ public class TestUtil } static void dumpIndex(Index index) throws Exception { - dumpIndex(index, new PrintWriter(System.out, true)); + dumpIndex(index, Integer.MAX_VALUE); } - static void dumpIndex(Index index, PrintWriter writer) throws Exception { + static void dumpIndex(Index index, int limit) throws Exception { + dumpIndex(index, new PrintWriter(System.out, true), limit); + } + + static void dumpIndex(Index index, PrintWriter writer, int limit) + throws Exception { writer.println("INDEX: " + index); IndexData.EntryCursor ec = ((IndexImpl)index).cursor(); IndexData.Entry lastE = ec.getLastEntry(); IndexData.Entry e = null; + int count = 0; while((e = ec.getNextEntry()) != lastE) { writer.println(e); + if((count++) > limit) { + break; + } } } diff --git 
a/src/test/java/com/healthmarketscience/jackcess/impl/IndexCodesTest.java b/src/test/java/com/healthmarketscience/jackcess/impl/IndexCodesTest.java index cfcca83..ce4bb52 100644 --- a/src/test/java/com/healthmarketscience/jackcess/impl/IndexCodesTest.java +++ b/src/test/java/com/healthmarketscience/jackcess/impl/IndexCodesTest.java @@ -64,7 +64,17 @@ public class IndexCodesTest extends TestCase { public void testIndexCodes() throws Exception { - for (final TestDB testDB : TestDB.getSupportedForBasename(Basename.INDEX_CODES, true)) { + doTestDb(Basename.INDEX_CODES); + } + + public void testEmoticons() throws Exception + { + doTestDb(Basename.EMOTICONS); + } + + private static void doTestDb(Basename dbBaseName) throws Exception + { + for (final TestDB testDB : TestDB.getSupportedForBasename(dbBaseName, true)) { Database db = openMem(testDB); db.setDateTimeType(DateTimeType.DATE); diff --git a/src/test/java/com/healthmarketscience/jackcess/impl/JetFormatTest.java b/src/test/java/com/healthmarketscience/jackcess/impl/JetFormatTest.java index 0f6c889..2379891 100644 --- a/src/test/java/com/healthmarketscience/jackcess/impl/JetFormatTest.java +++ b/src/test/java/com/healthmarketscience/jackcess/impl/JetFormatTest.java @@ -58,7 +58,8 @@ public class JetFormatTest extends TestCase { CALC_FIELD("calcFieldTest"), BINARY_INDEX("binIdxTest"), OLD_DATES("oldDates"), - EXT_DATE("extDateTest"); + EXT_DATE("extDateTest"), + EMOTICONS("testEmoticons"); private final String _basename;