Add support for surrogate pairs in text indexes (e.g. emoticons), fixes #157

author James Ahlborn <jtahlborn@yahoo.com>

Tue, 14 May 2024 01:42:22 +0000 (01:42 +0000)

committer James Ahlborn <jtahlborn@yahoo.com>

Tue, 14 May 2024 01:42:22 +0000 (01:42 +0000)
author James Ahlborn <jtahlborn@yahoo.com>
Tue, 14 May 2024 01:42:22 +0000 (01:42 +0000)
committer James Ahlborn <jtahlborn@yahoo.com>
Tue, 14 May 2024 01:42:22 +0000 (01:42 +0000)
diff --git a/src/changes/changes.xml b/src/changes/changes.xml

index 3297bead85e2b929b206dfb84e9410190969ca4a..d6f54b058e7b0ce7eb2e1e29a6b0db94385c3479 100644 (file)
--- a/src/changes/changes.xml
+++ b/src/changes/changes.xml
@@ -4,6 +4,11 @@
      <author email="javajedi@users.sf.net">Tim McCune</author>
    </properties>
    <body>
+    <release version="4.0.7" date="TBD">
+      <action dev="jahlborn" type="fix" system="SourceForge2" issue="157">
+        Add support for surrogate pairs in text indexes (e.g. emoticons).
+      </action>
+    </release>
      <release version="4.0.6" date="2024-05-11">
        <action dev="jahlborn" type="update">
          Make NumberFormatter locale aware, using the currently configured
diff --git a/src/main/java/com/healthmarketscience/jackcess/impl/General97IndexCodes.java b/src/main/java/com/healthmarketscience/jackcess/impl/General97IndexCodes.java

index 77ebb67d6490c4ad6e3c112ca23b885cc13d8fab..a988a03c08eccbbb5827bc08f094b02d12a6a88a 100644 (file)
--- a/src/main/java/com/healthmarketscience/jackcess/impl/General97IndexCodes.java
+++ b/src/main/java/com/healthmarketscience/jackcess/impl/General97IndexCodes.java
@@ -114,7 +114,7 @@ public class General97IndexCodes extends GeneralLegacyIndexCodes
        char c = str.charAt(i);
        CharHandler ch = getCharHandler(c);
  
-      byte[] bytes = ch.getInlineBytes();
+      byte[] bytes = ch.getInlineBytes(c);
        if(bytes != null) {
          // write the "inline" codes immediately
          bout.write(bytes);
diff --git a/src/main/java/com/healthmarketscience/jackcess/impl/GeneralLegacyIndexCodes.java b/src/main/java/com/healthmarketscience/jackcess/impl/GeneralLegacyIndexCodes.java

index 0d151e06c5b4a71f41a0a0902673ef4c13c20bad..df3a781c7bc57fb5ba105f7271349bc1da03d04a 100644 (file)
--- a/src/main/java/com/healthmarketscience/jackcess/impl/GeneralLegacyIndexCodes.java
+++ b/src/main/java/com/healthmarketscience/jackcess/impl/GeneralLegacyIndexCodes.java
@@ -103,6 +103,12 @@ public class GeneralLegacyIndexCodes {
          return parseSignificantCodes(codeStrings);
        }
      },
+    SURROGATE("Q") {
+      @Override public CharHandler parseCodes(String[] codeStrings) {
+        // surrogate handlers are assigned directly, never parsed from codes files
+        throw new UnsupportedOperationException();
+      }
+    },
      IGNORED("X") {
        @Override public CharHandler parseCodes(String[] codeStrings) {
          return IGNORED_CHAR_HANDLER;
@@ -128,7 +134,7 @@ public class GeneralLegacyIndexCodes {
     */
    abstract static class CharHandler {
      public abstract Type getType();
-    public byte[] getInlineBytes() {
+    public byte[] getInlineBytes(char c) {
        return null;
      }
      public byte[] getExtraBytes() {
@@ -159,7 +165,7 @@ public class GeneralLegacyIndexCodes {
      @Override public Type getType() {
        return Type.SIMPLE;
      }
-    @Override public byte[] getInlineBytes() {
+    @Override public byte[] getInlineBytes(char c) {
        return _bytes;
      }
    }
@@ -177,7 +183,7 @@ public class GeneralLegacyIndexCodes {
      @Override public Type getType() {
        return Type.INTERNATIONAL;
      }
-    @Override public byte[] getInlineBytes() {
+    @Override public byte[] getInlineBytes(char c) {
        return _bytes;
      }
      @Override public byte[] getExtraBytes() {
@@ -233,7 +239,7 @@ public class GeneralLegacyIndexCodes {
      @Override public Type getType() {
        return Type.INTERNATIONAL_EXT;
      }
-    @Override public byte[] getInlineBytes() {
+    @Override public byte[] getInlineBytes(char c) {
        return _bytes;
      }
      @Override public byte[] getExtraBytes() {
@@ -255,7 +261,7 @@ public class GeneralLegacyIndexCodes {
      @Override public Type getType() {
        return Type.SIGNIFICANT;
      }
-    @Override public byte[] getInlineBytes() {
+    @Override public byte[] getInlineBytes(char c) {
        return _bytes;
      }
      @Override public boolean isSignificantChar() {
@@ -270,17 +276,63 @@ public class GeneralLegacyIndexCodes {
      }
    };
  
-  /** alternate shared CharHandler instance for "surrogate" chars (which we do
-      not handle) */
-  static final CharHandler SURROGATE_CHAR_HANDLER = new CharHandler() {
+  /** the surrogate char bufs are computed on the fly.  re-use a buffer for
+      those */
+  private static final ThreadLocal<byte[]> SURROGATE_CHAR_BUF =
+    ThreadLocal.withInitial(() -> new byte[2]);
+  private static final byte[] SURROGATE_EXTRA_BYTES = {0x3f};
+
+  private static abstract class SurrogateCharHandler extends CharHandler {
      @Override public Type getType() {
-      return Type.IGNORED;
+      return Type.SURROGATE;
      }
-    @Override public byte[] getInlineBytes() {
-      throw new IllegalStateException(
-          "Surrogate pair chars are not handled");
+    @Override public byte[] getExtraBytes() {
+      return SURROGATE_EXTRA_BYTES;
      }
-  };
+    protected static byte[] toInlineBytes(int idxC) {
+      byte[] bytes = SURROGATE_CHAR_BUF.get();  // reused per-thread buffer, overwritten by each call
+      bytes[0] = (byte)((idxC >>> 8) & 0xFF);  // high-order byte first
+      bytes[1] = (byte)(idxC & 0xFF);
+      return bytes;
+    }
+  }
+
+  /** shared CharHandler instance for "high surrogate" chars (whose inline
+      bytes are computed from the char value) */
+  static final CharHandler HIGH_SURROGATE_CHAR_HANDLER =
+    new SurrogateCharHandler() {
+      @Override public byte[] getInlineBytes(char c) {
+        // the high surrogate bytes seem to be computed from a fixed offset
+        int idxC = asUnsignedChar(c) - 10238;
+        return toInlineBytes(idxC);
+      }
+    };
+
+  /** shared CharHandler instance for "low surrogate" chars (whose inline
+      bytes are computed from the char value) */
+  static final CharHandler LOW_SURROGATE_CHAR_HANDLER =
+    new SurrogateCharHandler() {
+      @Override public byte[] getInlineBytes(char c) {
+        // the low surrogate bytes are the char value minus an offset which is
+        // chosen by the char's position within its 1024 character block.
+        int charOffset = (asUnsignedChar(c) - 0xdc00) % 1024;
+
+        int idxOffset = 0;  // offset steps down by 2 at block positions 8, 262, 516 and 770
+        if(charOffset < 8) {
+          idxOffset = 9992;
+        } else if(charOffset < (8 + 254)) {
+          idxOffset = 9990;
+        } else if(charOffset < (8 + 254 + 254)) {
+          idxOffset = 9988;
+        } else if(charOffset < (8 + 254 + 254 + 254)) {
+          idxOffset = 9986;
+        } else  {
+          idxOffset = 9984;
+        }
+        int idxC = asUnsignedChar(c) - idxOffset;  // index value written as two bytes
+        return toInlineBytes(idxC);
+      }
+    };
  
    static final char FIRST_CHAR = (char)0x0000;
    static final char LAST_CHAR = (char)0x00FF;
@@ -349,9 +401,12 @@ public class GeneralLegacyIndexCodes {
        for(int i = start; i <= end; ++i) {
          char c = (char)i;
          CharHandler ch = null;
-        if(Character.isHighSurrogate(c) || Character.isLowSurrogate(c)) {
+        if(Character.isHighSurrogate(c)) {
            // surrogate chars are not included in the codes files
-          ch = SURROGATE_CHAR_HANDLER;
+          ch = HIGH_SURROGATE_CHAR_HANDLER;
+        } else if(Character.isLowSurrogate(c)) {
+          // surrogate chars are not included in the codes files
+          ch = LOW_SURROGATE_CHAR_HANDLER;
          } else {
            String codeLine = reader.readLine();
            ch = parseCodes(prefixMap, codeLine);
@@ -529,7 +584,7 @@ public class GeneralLegacyIndexCodes {
        CharHandler ch = getCharHandler(c);
  
        int curCharOffset = charOffset;
-      byte[] bytes = ch.getInlineBytes();
+      byte[] bytes = ch.getInlineBytes(c);
        if(bytes != null) {
          // write the "inline" codes immediately
          bout.write(bytes);
@@ -667,7 +722,6 @@ public class GeneralLegacyIndexCodes {
    private static void writeExtraCodes(
        int charOffset, byte[] bytes, byte extraCodeModifier,
        ExtraCodesStream extraCodes)
-    throws IOException
    {
      // we fill in a placeholder value for any chars w/out extra codes
      int numChars = extraCodes.getNumChars();
@@ -712,7 +766,6 @@ public class GeneralLegacyIndexCodes {
     */
    private static boolean trimExtraCodes(ByteStream extraCodes,
                                          byte minTrimCode, byte maxTrimCode)
-    throws IOException
    {
      if(extraCodes == null) {
        return false;
@@ -730,7 +783,6 @@ public class GeneralLegacyIndexCodes {
    private static void writeUnprintableCodes(
        int charOffset, byte[] bytes, ByteStream unprintableCodes,
        ExtraCodesStream extraCodes)
-    throws IOException
    {
      // the offset seems to be calculated based on the number of bytes in the
      // "extra codes" part of the entry (even if there are no extra codes bytes
@@ -764,7 +816,6 @@ public class GeneralLegacyIndexCodes {
     * Encode the given crazy code bytes into the given byte stream.
     */
    private static void writeCrazyCodes(ByteStream crazyCodes, ByteStream bout)
-    throws IOException
    {
      // CRAZY_CODE_2 flags at the end are ignored, so ditch them
      trimExtraCodes(crazyCodes, CRAZY_CODE_2, CRAZY_CODE_2);
diff --git a/src/test/data/V2010/testEmoticonsV2010.accdb b/src/test/data/V2010/testEmoticonsV2010.accdb

new file mode 100755 (executable)

index 0000000..8b70b95

Binary files /dev/null and b/src/test/data/V2010/testEmoticonsV2010.accdb differ
diff --git a/src/test/java/com/healthmarketscience/jackcess/TestUtil.java b/src/test/java/com/healthmarketscience/jackcess/TestUtil.java

index fae1fa0733a8c66ee766d7923fafab832c4f1e91..7626ee4fb3e6390dd81d1c0e9e7dd207859d2ea5 100644 (file)
--- a/src/test/java/com/healthmarketscience/jackcess/TestUtil.java
+++ b/src/test/java/com/healthmarketscience/jackcess/TestUtil.java
@@ -361,16 +361,25 @@ public class TestUtil
    }
  
    static void dumpIndex(Index index) throws Exception {
-    dumpIndex(index, new PrintWriter(System.out, true));
+    dumpIndex(index, Integer.MAX_VALUE);
    }
  
-  static void dumpIndex(Index index, PrintWriter writer) throws Exception {
+  static void dumpIndex(Index index, int limit) throws Exception {
+    dumpIndex(index, new PrintWriter(System.out, true), limit);
+  }
+
+  static void dumpIndex(Index index, PrintWriter writer, int limit)
+    throws Exception {
      writer.println("INDEX: " + index);
      IndexData.EntryCursor ec = ((IndexImpl)index).cursor();
      IndexData.Entry lastE = ec.getLastEntry();
      IndexData.Entry e = null;
+    int count = 0;
      while((e = ec.getNextEntry()) != lastE) {
        writer.println(e);
+      if((count++) > limit) {
+        break;
+      }
      }
    }
  
diff --git a/src/test/java/com/healthmarketscience/jackcess/impl/IndexCodesTest.java b/src/test/java/com/healthmarketscience/jackcess/impl/IndexCodesTest.java

index cfcca8339f49a0853ca278fdab23592cf8e87082..ce4bb5210b761b68cd88be826ef596a12ecb5d2c 100644 (file)
--- a/src/test/java/com/healthmarketscience/jackcess/impl/IndexCodesTest.java
+++ b/src/test/java/com/healthmarketscience/jackcess/impl/IndexCodesTest.java
@@ -64,7 +64,17 @@ public class IndexCodesTest extends TestCase {
  
    public void testIndexCodes() throws Exception
    {
-    for (final TestDB testDB : TestDB.getSupportedForBasename(Basename.INDEX_CODES, true)) {
+    doTestDb(Basename.INDEX_CODES);
+  }
+
+  public void testEmoticons() throws Exception
+  {
+    doTestDb(Basename.EMOTICONS);
+  }
+
+  private static void doTestDb(Basename dbBaseName) throws Exception
+  {
+    for (final TestDB testDB : TestDB.getSupportedForBasename(dbBaseName, true)) {
        Database db = openMem(testDB);
        db.setDateTimeType(DateTimeType.DATE);
  
diff --git a/src/test/java/com/healthmarketscience/jackcess/impl/JetFormatTest.java b/src/test/java/com/healthmarketscience/jackcess/impl/JetFormatTest.java

index 0f6c88918cb25692f65af8f6fe555e6f30475453..2379891f5b6e09dd77afd599d57ce23c7422c73a 100644 (file)
--- a/src/test/java/com/healthmarketscience/jackcess/impl/JetFormatTest.java
+++ b/src/test/java/com/healthmarketscience/jackcess/impl/JetFormatTest.java
@@ -58,7 +58,8 @@ public class JetFormatTest extends TestCase {
      CALC_FIELD("calcFieldTest"),
      BINARY_INDEX("binIdxTest"),
      OLD_DATES("oldDates"),
-    EXT_DATE("extDateTest");
+    EXT_DATE("extDateTest"),
+    EMOTICONS("testEmoticons");
  
      private final String _basename;
author	James Ahlborn <jtahlborn@yahoo.com>
	Tue, 14 May 2024 01:42:22 +0000 (01:42 +0000)
committer	James Ahlborn <jtahlborn@yahoo.com>
	Tue, 14 May 2024 01:42:22 +0000 (01:42 +0000)
src/changes/changes.xml		patch \| blob \| history
src/main/java/com/healthmarketscience/jackcess/impl/General97IndexCodes.java		patch \| blob \| history
src/main/java/com/healthmarketscience/jackcess/impl/GeneralLegacyIndexCodes.java		patch \| blob \| history
src/test/data/V2010/testEmoticonsV2010.accdb	[new file with mode: 0755]	patch \| blob
src/test/java/com/healthmarketscience/jackcess/TestUtil.java		patch \| blob \| history
src/test/java/com/healthmarketscience/jackcess/impl/IndexCodesTest.java		patch \| blob \| history
src/test/java/com/healthmarketscience/jackcess/impl/JetFormatTest.java		patch \| blob \| history