aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorJames Ahlborn <jtahlborn@yahoo.com>2024-05-14 01:42:22 +0000
committerJames Ahlborn <jtahlborn@yahoo.com>2024-05-14 01:42:22 +0000
commit1c8a93bf9e0bac22ba14f3f50c1299e82f79b1e2 (patch)
tree0a613b8544af2888d4cfcbbce8a58269a4d1b5f0
parent0bacfe48446d629d43d0d735c5c9583b877f60d5 (diff)
downloadjackcess-1c8a93bf9e0bac22ba14f3f50c1299e82f79b1e2.tar.gz
jackcess-1c8a93bf9e0bac22ba14f3f50c1299e82f79b1e2.zip
Add support for surrogate pairs in text indexes (e.g. emoticons), fixes #157
git-svn-id: https://svn.code.sf.net/p/jackcess/code/jackcess/trunk@1413 f203690c-595d-4dc9-a70b-905162fa7fd2
-rw-r--r--src/changes/changes.xml5
-rw-r--r--src/main/java/com/healthmarketscience/jackcess/impl/General97IndexCodes.java2
-rw-r--r--src/main/java/com/healthmarketscience/jackcess/impl/GeneralLegacyIndexCodes.java91
-rwxr-xr-xsrc/test/data/V2010/testEmoticonsV2010.accdbbin0 -> 385024 bytes
-rw-r--r--src/test/java/com/healthmarketscience/jackcess/TestUtil.java13
-rw-r--r--src/test/java/com/healthmarketscience/jackcess/impl/IndexCodesTest.java12
-rw-r--r--src/test/java/com/healthmarketscience/jackcess/impl/JetFormatTest.java3
7 files changed, 101 insertions, 25 deletions
diff --git a/src/changes/changes.xml b/src/changes/changes.xml
index 3297bea..d6f54b0 100644
--- a/src/changes/changes.xml
+++ b/src/changes/changes.xml
@@ -4,6 +4,11 @@
<author email="javajedi@users.sf.net">Tim McCune</author>
</properties>
<body>
+ <release version="4.0.7" date="TBD">
+ <action dev="jahlborn" type="fix" system="SourceForge2" issue="157">
+ Add support for surrogate pairs in text indexes (e.g. emoticons).
+ </action>
+ </release>
<release version="4.0.6" date="2024-05-11">
<action dev="jahlborn" type="update">
Make NumberFormatter locale aware, using the currently configured
diff --git a/src/main/java/com/healthmarketscience/jackcess/impl/General97IndexCodes.java b/src/main/java/com/healthmarketscience/jackcess/impl/General97IndexCodes.java
index 77ebb67..a988a03 100644
--- a/src/main/java/com/healthmarketscience/jackcess/impl/General97IndexCodes.java
+++ b/src/main/java/com/healthmarketscience/jackcess/impl/General97IndexCodes.java
@@ -114,7 +114,7 @@ public class General97IndexCodes extends GeneralLegacyIndexCodes
char c = str.charAt(i);
CharHandler ch = getCharHandler(c);
- byte[] bytes = ch.getInlineBytes();
+ byte[] bytes = ch.getInlineBytes(c);
if(bytes != null) {
// write the "inline" codes immediately
bout.write(bytes);
diff --git a/src/main/java/com/healthmarketscience/jackcess/impl/GeneralLegacyIndexCodes.java b/src/main/java/com/healthmarketscience/jackcess/impl/GeneralLegacyIndexCodes.java
index 0d151e0..df3a781 100644
--- a/src/main/java/com/healthmarketscience/jackcess/impl/GeneralLegacyIndexCodes.java
+++ b/src/main/java/com/healthmarketscience/jackcess/impl/GeneralLegacyIndexCodes.java
@@ -103,6 +103,12 @@ public class GeneralLegacyIndexCodes {
return parseSignificantCodes(codeStrings);
}
},
+ SURROGATE("Q") {
+ @Override public CharHandler parseCodes(String[] codeStrings) {
+ // these are not parsed from the codes files
+ throw new UnsupportedOperationException();
+ }
+ },
IGNORED("X") {
@Override public CharHandler parseCodes(String[] codeStrings) {
return IGNORED_CHAR_HANDLER;
@@ -128,7 +134,7 @@ public class GeneralLegacyIndexCodes {
*/
abstract static class CharHandler {
public abstract Type getType();
- public byte[] getInlineBytes() {
+ public byte[] getInlineBytes(char c) {
return null;
}
public byte[] getExtraBytes() {
@@ -159,7 +165,7 @@ public class GeneralLegacyIndexCodes {
@Override public Type getType() {
return Type.SIMPLE;
}
- @Override public byte[] getInlineBytes() {
+ @Override public byte[] getInlineBytes(char c) {
return _bytes;
}
}
@@ -177,7 +183,7 @@ public class GeneralLegacyIndexCodes {
@Override public Type getType() {
return Type.INTERNATIONAL;
}
- @Override public byte[] getInlineBytes() {
+ @Override public byte[] getInlineBytes(char c) {
return _bytes;
}
@Override public byte[] getExtraBytes() {
@@ -233,7 +239,7 @@ public class GeneralLegacyIndexCodes {
@Override public Type getType() {
return Type.INTERNATIONAL_EXT;
}
- @Override public byte[] getInlineBytes() {
+ @Override public byte[] getInlineBytes(char c) {
return _bytes;
}
@Override public byte[] getExtraBytes() {
@@ -255,7 +261,7 @@ public class GeneralLegacyIndexCodes {
@Override public Type getType() {
return Type.SIGNIFICANT;
}
- @Override public byte[] getInlineBytes() {
+ @Override public byte[] getInlineBytes(char c) {
return _bytes;
}
@Override public boolean isSignificantChar() {
@@ -270,17 +276,63 @@ public class GeneralLegacyIndexCodes {
}
};
- /** alternate shared CharHandler instance for "surrogate" chars (which we do
- not handle) */
- static final CharHandler SURROGATE_CHAR_HANDLER = new CharHandler() {
+ /** the surrogate char bytes are computed on the fly, so a per-thread
+ buffer is re-used to hold them */
+ private static final ThreadLocal<byte[]> SURROGATE_CHAR_BUF =
+ ThreadLocal.withInitial(() -> new byte[2]);
+ private static final byte[] SURROGATE_EXTRA_BYTES = {0x3f};
+
+ private static abstract class SurrogateCharHandler extends CharHandler {
@Override public Type getType() {
- return Type.IGNORED;
+ return Type.SURROGATE;
}
- @Override public byte[] getInlineBytes() {
- throw new IllegalStateException(
- "Surrogate pair chars are not handled");
+ @Override public byte[] getExtraBytes() {
+ return SURROGATE_EXTRA_BYTES;
}
- };
+ protected static byte[] toInlineBytes(int idxC) {
+ byte[] bytes = SURROGATE_CHAR_BUF.get();
+ bytes[0] = (byte)((idxC >>> 8) & 0xFF);
+ bytes[1] = (byte)(idxC & 0xFF);
+ return bytes;
+ }
+ }
+
+ /** shared CharHandler instance for "high surrogate" chars (which are
+ computed) */
+ static final CharHandler HIGH_SURROGATE_CHAR_HANDLER =
+ new SurrogateCharHandler() {
+ @Override public byte[] getInlineBytes(char c) {
+ // the high surrogate bytes seem to be computed from a fixed offset
+ int idxC = asUnsignedChar(c) - 10238;
+ return toInlineBytes(idxC);
+ }
+ };
+
+ /** shared CharHandler instance for "low surrogate" chars (which are
+ computed) */
+ static final CharHandler LOW_SURROGATE_CHAR_HANDLER =
+ new SurrogateCharHandler() {
+ @Override public byte[] getInlineBytes(char c) {
+ // the low surrogate bytes are computed with a specific offset based on
+ // the char's location within a 1024 character block.
+ int charOffset = (asUnsignedChar(c) - 0xdc00) % 1024;
+
+ int idxOffset = 0;
+ if(charOffset < 8) {
+ idxOffset = 9992;
+ } else if(charOffset < (8 + 254)) {
+ idxOffset = 9990;
+ } else if(charOffset < (8 + 254 + 254)) {
+ idxOffset = 9988;
+ } else if(charOffset < (8 + 254 + 254 + 254)) {
+ idxOffset = 9986;
+ } else {
+ idxOffset = 9984;
+ }
+ int idxC = asUnsignedChar(c) - idxOffset;
+ return toInlineBytes(idxC);
+ }
+ };
static final char FIRST_CHAR = (char)0x0000;
static final char LAST_CHAR = (char)0x00FF;
@@ -349,9 +401,12 @@ public class GeneralLegacyIndexCodes {
for(int i = start; i <= end; ++i) {
char c = (char)i;
CharHandler ch = null;
- if(Character.isHighSurrogate(c) || Character.isLowSurrogate(c)) {
+ if(Character.isHighSurrogate(c)) {
// surrogate chars are not included in the codes files
- ch = SURROGATE_CHAR_HANDLER;
+ ch = HIGH_SURROGATE_CHAR_HANDLER;
+ } else if(Character.isLowSurrogate(c)) {
+ // surrogate chars are not included in the codes files
+ ch = LOW_SURROGATE_CHAR_HANDLER;
} else {
String codeLine = reader.readLine();
ch = parseCodes(prefixMap, codeLine);
@@ -529,7 +584,7 @@ public class GeneralLegacyIndexCodes {
CharHandler ch = getCharHandler(c);
int curCharOffset = charOffset;
- byte[] bytes = ch.getInlineBytes();
+ byte[] bytes = ch.getInlineBytes(c);
if(bytes != null) {
// write the "inline" codes immediately
bout.write(bytes);
@@ -667,7 +722,6 @@ public class GeneralLegacyIndexCodes {
private static void writeExtraCodes(
int charOffset, byte[] bytes, byte extraCodeModifier,
ExtraCodesStream extraCodes)
- throws IOException
{
// we fill in a placeholder value for any chars w/out extra codes
int numChars = extraCodes.getNumChars();
@@ -712,7 +766,6 @@ public class GeneralLegacyIndexCodes {
*/
private static boolean trimExtraCodes(ByteStream extraCodes,
byte minTrimCode, byte maxTrimCode)
- throws IOException
{
if(extraCodes == null) {
return false;
@@ -730,7 +783,6 @@ public class GeneralLegacyIndexCodes {
private static void writeUnprintableCodes(
int charOffset, byte[] bytes, ByteStream unprintableCodes,
ExtraCodesStream extraCodes)
- throws IOException
{
// the offset seems to be calculated based on the number of bytes in the
// "extra codes" part of the entry (even if there are no extra codes bytes
@@ -764,7 +816,6 @@ public class GeneralLegacyIndexCodes {
* Encode the given crazy code bytes into the given byte stream.
*/
private static void writeCrazyCodes(ByteStream crazyCodes, ByteStream bout)
- throws IOException
{
// CRAZY_CODE_2 flags at the end are ignored, so ditch them
trimExtraCodes(crazyCodes, CRAZY_CODE_2, CRAZY_CODE_2);
diff --git a/src/test/data/V2010/testEmoticonsV2010.accdb b/src/test/data/V2010/testEmoticonsV2010.accdb
new file mode 100755
index 0000000..8b70b95
--- /dev/null
+++ b/src/test/data/V2010/testEmoticonsV2010.accdb
Binary files differ
diff --git a/src/test/java/com/healthmarketscience/jackcess/TestUtil.java b/src/test/java/com/healthmarketscience/jackcess/TestUtil.java
index fae1fa0..7626ee4 100644
--- a/src/test/java/com/healthmarketscience/jackcess/TestUtil.java
+++ b/src/test/java/com/healthmarketscience/jackcess/TestUtil.java
@@ -361,16 +361,25 @@ public class TestUtil
}
static void dumpIndex(Index index) throws Exception {
- dumpIndex(index, new PrintWriter(System.out, true));
+ dumpIndex(index, Integer.MAX_VALUE);
}
- static void dumpIndex(Index index, PrintWriter writer) throws Exception {
+ static void dumpIndex(Index index, int limit) throws Exception {
+ dumpIndex(index, new PrintWriter(System.out, true), limit);
+ }
+
+ static void dumpIndex(Index index, PrintWriter writer, int limit)
+ throws Exception {
writer.println("INDEX: " + index);
IndexData.EntryCursor ec = ((IndexImpl)index).cursor();
IndexData.Entry lastE = ec.getLastEntry();
IndexData.Entry e = null;
+ int count = 0;
while((e = ec.getNextEntry()) != lastE) {
writer.println(e);
+ if((count++) > limit) {
+ break;
+ }
}
}
diff --git a/src/test/java/com/healthmarketscience/jackcess/impl/IndexCodesTest.java b/src/test/java/com/healthmarketscience/jackcess/impl/IndexCodesTest.java
index cfcca83..ce4bb52 100644
--- a/src/test/java/com/healthmarketscience/jackcess/impl/IndexCodesTest.java
+++ b/src/test/java/com/healthmarketscience/jackcess/impl/IndexCodesTest.java
@@ -64,7 +64,17 @@ public class IndexCodesTest extends TestCase {
public void testIndexCodes() throws Exception
{
- for (final TestDB testDB : TestDB.getSupportedForBasename(Basename.INDEX_CODES, true)) {
+ doTestDb(Basename.INDEX_CODES);
+ }
+
+ public void testEmoticons() throws Exception
+ {
+ doTestDb(Basename.EMOTICONS);
+ }
+
+ private static void doTestDb(Basename dbBaseName) throws Exception
+ {
+ for (final TestDB testDB : TestDB.getSupportedForBasename(dbBaseName, true)) {
Database db = openMem(testDB);
db.setDateTimeType(DateTimeType.DATE);
diff --git a/src/test/java/com/healthmarketscience/jackcess/impl/JetFormatTest.java b/src/test/java/com/healthmarketscience/jackcess/impl/JetFormatTest.java
index 0f6c889..2379891 100644
--- a/src/test/java/com/healthmarketscience/jackcess/impl/JetFormatTest.java
+++ b/src/test/java/com/healthmarketscience/jackcess/impl/JetFormatTest.java
@@ -58,7 +58,8 @@ public class JetFormatTest extends TestCase {
CALC_FIELD("calcFieldTest"),
BINARY_INDEX("binIdxTest"),
OLD_DATES("oldDates"),
- EXT_DATE("extDateTest");
+ EXT_DATE("extDateTest"),
+ EMOTICONS("testEmoticons");
private final String _basename;