aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author James Ahlborn <jtahlborn@yahoo.com> 2013-09-15 02:05:11 +0000
committer James Ahlborn <jtahlborn@yahoo.com> 2013-09-15 02:05:11 +0000
commit 08c5914dc92523a8edb986a178d38abdb5f8db28 (patch)
tree 240e6656539d79fba35236d1d6361837dfbb6c4c
parent 012c3e7c30a0e09f788e8b43374549c07657ca5d (diff)
download jackcess-08c5914dc92523a8edb986a178d38abdb5f8db28.tar.gz
jackcess-08c5914dc92523a8edb986a178d38abdb5f8db28.zip
implement nicer ascii text conversion for ole package paths
git-svn-id: https://svn.code.sf.net/p/jackcess/code/jackcess/trunk@801 f203690c-595d-4dc9-a70b-905162fa7fd2
-rw-r--r-- src/main/java/com/healthmarketscience/jackcess/impl/OleUtil.java 17
1 files changed, 17 insertions, 0 deletions
diff --git a/src/main/java/com/healthmarketscience/jackcess/impl/OleUtil.java b/src/main/java/com/healthmarketscience/jackcess/impl/OleUtil.java
index b82ded2..4471da5 100644
--- a/src/main/java/com/healthmarketscience/jackcess/impl/OleUtil.java
+++ b/src/main/java/com/healthmarketscience/jackcess/impl/OleUtil.java
@@ -30,8 +30,10 @@ import java.nio.charset.Charset;
import java.sql.Blob;
import java.sql.SQLException;
import java.sql.SQLFeatureNotSupportedException;
+import java.text.Normalizer;
import java.util.EnumSet;
import java.util.Set;
+import java.util.regex.Pattern;
import com.healthmarketscience.jackcess.DataType;
import com.healthmarketscience.jackcess.util.OleBlob;
@@ -78,6 +80,10 @@ public class OleUtil
0x00, 0x00, 0x01, (byte)0xAD, 0x05, (byte)0xFE
};
+ // regex matching runs of accent-like chars: anything in the Combining Diacritical Marks block, or in the Lm (modifier letter) or Sk (modifier symbol) categories
+ private static final Pattern UNICODE_ACCENT_PATTERN =
+ Pattern.compile("[\\p{InCombiningDiacriticalMarks}\\p{IsLm}\\p{IsSk}]+");
+
private static final CompoundPackageFactory COMPOUND_FACTORY;
static {
@@ -433,6 +439,17 @@ public class OleUtil
}
/**
 * Returns the bytes of the given string plus a single trailing NUL char,
 * encoded using OLE_CHARSET, after accent-like characters have been stripped
 * from the string.
 *
 * @param str the text to encode
 * @return the encoded, NUL-terminated bytes
 */
private static byte[] getZeroTermStrBytes(String str) {
+ // since we are converting to ascii, try to make "nicer" versions of
+ // accented chars (e.g. convert "u with an umlaut" to just "u"). this may
+ // not ultimately help anything but it is what ms access does.
+
+ // decompose composed chars into a base char followed by its accent marks
+ str = Normalizer.normalize(str, Normalizer.Form.NFD);
+ // strip the accents (everything matched by UNICODE_ACCENT_PATTERN)
+ str = UNICODE_ACCENT_PATTERN.matcher(str).replaceAll("");
+ // (re)normalize what is left back into composed form
+ str = Normalizer.normalize(str, Normalizer.Form.NFC);
+
// the NUL terminator is appended before encoding so it is part of the result
return (str + '\0').getBytes(OLE_CHARSET);
}