diff options
author | James Ahlborn <jtahlborn@yahoo.com> | 2013-09-15 02:05:11 +0000 |
---|---|---|
committer | James Ahlborn <jtahlborn@yahoo.com> | 2013-09-15 02:05:11 +0000 |
commit | 08c5914dc92523a8edb986a178d38abdb5f8db28 (patch) | |
tree | 240e6656539d79fba35236d1d6361837dfbb6c4c | |
parent | 012c3e7c30a0e09f788e8b43374549c07657ca5d (diff) | |
download | jackcess-08c5914dc92523a8edb986a178d38abdb5f8db28.tar.gz jackcess-08c5914dc92523a8edb986a178d38abdb5f8db28.zip |
implement nicer ascii text conversion for ole package paths
git-svn-id: https://svn.code.sf.net/p/jackcess/code/jackcess/trunk@801 f203690c-595d-4dc9-a70b-905162fa7fd2
-rw-r--r-- | src/main/java/com/healthmarketscience/jackcess/impl/OleUtil.java | 17 |
1 files changed, 17 insertions, 0 deletions
diff --git a/src/main/java/com/healthmarketscience/jackcess/impl/OleUtil.java b/src/main/java/com/healthmarketscience/jackcess/impl/OleUtil.java
index b82ded2..4471da5 100644
--- a/src/main/java/com/healthmarketscience/jackcess/impl/OleUtil.java
+++ b/src/main/java/com/healthmarketscience/jackcess/impl/OleUtil.java
@@ -30,8 +30,10 @@ import java.nio.charset.Charset;
 import java.sql.Blob;
 import java.sql.SQLException;
 import java.sql.SQLFeatureNotSupportedException;
+import java.text.Normalizer;
 import java.util.EnumSet;
 import java.util.Set;
+import java.util.regex.Pattern;
 
 import com.healthmarketscience.jackcess.DataType;
 import com.healthmarketscience.jackcess.util.OleBlob;
@@ -78,6 +80,10 @@ public class OleUtil
     0x00, 0x00, 0x01, (byte)0xAD, 0x05, (byte)0xFE
   };
 
+  // regex pattern which matches all the crazy extra stuff in unicode
+  private static final Pattern UNICODE_ACCENT_PATTERN =
+    Pattern.compile("[\\p{InCombiningDiacriticalMarks}\\p{IsLm}\\p{IsSk}]+");
+
   private static final CompoundPackageFactory COMPOUND_FACTORY;
 
   static {
@@ -433,6 +439,17 @@ public class OleUtil
   }
 
   private static byte[] getZeroTermStrBytes(String str) {
+    // since we are converting to ascii, try to make "nicer" versions of crazy
+    // chars (e.g. convert "u with an umlaut" to just "u"). this may not
+    // ultimately help anything but it is what ms access does.
+
+    // decompose complex chars into combos of char and accent
+    str = Normalizer.normalize(str, Normalizer.Form.NFD);
+    // strip the accents
+    str = UNICODE_ACCENT_PATTERN.matcher(str).replaceAll("");
+    // (re)normalize what is left
+    str = Normalizer.normalize(str, Normalizer.Form.NFC);
+
     return (str + '\0').getBytes(OLE_CHARSET);
   }
 