From 08c5914dc92523a8edb986a178d38abdb5f8db28 Mon Sep 17 00:00:00 2001 From: James Ahlborn Date: Sun, 15 Sep 2013 02:05:11 +0000 Subject: [PATCH] implement nicer ascii text conversion for ole package paths git-svn-id: https://svn.code.sf.net/p/jackcess/code/jackcess/trunk@801 f203690c-595d-4dc9-a70b-905162fa7fd2 --- .../jackcess/impl/OleUtil.java | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/src/main/java/com/healthmarketscience/jackcess/impl/OleUtil.java b/src/main/java/com/healthmarketscience/jackcess/impl/OleUtil.java index b82ded2..4471da5 100644 --- a/src/main/java/com/healthmarketscience/jackcess/impl/OleUtil.java +++ b/src/main/java/com/healthmarketscience/jackcess/impl/OleUtil.java @@ -30,8 +30,10 @@ import java.nio.charset.Charset; import java.sql.Blob; import java.sql.SQLException; import java.sql.SQLFeatureNotSupportedException; +import java.text.Normalizer; import java.util.EnumSet; import java.util.Set; +import java.util.regex.Pattern; import com.healthmarketscience.jackcess.DataType; import com.healthmarketscience.jackcess.util.OleBlob; @@ -78,6 +80,10 @@ public class OleUtil 0x00, 0x00, 0x01, (byte)0xAD, 0x05, (byte)0xFE }; + // regex pattern which matches unicode combining diacritical marks, modifier letters (Lm) and modifier symbols (Sk) + private static final Pattern UNICODE_ACCENT_PATTERN = + Pattern.compile("[\\p{InCombiningDiacriticalMarks}\\p{IsLm}\\p{IsSk}]+"); + private static final CompoundPackageFactory COMPOUND_FACTORY; static { @@ -433,6 +439,17 @@ public class OleUtil } private static byte[] getZeroTermStrBytes(String str) { + // since we are converting to ascii, try to make "nicer" versions of crazy + // chars (e.g. convert "u with an umlaut" to just "u"). this may not + // ultimately help anything but it is what MS Access does. 
+ + // decompose complex chars into combos of char and accent + str = Normalizer.normalize(str, Normalizer.Form.NFD); + // strip the accents + str = UNICODE_ACCENT_PATTERN.matcher(str).replaceAll(""); + // (re)normalize what is left + str = Normalizer.normalize(str, Normalizer.Form.NFC); + return (str + '\0').getBytes(OLE_CHARSET); } -- 2.39.5