From f74a8f9abb6e794bc7e27bea8553c106d43c4037 Mon Sep 17 00:00:00 2001 From: PJ Fanning Date: Sun, 26 May 2019 09:43:59 +0000 Subject: [PATCH] =?utf8?q?[github-149]=20improve=20MAPIMessage.guess7BitEn?= =?utf8?q?coding,=20improve=20MAPIMessage.getHtmlBody.=20Thanks=20to=20Dom?= =?utf8?q?inik=20H=C3=B6lzl.=20This=20closes=20#149?= MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1860043 13f79535-47bb-0310-9956-ffa450edef68 --- src/java/org/apache/poi/util/LocaleUtil.java | 468 ++++++++++++++++++ .../src/org/apache/poi/hsmf/MAPIMessage.java | 196 +++++--- .../poi/hsmf/datatypes/MAPIProperty.java | 2 + .../org/apache/poi/hsmf/AllHSMFTests.java | 3 +- .../org/apache/poi/hsmf/Test7BitCodepage.java | 85 ++++ test-data/hsmf/ASCII_CP1251_LCID1049.msg | Bin 0 -> 3584 bytes .../hsmf/ASCII_UTF-8_CP1252_LCID1031.msg | Bin 0 -> 3584 bytes .../hsmf/ASCII_UTF-8_CP1252_LCID1031_HTML.msg | Bin 0 -> 3584 bytes test-data/hsmf/HTMLBodyBinary_CP1251.msg | Bin 0 -> 3584 bytes test-data/hsmf/HTMLBodyBinary_UTF-8.msg | Bin 0 -> 3584 bytes 10 files changed, 679 insertions(+), 75 deletions(-) create mode 100644 src/scratchpad/testcases/org/apache/poi/hsmf/Test7BitCodepage.java create mode 100644 test-data/hsmf/ASCII_CP1251_LCID1049.msg create mode 100644 test-data/hsmf/ASCII_UTF-8_CP1252_LCID1031.msg create mode 100644 test-data/hsmf/ASCII_UTF-8_CP1252_LCID1031_HTML.msg create mode 100644 test-data/hsmf/HTMLBodyBinary_CP1251.msg create mode 100644 test-data/hsmf/HTMLBodyBinary_UTF-8.msg diff --git a/src/java/org/apache/poi/util/LocaleUtil.java b/src/java/org/apache/poi/util/LocaleUtil.java index 70d9a50750..c440fc96a4 100644 --- a/src/java/org/apache/poi/util/LocaleUtil.java +++ b/src/java/org/apache/poi/util/LocaleUtil.java @@ -616,5 +616,473 @@ public final class LocaleUtil { } } + /** + * Get default code page from LCID value + * + * @param lcid the LCID value + * @return the default code page + */ + public static int getDefaultCodePageFromLCID(int lcid) { + int languageId = lcid & 0xFFFF; + switch (languageId) { + case 0x0001: return 1256; + case 0x0002: return 1251; + case 0x0003: return 1252; + case 0x0004: return 936; + case 0x0005: return 1250; + case 0x0006: return 1252; + case 0x0007: return 1252; + case 0x0008: return 1253; + case 0x0009: return 1252; + case 0x000a: return 1252; + case 0x000b: return 1252; + case 0x000c: return 1252; + case 0x000d: return 1255; + case 0x000e: return 1250; + case 0x000f: return 1252; + case 0x0010: return 1252; + case 0x0011: return 932; + case 0x0012: return 949; + case 0x0013: return 1252; + case 0x0014: return 1252; + case 0x0015: return 1250; + case 0x0016: return 1252; + case 0x0017: return 1252; + case 0x0018: return 1250; + case 0x0019: return 1251; + case 0x001a: return 1250; + case 0x001b: return 1250; + case 0x001c: return 1250; + case 0x001d: return 1252; + case 0x001e: return 874; + case 0x001f: return 1254; + case 0x0020: return 1256; + case 0x0021: return 1252; + case 0x0022: return 1251; + case 0x0023: return 1251; + case 0x0024: return 1250; + case 0x0025: return 1257; + case 0x0026: return 1257; + case 0x0027: return 1257; + case 0x0028: return 1251; + case 0x0029: return 1256; + case 0x002a: return 1258; + case 0x002b: return 0; + case 0x002c: return 1254; + case 0x002d: return 1252; + case 0x002e: return 1252; + case 0x002f: return 1251; + case 0x0030: return 0; + case 0x0031: return 0; + case 0x0032: return 1252; + case 0x0033: return 32759; + case 0x0034: return 1252; + case 0x0035: return 1252; + case 0x0036: return 1252; + case 0x0037: return 0; + case 0x0038: return 1252; + case 0x0039: return 0; + case 0x003a: return 0; + case 0x003b: return 1252; + case 0x003c: return 1252; + case 0x003d: return 32759; + case 0x003e: return 1252; + case 0x003f: return 0; + case 0x0040: return 1251; + case 0x0041: return 1252; + case 0x0042: return 1250; + case 0x0043: return 1254; + case 0x0044: return 1251; + case 0x0045: return 0; + case 0x0046: return 0; + case 0x0047: return 0; + case 0x0048: return 0; + case 0x0049: return 0; + case 0x004a: return 0; + case 0x004b: return 0; + case 0x004c: return 0; + case 0x004d: return 0; + case 0x004e: return 0; + case 0x004f: return 0; + case 0x0050: return 1251; + case 0x0051: return 0; + case 0x0052: return 1252; + case 0x0053: return 0; + case 0x0054: return 0; + case 0x0055: return 0; + case 0x0056: return 1252; + case 0x0057: return 0; + case 0x0058: return 32759; + case 0x0059: return 1256; + case 0x005a: return 0; + case 0x005b: return 0; + case 0x005c: return 0; + case 0x005d: return 1252; + case 0x005e: return 0; + case 0x005f: return 1252; + case 0x0060: return 32759; + case 0x0061: return 0; + case 0x0062: return 1252; + case 0x0063: return 0; + case 0x0064: return 1252; + case 0x0065: return 0; + case 0x0066: return 32759; + case 0x0067: return 1252; + case 0x0068: return 1252; + case 0x0069: return 32759; + case 0x006a: return 1252; + case 0x006b: return 1252; + case 0x006c: return 1252; + case 0x006d: return 1251; + case 0x006e: return 1252; + case 0x006f: return 1252; + case 0x0070: return 1252; + case 0x0071: return 32759; + case 0x0072: return 0; + case 0x0073: return 0; + case 0x0074: return 1252; + case 0x0075: return 1252; + case 0x0076: return 32759; + case 0x0077: return 0; + case 0x0078: return 0; + case 0x0079: return 32759; + case 0x007a: return 1252; + case 0x007b: return 32759; + case 0x007c: return 1252; + case 0x007d: return 32759; + case 0x007e: return 1252; + case 0x007f: return 1252; + case 0x0080: return 1256; + case 0x0081: return 0; + case 0x0082: return 1252; + case 0x0083: return 1252; + case 0x0084: return 1252; + case 0x0085: return 1251; + case 0x0086: return 1252; + case 0x0087: return 1252; + case 0x0088: return 1252; + case 0x0089: return 32759; + case 0x008a: return 32759; + case 0x008b: return 32759; + case 0x008c: return 1256; + case 0x008d: return 32759; + case 0x008e: return 32759; + case 0x008f: return 32759; + case 0x0090: return 32759; + case 0x0091: return 1252; + case 0x0092: return 1256; + case 0x0093: return 32759; + case 0x0401: return 1256; + case 0x0402: return 1251; + case 0x0403: return 1252; + case 0x0404: return 950; + case 0x0405: return 1250; + case 0x0406: return 1252; + case 0x0407: return 1252; + case 0x0408: return 1253; + case 0x0409: return 1252; + case 0x040a: return 1252; + case 0x040b: return 1252; + case 0x040c: return 1252; + case 0x040d: return 1255; + case 0x040e: return 1250; + case 0x040f: return 1252; + case 0x0410: return 1252; + case 0x0411: return 932; + case 0x0412: return 949; + case 0x0413: return 1252; + case 0x0414: return 1252; + case 0x0415: return 1250; + case 0x0416: return 1252; + case 0x0417: return 1252; + case 0x0418: return 1250; + case 0x0419: return 1251; + case 0x041a: return 1250; + case 0x041b: return 1250; + case 0x041c: return 1250; + case 0x041d: return 1252; + case 0x041e: return 874; + case 0x041f: return 1254; + case 0x0420: return 1256; + case 0x0421: return 1252; + case 0x0422: return 1251; + case 0x0423: return 1251; + case 0x0424: return 1250; + case 0x0425: return 1257; + case 0x0426: return 1257; + case 0x0427: return 1257; + case 0x0428: return 1251; + case 0x0429: return 1256; + case 0x042a: return 1258; + case 0x042b: return 0; + case 0x042c: return 1254; + case 0x042d: return 1252; + case 0x042e: return 1252; + case 0x042f: return 1251; + case 0x0430: return 0; + case 0x0431: return 0; + case 0x0432: return 1252; + case 0x0433: return 32759; + case 0x0434: return 1252; + case 0x0435: return 1252; + case 0x0436: return 1252; + case 0x0437: return 0; + case 0x0438: return 1252; + case 0x0439: return 0; + case 0x043a: return 0; + case 0x043b: return 1252; + case 0x043d: return 32759; + case 0x043e: return 1252; + case 0x043f: return 0; + case 0x0440: return 1251; + case 0x0441: return 1252; + case 0x0442: return 1250; + case 0x0443: return 1254; + case 0x0444: return 1251; + case 0x0445: return 0; + case 0x0446: return 0; + case 0x0447: return 0; + case 0x0448: return 0; + case 0x0449: return 0; + case 0x044a: return 0; + case 0x044b: return 0; + case 0x044c: return 0; + case 0x044d: return 0; + case 0x044e: return 0; + case 0x044f: return 0; + case 0x0450: return 1251; + case 0x0451: return 0; + case 0x0452: return 1252; + case 0x0453: return 0; + case 0x0454: return 0; + case 0x0455: return 0; + case 0x0456: return 1252; + case 0x0457: return 0; + case 0x0458: return 32759; + case 0x0459: return 32759; + case 0x045a: return 0; + case 0x045b: return 0; + case 0x045c: return 0; + case 0x045d: return 0; + case 0x045e: return 0; + case 0x045f: return 32759; + case 0x0460: return 32759; + case 0x0461: return 0; + case 0x0462: return 1252; + case 0x0463: return 0; + case 0x0464: return 1252; + case 0x0465: return 0; + case 0x0466: return 32759; + case 0x0467: return 32759; + case 0x0468: return 1252; + case 0x0469: return 32759; + case 0x046a: return 1252; + case 0x046b: return 1252; + case 0x046c: return 1252; + case 0x046d: return 1251; + case 0x046e: return 1252; + case 0x046f: return 1252; + case 0x0470: return 1252; + case 0x0471: return 32759; + case 0x0472: return 0; + case 0x0473: return 0; + case 0x0474: return 1252; + case 0x0475: return 1252; + case 0x0476: return 32759; + case 0x0477: return 0; + case 0x0478: return 0; + case 0x0479: return 32759; + case 0x047a: return 1252; + case 0x047c: return 1252; + case 0x047e: return 1252; + case 0x0480: return 1256; + case 0x0481: return 0; + case 0x0482: return 1252; + case 0x0483: return 1252; + case 0x0484: return 1252; + case 0x0485: return 1251; + case 0x0486: return 1252; + case 0x0487: return 1252; + case 0x0488: return 1252; + case 0x048c: return 1256; + case 0x048d: return 32759; + case 0x048e: return 32759; + case 0x048f: return 32759; + case 0x0490: return 32759; + case 0x0491: return 1252; + case 0x0492: return 1256; + case 0x0493: return 32759; + case 0x0501: return 1250; + case 0x05fe: return 932; + case 0x0801: return 1256; + case 0x0803: return 1252; + case 0x0804: return 936; + case 0x0807: return 1252; + case 0x0809: return 1252; + case 0x080a: return 1252; + case 0x080c: return 1252; + case 0x0810: return 1252; + case 0x0811: return 32759; + case 0x0813: return 1252; + case 0x0814: return 1252; + case 0x0816: return 1252; + case 0x0818: return 0; + case 0x0819: return 32759; + case 0x081a: return 1250; + case 0x081d: return 1252; + case 0x0820: return 0; + case 0x0827: return 32759; + case 0x082c: return 1251; + case 0x082e: return 1252; + case 0x0832: return 1252; + case 0x083b: return 1252; + case 0x083c: return 1252; + case 0x083e: return 1252; + case 0x0843: return 1251; + case 0x0845: return 0; + case 0x0846: return 1256; + case 0x0849: return 0; + case 0x0850: return 0; + case 0x0851: return 32759; + case 0x0859: return 1256; + case 0x085d: return 1252; + case 0x085f: return 1252; + case 0x0860: return 32759; + case 0x0861: return 0; + case 0x0867: return 1252; + case 0x086b: return 1252; + case 0x0873: return 0; + case 0x09ff: return 1256; + case 0x0c01: return 1256; + case 0x0c04: return 950; + case 0x0c07: return 1252; + case 0x0c09: return 1252; + case 0x0c0a: return 1252; + case 0x0c0c: return 1252; + case 0x0c1a: return 1251; + case 0x0c3b: return 1252; + case 0x0c5f: return 32759; + case 0x0c6b: return 1252; + case 0x1001: return 1256; + case 0x1004: return 936; + case 0x1007: return 1252; + case 0x1009: return 1252; + case 0x100a: return 1252; + case 0x100c: return 1252; + case 0x101a: return 1250; + case 0x103b: return 1252; + case 0x1401: return 1256; + case 0x1404: return 950; + case 0x1407: return 1252; + case 0x1409: return 1252; + case 0x140a: return 1252; + case 0x140c: return 1252; + case 0x141a: return 1250; + case 0x143b: return 1252; + case 0x1801: return 1256; + case 0x1809: return 1252; + case 0x180a: return 1252; + case 0x180c: return 1252; + case 0x181a: return 1250; + case 0x183b: return 1252; + case 0x1c01: return 1256; + case 0x1c09: return 1252; + case 0x1c0a: return 1252; + case 0x1c0c: return 32759; + case 0x1c1a: return 1251; + case 0x1c3b: return 1252; + case 0x2001: return 1256; + case 0x2008: return 32759; + case 0x2009: return 1252; + case 0x200a: return 1252; + case 0x200c: return 0; + case 0x201a: return 1251; + case 0x203b: return 1252; + case 0x2401: return 1256; + case 0x2409: return 1252; + case 0x240a: return 1252; + case 0x240c: return 0; + case 0x241a: return 1250; + case 0x243b: return 1252; + case 0x2801: return 1256; + case 0x2809: return 1252; + case 0x280a: return 1252; + case 0x280c: return 0; + case 0x281a: return 1251; + case 0x2c01: return 1256; + case 0x2c09: return 1252; + case 0x2c0a: return 1252; + case 0x2c0c: return 0; + case 0x2c1a: return 1250; + case 0x3001: return 1256; + case 0x3009: return 1252; + case 0x300a: return 1252; + case 0x300c: return 0; + case 0x301a: return 1251; + case 0x3401: return 1256; + case 0x3409: return 1252; + case 0x340a: return 1252; + case 0x340c: return 0; + case 0x3801: return 1256; + case 0x3809: return 32759; + case 0x380a: return 1252; + case 0x380c: return 0; + case 0x3c01: return 1256; + case 0x3c09: return 0; + case 0x3c0a: return 1252; + case 0x3c0c: return 0; + case 0x4001: return 1256; + case 0x4009: return 1252; + case 0x400a: return 1252; + case 0x4401: return 32759; + case 0x4409: return 1252; + case 0x440a: return 1252; + case 0x4801: return 32759; + case 0x4809: return 1252; + case 0x480a: return 1252; + case 0x4c09: return 32759; + case 0x4c0a: return 1252; + case 0x5009: return 32759; + case 0x500a: return 1252; + case 0x5409: return 32759; + case 0x540a: return 1252; + case 0x5809: return 32759; + case 0x5c09: return 32759; + case 0x6009: return 32759; + case 0x6409: return 32759; + case 0x641a: return 1251; + case 0x681a: return 1250; + case 0x6c1a: return 1251; + case 0x701a: return 1250; + case 0x703b: return 1252; + case 0x742c: return 1251; + case 0x743b: return 1252; + case 0x7804: return 936; + case 0x7814: return 1252; + case 0x781a: return 1250; + case 0x782c: return 1254; + case 0x783b: return 1252; + case 0x7843: return 1251; + case 0x7850: return 1251; + case 0x785d: return 0; + case 0x7c04: return 950; + case 0x7c14: return 1252; + case 0x7c1a: return 1250; + case 0x7c28: return 1251; + case 0x7c2e: return 1252; + case 0x7c3b: return 1252; + case 0x7c43: return 1254; + case 0x7c46: return 1256; + case 0x7c50: return 0; + case 0x7c59: return 1256; + case 0x7c5c: return 0; + case 0x7c5d: return 1252; + case 0x7c5f: return 1252; + case 0x7c67: return 1252; + case 0x7c68: return 1252; + case 0x7c92: return 1256; + default: return 0; + } + } } diff --git a/src/scratchpad/src/org/apache/poi/hsmf/MAPIMessage.java b/src/scratchpad/src/org/apache/poi/hsmf/MAPIMessage.java index 5b0941e4b3..ef2771b9fe 100644 --- a/src/scratchpad/src/org/apache/poi/hsmf/MAPIMessage.java +++ b/src/scratchpad/src/org/apache/poi/hsmf/MAPIMessage.java @@ -50,6 +50,7 @@ import org.apache.poi.hsmf.parsers.POIFSChunkParser; import org.apache.poi.poifs.filesystem.DirectoryNode; import org.apache.poi.poifs.filesystem.POIFSFileSystem; import org.apache.poi.util.CodePageUtil; +import org.apache.poi.util.LocaleUtil; import org.apache.poi.util.POILogFactory; import org.apache.poi.util.POILogger; @@ -210,8 +211,21 @@ public class MAPIMessage extends POIReadOnlyDocument { * returnNullOnMissingChunk is set */ public String getHtmlBody() throws ChunkNotFoundException { - if(mainChunks.getHtmlBodyChunkBinary() != null) { - return mainChunks.getHtmlBodyChunkBinary().getAs7bitString(); + ByteChunk htmlBodyBinaryChunk = mainChunks.getHtmlBodyChunkBinary(); + if (htmlBodyBinaryChunk != null) { + List cpid = mainChunks.getProperties().get(MAPIProperty.INTERNET_CPID); + if (cpid != null && cpid.size() > 0) { + int codepage = ((LongPropertyValue) cpid.get(0)).getValue(); + try { + String encoding = CodePageUtil.codepageToEncoding(codepage, true); + byte[] htmlBodyBinary = htmlBodyBinaryChunk.getValue(); + return new String(htmlBodyBinary, encoding); + } catch (UnsupportedEncodingException e) { + logger.log(POILogger.WARN, "HTML body binary: Invalid codepage ID ", codepage, " set for the message via ", + MAPIProperty.INTERNET_CPID, ", ignoring"); + } + } + return htmlBodyBinaryChunk.getAs7bitString(); } return getStringFromChunk(mainChunks.getHtmlBodyChunkString()); } @@ -391,67 +405,86 @@ public class MAPIMessage extends POIReadOnlyDocument { *

Bug #49441 has more on why this is needed

*/ public void guess7BitEncoding() { - // First choice is a codepage property - for (MAPIProperty prop : new MAPIProperty[] { - MAPIProperty.MESSAGE_CODEPAGE, - MAPIProperty.INTERNET_CPID - }) { - List val = mainChunks.getProperties().get(prop); - if (val != null && val.size() > 0) { - int codepage = ((LongPropertyValue)val.get(0)).getValue(); - try { - String encoding = CodePageUtil.codepageToEncoding(codepage, true); - set7BitEncoding(encoding); - return; - } catch(UnsupportedEncodingException e) { - logger.log(POILogger.WARN, "Invalid codepage ID ", codepage, - " set for the message via ", prop, ", ignoring"); - } + String generalcodepage = null; + String htmlbodycodepage = null; + String bodycodepage = null; + // + // General codepage: Message codepage property. + // + List val = mainChunks.getProperties().get(MAPIProperty.MESSAGE_CODEPAGE); + if (val != null && val.size() > 0) { + int codepage = ((LongPropertyValue) val.get(0)).getValue(); + try { + String encoding = CodePageUtil.codepageToEncoding(codepage, true); + generalcodepage = encoding; + } catch (UnsupportedEncodingException e) { + logger.log(POILogger.WARN, "Invalid codepage ID ", codepage, " set for the message via ", + MAPIProperty.MESSAGE_CODEPAGE, ", ignoring"); + } + } + // + // General codepage fallback: Message locale ID property. + // + if (generalcodepage == null) { + val = mainChunks.getProperties().get(MAPIProperty.MESSAGE_LOCALE_ID); + if (val != null && val.size() > 0) { + int lcid = ((LongPropertyValue) val.get(0)).getValue(); + int codepage = LocaleUtil.getDefaultCodePageFromLCID(lcid); + try { + if (codepage != 0) { + String encoding = CodePageUtil.codepageToEncoding(codepage, true); + generalcodepage = encoding; + } + } catch (UnsupportedEncodingException e) { + logger.log(POILogger.WARN, "Invalid codepage ID ", codepage, "from locale ID", lcid, " set for the message via ", + MAPIProperty.MESSAGE_LOCALE_ID, ", ignoring"); } - } - - - // Second choice is a charset on a content type header - try { + } + } + // + // General codepage fallback: Charset on a content type header. + // + if (generalcodepage == null) { + try { String[] headers = getHeaders(); - if(headers != null && headers.length > 0) { - // Look for a content type with a charset - Pattern p = Pattern.compile("Content-Type:.*?charset=[\"']?([^;'\"]+)[\"']?", Pattern.CASE_INSENSITIVE); - - for(String header : headers) { - if(header.startsWith("Content-Type")) { - Matcher m = p.matcher(header); - if(m.matches()) { - // Found it! Tell all the string chunks - String charset = m.group(1); - - if (!charset.equalsIgnoreCase("utf-8")) { - set7BitEncoding(charset); - } - return; - } + if (headers != null && headers.length > 0) { + Pattern p = Pattern.compile("content-type:.*?charset=[\"']?([^;'\"]+)[\"']?", Pattern.CASE_INSENSITIVE); + for (String header : headers) { + if (header.toLowerCase().startsWith("content-type")) { + Matcher m = p.matcher(header); + if (m.matches()) { + String encoding = m.group(1); + generalcodepage = encoding; } - } + } + } } - } catch(ChunkNotFoundException e) {} - - // Nothing suitable in the headers, try HTML - try { - String html = getHtmlBody(); - if(html != null && html.length() > 0) { - // Look for a content type in the meta headers - Pattern p = Pattern.compile( - " 0) { + int codepage = ((LongPropertyValue) val.get(0)).getValue(); + try { + String encoding = CodePageUtil.codepageToEncoding(codepage, true); + htmlbodycodepage = encoding; + if (!encoding.equalsIgnoreCase("utf-8")) { + bodycodepage = encoding; } - } catch(ChunkNotFoundException e) {} - } + } catch (UnsupportedEncodingException e) { + logger.log(POILogger.WARN, "Invalid codepage ID ", codepage, " set for the message via ", + MAPIProperty.INTERNET_CPID, ", ignoring"); + } + } + // + // Apply encoding + // + set7BitEncoding(generalcodepage, htmlbodycodepage, bodycodepage); + } /** * Many messages store their strings as unicode, which is @@ -464,26 +497,41 @@ public class MAPIMessage extends POIReadOnlyDocument { * @see #guess7BitEncoding() */ public void set7BitEncoding(String charset) { + set7BitEncoding(charset, charset, charset); + } + public void set7BitEncoding(String generalcharset, String htmlbodycharset, String bodycharset) { for(Chunk c : mainChunks.getChunks()) { if(c instanceof StringChunk) { - ((StringChunk)c).set7BitEncoding(charset); + if (c.getChunkId() == MAPIProperty.BODY_HTML.id) { + if (htmlbodycharset != null) { + ((StringChunk)c).set7BitEncoding(htmlbodycharset); + } + } + else if (c.getChunkId() == MAPIProperty.BODY.id) { + if (bodycharset != null) { + ((StringChunk)c).set7BitEncoding(bodycharset); + } + } + else if (generalcharset != null) { + ((StringChunk)c).set7BitEncoding(generalcharset); + } } } - - if (nameIdChunks!=null) { - for(Chunk c : nameIdChunks.getChunks()) { - if(c instanceof StringChunk) { - ((StringChunk)c).set7BitEncoding(charset); - } - } - } - - for(RecipientChunks rc : recipientChunks) { - for(Chunk c : rc.getAll()) { - if(c instanceof StringChunk) { - ((StringChunk)c).set7BitEncoding(charset); - } - } + if (generalcharset != null) { + if (nameIdChunks!=null) { + for(Chunk c : nameIdChunks.getChunks()) { + if(c instanceof StringChunk) { + ((StringChunk)c).set7BitEncoding(generalcharset); + } + } + } + for(RecipientChunks rc : recipientChunks) { + for(Chunk c : rc.getAll()) { + if(c instanceof StringChunk) { + ((StringChunk)c).set7BitEncoding(generalcharset); + } + } + } } } diff --git a/src/scratchpad/src/org/apache/poi/hsmf/datatypes/MAPIProperty.java b/src/scratchpad/src/org/apache/poi/hsmf/datatypes/MAPIProperty.java index b7a0aed574..ba54f70101 100644 --- a/src/scratchpad/src/org/apache/poi/hsmf/datatypes/MAPIProperty.java +++ b/src/scratchpad/src/org/apache/poi/hsmf/datatypes/MAPIProperty.java @@ -512,6 +512,8 @@ public class MAPIProperty { new MAPIProperty(0x1a, ASCII_STRING, "MessageClass", "PR_MESSAGE_CLASS"); public static final MAPIProperty MESSAGE_CODEPAGE = new MAPIProperty(0x3ffd, Types.LONG, "MessageCodepage", "PR_MESSAGE_CODEPAGE"); + public static final MAPIProperty MESSAGE_LOCALE_ID = + new MAPIProperty(0x3ff1, Types.LONG, "MessageLocaleId", "PR_MESSAGE_LOCALE_ID"); public static final MAPIProperty MESSAGE_DELIVERY_ID = new MAPIProperty(0x1b, BINARY, "MessageDeliveryId", "PR_MESSAGE_DELIVERY_ID"); public static final MAPIProperty MESSAGE_DELIVERY_TIME = diff --git a/src/scratchpad/testcases/org/apache/poi/hsmf/AllHSMFTests.java b/src/scratchpad/testcases/org/apache/poi/hsmf/AllHSMFTests.java index 77dfb5e4cb..d0d057c28f 100644 --- a/src/scratchpad/testcases/org/apache/poi/hsmf/AllHSMFTests.java +++ b/src/scratchpad/testcases/org/apache/poi/hsmf/AllHSMFTests.java @@ -39,7 +39,8 @@ import org.junit.runners.Suite; TestPOIFSChunkParser.class, TestMessageSubmissionChunkY2KRead.class, TestMessageSubmissionChunk.class, - TestExtractEmbeddedMSG.class + TestExtractEmbeddedMSG.class, + Test7BitCodepage.class }) public class AllHSMFTests { } diff --git a/src/scratchpad/testcases/org/apache/poi/hsmf/Test7BitCodepage.java b/src/scratchpad/testcases/org/apache/poi/hsmf/Test7BitCodepage.java new file mode 100644 index 0000000000..0ec5eb7b56 --- /dev/null +++ b/src/scratchpad/testcases/org/apache/poi/hsmf/Test7BitCodepage.java @@ -0,0 +1,85 @@ +/* ==================================================================== + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +==================================================================== */ + +package org.apache.poi.hsmf; + +import java.io.IOException; + +import junit.framework.TestCase; + +import org.apache.poi.POIDataSamples; + +/** + * Tests to verify if code page for general properties like subject, + * text body and html body is evaluated correctly. + */ +public final class Test7BitCodepage extends TestCase { + private final MAPIMessage ascii_cp1251_lcid1049; + private final MAPIMessage ascii_utf_8_cp1252_lcid1031; + private final MAPIMessage ascii_utf_8_cp1252_lcid1031_html; + private final MAPIMessage htmlbodybinary_cp1251; + private final MAPIMessage htmlbodybinary_utf_8; + + /** + * Initialize this test, load up the messages. + * @throws Exception + */ + public Test7BitCodepage() throws IOException { + POIDataSamples samples = POIDataSamples.getHSMFInstance(); + ascii_cp1251_lcid1049 = new MAPIMessage(samples.openResourceAsStream("ASCII_CP1251_LCID1049.msg")); + ascii_utf_8_cp1252_lcid1031 = new MAPIMessage(samples.openResourceAsStream("ASCII_UTF-8_CP1252_LCID1031.msg")); + ascii_utf_8_cp1252_lcid1031_html = new MAPIMessage(samples.openResourceAsStream("ASCII_UTF-8_CP1252_LCID1031_HTML.msg")); + htmlbodybinary_cp1251 = new MAPIMessage(samples.openResourceAsStream("HTMLBodyBinary_CP1251.msg")); + htmlbodybinary_utf_8 = new MAPIMessage(samples.openResourceAsStream("HTMLBodyBinary_UTF-8.msg")); + } + + /** + * Evaluate encoding and check if the subject, text body and html body is decoded correctly. + */ + public void test7BitEncoding() throws Exception { + ascii_cp1251_lcid1049.guess7BitEncoding(); + ascii_cp1251_lcid1049.setReturnNullOnMissingChunk(true); + ascii_utf_8_cp1252_lcid1031.guess7BitEncoding(); + ascii_utf_8_cp1252_lcid1031.setReturnNullOnMissingChunk(true); + ascii_utf_8_cp1252_lcid1031_html.guess7BitEncoding(); + ascii_utf_8_cp1252_lcid1031_html.setReturnNullOnMissingChunk(true); + htmlbodybinary_cp1251.guess7BitEncoding(); + htmlbodybinary_cp1251.setReturnNullOnMissingChunk(true); + htmlbodybinary_utf_8.guess7BitEncoding(); + htmlbodybinary_utf_8.setReturnNullOnMissingChunk(true); + + assertEquals("Subject автоматически Subject", ascii_cp1251_lcid1049.getSubject()); + assertEquals("Body автоматически Body", ascii_cp1251_lcid1049.getTextBody()); + assertEquals("HTML автоматически", ascii_cp1251_lcid1049.getHtmlBody()); + + assertEquals("Subject öäü Subject", ascii_utf_8_cp1252_lcid1031.getSubject()); + assertEquals("Body öäü Body", ascii_utf_8_cp1252_lcid1031.getTextBody()); + assertNull(ascii_utf_8_cp1252_lcid1031.getHtmlBody()); + + assertEquals("Subject öäü Subject", ascii_utf_8_cp1252_lcid1031_html.getSubject()); + assertEquals("Body öäü Body", ascii_utf_8_cp1252_lcid1031_html.getTextBody()); + assertEquals("HTML öäü", ascii_utf_8_cp1252_lcid1031_html.getHtmlBody()); + + assertEquals("Subject öäü Subject", htmlbodybinary_cp1251.getSubject()); + assertNull(htmlbodybinary_cp1251.getTextBody()); + assertEquals("HTML автоматически", htmlbodybinary_cp1251.getHtmlBody()); + + assertEquals("Subject öäü Subject", htmlbodybinary_utf_8.getSubject()); + assertNull(htmlbodybinary_utf_8.getTextBody()); + assertEquals("HTML öäü", htmlbodybinary_utf_8.getHtmlBody()); + } +} diff --git a/test-data/hsmf/ASCII_CP1251_LCID1049.msg b/test-data/hsmf/ASCII_CP1251_LCID1049.msg new file mode 100644 index 0000000000000000000000000000000000000000..2936aa2593852f878377d8708b9f5ed2c079f7a5 GIT binary patch literal 3584 zcmeHJTWb?R6#mk+)*?!!;H6?%`y_&Gx2^c#n&PFPV6|vp)Iv#`#a2vKlC7mbr~jb- z68j4zDnbzv+83W}Jm2hW*oK|$&^JlWFqd=Y%zQIvF6ZoizR=qKH1-u;bQcZ|+rv1W zk{42Ki~s||Q+C*Hx7o$EH2d@)S>OdiNfBJkqb_v=TRci@x`@F-vfW|JQ4aNZd4yIZ z)lo(b0oG9!?xVCCXbKlfkB13N#j2U*4HP=~YTisOvyk{rY@i}#gtr=RMKGDBC3~ti z&2SR~9emosq?eW_?}f5_{242Jn(~z7li+jQUm3_XF%JsQx=+1f0sh&%Ki%h_@V#M$ z-rc1feJ(jTHyg(Q@rUhT=)8^z&TwE6cT|i})U%w$Rn^aXMRM?Ej{4vE(FheWIzs-9 z;>eGw@rmznu&?y5bo8P0FKhkolX|&Pc^9lkZtL5@?_aHhy+1$pf9|-YNUay9m&0F5|ZEtzJTw(iwhIa!YIh-vJyRzr5s&_OOe%BD&_m1Su6jO2GhrsTk&s66o~&p z{Cy}T{eC2VKM-GR8wX6C!2c3^M=v06|1)y883lL}8<~jypAR9ifk`hdXPk0P3rzTo zIy}(-ACFIL`fb)}98;T5>$UDQ;m5f5t?<)vzX>xf=lfTmhtwu~?mtP#*`MjOZWjzzHlVHVW9sjqoNqJW6d^MIz>z>%S{u3m7tZuP##+Rlz;OH5#}? zU8Fl{HPI5uk`WU{ltLC;M{I!4b;l**x6npSzBVotUR5wzHl>;_4_bEv@c};fAxO{P z@A(V+!tu*m`1Hd$sgvMyURNgapI8$jlSecgJ3u$&{qfG<cms`tLe# z2_!2>X4E^sKsk>sH%C~ZAhd`zcU(R1qW`r+{Uje6Ap@_j{MqfxdF0&rslF)dAxr&? zl1H7;X}fmSIJXPmA3p`v9yMiX{NVVocx2g)q0TXcJ*(bv!PyvNo~NqJoEi&{KNYR` z@UmRW)@&dZWKg}|4A97 zGw9mSJY`ue7=~|RR|}uMMr@K=Hk8p7UFQ`ZgwL$UE2HcuF7pOmeCnp9Ml}5$_ysmr B26+Gg literal 0 HcmV?d00001 diff --git a/test-data/hsmf/ASCII_UTF-8_CP1252_LCID1031_HTML.msg b/test-data/hsmf/ASCII_UTF-8_CP1252_LCID1031_HTML.msg new file mode 100644 index 0000000000000000000000000000000000000000..a51844b12f4ba0f9067389cc6db4b1b6943baf58 GIT binary patch literal 3584 zcmeHJ%}N|W5dPNDtRjlR;K6`QJV_wxM!|sNhL{KuTqB8>fFQd%M#Xhxw^#6S1Ro%| z*3}V*uz7p0dkEqron=rP-$cXb0XSkfh*Z0sB%{aKxibO+)C)M9b|-Y2{Flmq%z_ z(mskP!^aLbg*%j11y$h!>B(UnlcDOZu)Ic&(FWEY2WKGy(e$pyn)gb<=28CbYGKK(f32U?PrV5(@g}s{y?~AK z;!0tBIZ%ExwzQd-8R^Bd(ZnFh{%1Os|B1tzV@j;}w?qoWKM;Q(NJ+o1#P9p!i_^vd zlTG+vVt+OhNLv4t+-*hyIf;#IME}o+5bc3YFD_@CGH3)gd`2BP(Es0!Pi*>a)^QwL wn@?-WblUJk+*hga<8i+YGcM=**Pn;fHhiu>N_Xu)$ID7iyMF5SNp2Hx1&1zBLjV8( literal 0 HcmV?d00001 diff --git a/test-data/hsmf/HTMLBodyBinary_CP1251.msg b/test-data/hsmf/HTMLBodyBinary_CP1251.msg new file mode 100644 index 0000000000000000000000000000000000000000..13967a31e7fe66ca4d07af9083c6e35b26981eec GIT binary patch literal 3584 zcmeHJ-Add*6#mxKZlzc(iWe3n^-2VPqJo0ymSP19YDK*%2+OXfRdJ2%&I-Ow`v`r5 zzCrs0WvvQQil7(mjdeX=b|RygnDwGi+!HeAZ)Uz^X3m`R?d#LZr_Wt`xJDDOa9(Y} zL&KgjWV8WIqSJO>tybxxEMkNHqZL?yFOPzaDQpQZVwYQ)m^#s%3V+u;D{&K8;{2iy zQx;jl&7lAf>sS?SN2DT3qWR)+Fo2<$$$F8x#OJ*A67fqYBQLGO2aT5%Og_W%IsuQG zw+~I1_*{pmex^N)7uw?SM-t&P4j;&S7krNUD+BpWe49llZ_xc%0fr&ZPq_c&jN`@` zxF7p3TOCUlT1RX5dhK?#kba`?`9~bsjj^Cc=+Lt~#;+4Zn;g{+t`0m>$mN~VmZIzttk8V!I`b)+lE~6znPyMnDdo)#aM^pK7ajo za_Qatl)a`3n;Ey(7Ce=+SJraHlBZrQ_mtK9{_*9WjGOmYcQeyVvom(($LaB}%IVRc z!;_y!?qJ|>2ZPadpeM^WdMR&Y@%d*oQT{PM_(qUxM&2!}&$Kg=+O;7L-@&Uy_>8r14@{gW lnV}a1uka{*Ry_`q)gQ!W-9R3NY1-L0jBnH4kpKM(Tmb6g81(=E literal 0 HcmV?d00001 diff --git a/test-data/hsmf/HTMLBodyBinary_UTF-8.msg b/test-data/hsmf/HTMLBodyBinary_UTF-8.msg new file mode 100644 index 0000000000000000000000000000000000000000..34bdac82f3bf02473458da6a5abf114ad2a06f31 GIT binary patch literal 3584 zcmeHJPfHs?6#vDoO{KJ06b}|;o0ACs*&49m8YwleE^x|~?aQF;&Xch); zntga2(x*Z^`hgyisk>=5o9rSlY?uC{9ry%S9t9J5>_}V19*;6H4WTy`{BHQJ#7$sG z=Wl(PiYN+h2^Bc_iVcx=g{z__k}EwHCNLE<*(g#ie9qf05x<5y%JNnCqVbA?$>+7a zme)qjJBFSXKKCKYF02pyg}QkB=|uSS!$qXNzC40qZ+xFGN;Dnlr+q=o_npE@z5LBtSLWrPD^v6L4Y}3-VR>=lqpO@-$~FZ1 z)xy%d)z8a$b4yjWbGHAjII3i>ZS}ZR%~~dBm)(uM-22tVg4y`lIB5K` zCq0cl=?!ZGwq>ffspBR-|6~*8pVnf8NwN^emcWkq-zDF>@@3vvlJ_0S#YyXcsSffl zv47|V#Qp!Y+-+6?mefX3#QdL@PC5hkA<8bSrzZ_)MI63`w~6rSYs4mnaiVy-qUXHQ cM&YySv5>6(ATH~Mr1-QA{9D*A{lgvj3vqiHR{#J2 literal 0 HcmV?d00001 -- 2.39.5