From: PJ Fanning Date: Tue, 30 Aug 2022 19:46:32 +0000 (+0000) Subject: [TIKA-3388] issue with non-ascii chars in file name of embedded OLE object X-Git-Tag: REL_5_2_3~17 X-Git-Url: https://source.dussan.org/?a=commitdiff_plain;h=d00be6e7db6fc1557e0ab2f28d0b57db81c3c4ba;p=poi.git [TIKA-3388] issue with non-ascii chars in file name of embedded OLE object git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1903780 13f79535-47bb-0310-9956-ffa450edef68 --- diff --git a/poi-ooxml/src/test/java/org/apache/poi/xwpf/TestXWPFBugs.java b/poi-ooxml/src/test/java/org/apache/poi/xwpf/TestXWPFBugs.java index 7048bc13fe..cad6cd12f8 100644 --- a/poi-ooxml/src/test/java/org/apache/poi/xwpf/TestXWPFBugs.java +++ b/poi-ooxml/src/test/java/org/apache/poi/xwpf/TestXWPFBugs.java @@ -32,10 +32,14 @@ import org.apache.commons.compress.archivers.zip.ZipArchiveEntry; import org.apache.commons.compress.archivers.zip.ZipFile; import org.apache.poi.POIDataSamples; import org.apache.poi.openxml4j.opc.OPCPackage; +import org.apache.poi.openxml4j.opc.PackagePart; +import org.apache.poi.openxml4j.opc.PackagePartName; +import org.apache.poi.openxml4j.opc.PackagingURIHelper; import org.apache.poi.poifs.crypt.CipherAlgorithm; import org.apache.poi.poifs.crypt.Decryptor; import org.apache.poi.poifs.crypt.EncryptionInfo; import org.apache.poi.poifs.crypt.HashAlgorithm; +import org.apache.poi.poifs.filesystem.Ole10Native; import org.apache.poi.poifs.filesystem.POIFSFileSystem; import org.apache.poi.xwpf.extractor.XWPFWordExtractor; import org.apache.poi.xwpf.usermodel.XWPFDocument; @@ -159,4 +163,22 @@ class TestXWPFBugs { assertEquals(731, document.getParagraphs().size()); } } + + @Test + void tika3388() throws Exception { + try (XWPFDocument document = new XWPFDocument(samples.openResourceAsStream("tika-3388.docx"))) { + assertEquals(1, document.getParagraphs().size()); + PackagePartName partName = PackagingURIHelper.createPartName("/word/embeddings/oleObject1.bin"); + PackagePart part = document.getPackage().getPart(partName); + assertNotNull(part); + try ( + InputStream partStream = part.getInputStream(); + POIFSFileSystem poifs = new POIFSFileSystem(partStream) + ) { + Ole10Native ole = Ole10Native.createFromEmbeddedOleObject(poifs); + assertEquals("C:\\Users\\ross\\AppData\\Local\\Microsoft\\Windows\\INetCache\\Content.Word\\約翰的測試文件\uD83D\uDD96.msg", + ole.getFileName()); + } + } + } } diff --git a/poi/src/main/java/org/apache/poi/poifs/filesystem/Ole10Native.java b/poi/src/main/java/org/apache/poi/poifs/filesystem/Ole10Native.java index 9b1545e3b0..681f4bd8ff 100644 --- a/poi/src/main/java/org/apache/poi/poifs/filesystem/Ole10Native.java +++ b/poi/src/main/java/org/apache/poi/poifs/filesystem/Ole10Native.java @@ -48,7 +48,7 @@ public class Ole10Native { public static final String OLE10_NATIVE = "\u0001Ole10Native"; - private static final Charset ISO1 = StandardCharsets.ISO_8859_1; + private static final Charset UTF8 = StandardCharsets.UTF_8; // arbitrarily selected; may need to increase private static final int DEFAULT_MAX_RECORD_LENGTH = 100_000_000; private static int MAX_RECORD_LENGTH = DEFAULT_MAX_RECORD_LENGTH; @@ -407,14 +407,14 @@ public class Ole10Native { // total size, will be determined later .. leos.writeShort(getFlags1()); - leos.write(getLabel().getBytes(ISO1)); + leos.write(getLabel().getBytes(UTF8)); leos.write(0); - leos.write(getFileName().getBytes(ISO1)); + leos.write(getFileName().getBytes(UTF8)); leos.write(0); leos.writeShort(getFlags2()); leos.writeShort(getUnknown1()); leos.writeInt(getCommand().length() + 1); - leos.write(getCommand().getBytes(ISO1)); + leos.write(getCommand().getBytes(UTF8)); leos.write(0); leos.writeInt(getDataSize()); leos.write(getDataBuffer()); diff --git a/poi/src/main/java/org/apache/poi/util/StringUtil.java b/poi/src/main/java/org/apache/poi/util/StringUtil.java index 190d8dc99c..69e45c7d50 100644 --- a/poi/src/main/java/org/apache/poi/util/StringUtil.java +++ b/poi/src/main/java/org/apache/poi/util/StringUtil.java @@ -135,13 +135,13 @@ public final class StringUtil { final int offset, final int len) { int len_to_use = Math.min(len, string.length - offset); - return new String(string, offset, len_to_use, ISO_8859_1); + return new String(string, offset, len_to_use, UTF8); } public static String readCompressedUnicode(LittleEndianInput in, int nChars) { byte[] buf = IOUtils.safelyAllocate(nChars, MAX_RECORD_LENGTH); in.readFully(buf); - return new String(buf, ISO_8859_1); + return new String(buf, UTF8); } /** diff --git a/test-data/document/tika-3388.docx b/test-data/document/tika-3388.docx new file mode 100644 index 0000000000..b884ea1ac6 Binary files /dev/null and b/test-data/document/tika-3388.docx differ