From: Dominik Stadler Date: Sun, 22 Mar 2015 13:33:43 +0000 (+0000) Subject: Bug 47304: use fixed encoding when extracting text in WordDocument X-Git-Tag: REL_3_12_FINAL~68 X-Git-Url: https://source.dussan.org/?a=commitdiff_plain;h=0850e7d846a71cbd90f919f841917a1cfedad006;p=poi.git Bug 47304: use fixed encoding when extracting text in WordDocument git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1668367 13f79535-47bb-0310-9956-ffa450edef68 --- diff --git a/src/integrationtest/org/apache/poi/stress/HWPFFileHandler.java b/src/integrationtest/org/apache/poi/stress/HWPFFileHandler.java index 1b6d4646c7..5f24337fbb 100644 --- a/src/integrationtest/org/apache/poi/stress/HWPFFileHandler.java +++ b/src/integrationtest/org/apache/poi/stress/HWPFFileHandler.java @@ -18,12 +18,21 @@ package org.apache.poi.stress; import static org.junit.Assert.assertNotNull; +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.File; import java.io.FileInputStream; +import java.io.IOException; import java.io.InputStream; +import java.io.PrintWriter; +import java.io.StringWriter; +import org.apache.poi.hdf.extractor.WordDocument; import org.apache.poi.hwpf.HWPFDocument; +import org.apache.poi.hwpf.extractor.WordExtractor; import org.junit.Test; +@SuppressWarnings("deprecation") public class HWPFFileHandler extends POIFSFileHandler { @Override public void handleFile(InputStream stream) throws Exception { @@ -33,16 +42,53 @@ public class HWPFFileHandler extends POIFSFileHandler { assertNotNull(doc.getEndnotes()); handlePOIDocument(doc); + + // fails for many documents, but is deprecated anyway... + // handleWordDocument(doc); + } + + protected void handleWordDocument(HWPFDocument doc) throws IOException { + ByteArrayOutputStream outStream = new ByteArrayOutputStream(); + doc.write(outStream); + + WordDocument wordDoc = new WordDocument(new ByteArrayInputStream(outStream.toByteArray())); + + StringWriter docTextWriter = new StringWriter(); + PrintWriter out = new PrintWriter(docTextWriter); + try { + wordDoc.writeAllText(out); + } finally { + out.close(); + } + docTextWriter.close(); } + + // a test-case to test this locally without executing the full TestAllFiles @Test public void test() throws Exception { - InputStream stream = new FileInputStream("test-data/document/HeaderFooterUnicode.doc"); + File file = new File("test-data/document/47304.doc"); + + InputStream stream = new FileInputStream(file); try { handleFile(stream); } finally { stream.close(); } + + handleExtracting(file); + + stream = new FileInputStream(file); + try { + WordExtractor extractor = new WordExtractor(stream); + try { + assertNotNull(extractor.getText()); + } finally { + extractor.close(); + } + } finally { + stream.close(); + } } } \ No newline at end of file diff --git a/src/scratchpad/src/org/apache/poi/hdf/extractor/WordDocument.java b/src/scratchpad/src/org/apache/poi/hdf/extractor/WordDocument.java index 929de311b7..ff53300321 100644 --- a/src/scratchpad/src/org/apache/poi/hdf/extractor/WordDocument.java +++ b/src/scratchpad/src/org/apache/poi/hdf/extractor/WordDocument.java @@ -177,7 +177,7 @@ public final class WordDocument { } else { - String sText = new String(_header, start, end-start); + String sText = new String(_header, start, end-start, "windows-1252"); out.write(sText); } } diff --git a/src/scratchpad/testcases/org/apache/poi/hdf/extractor/TestWordDocument.java b/src/scratchpad/testcases/org/apache/poi/hdf/extractor/TestWordDocument.java index 1cf29f4376..f0941674ff 100644 --- a/src/scratchpad/testcases/org/apache/poi/hdf/extractor/TestWordDocument.java +++ b/src/scratchpad/testcases/org/apache/poi/hdf/extractor/TestWordDocument.java @@ -17,6 +17,15 @@ package org.apache.poi.hdf.extractor; +import static org.junit.Assert.*; + +import java.io.IOException; +import java.io.PrintWriter; +import java.io.StringWriter; + +import org.apache.poi.hwpf.HWPFDocument; +import org.apache.poi.hwpf.HWPFTestDataSamples; +import org.apache.poi.hwpf.extractor.WordExtractor; import org.junit.Test; @@ -31,4 +40,31 @@ public class TestWordDocument { //WordDocument.main(new String[] {"test-data/document/Word6.doc", "/tmp/test.doc"}); WordDocument.main(new String[] {"test-data/document/53446.doc", "/tmp/test.doc"}); } + + @SuppressWarnings("deprecation") + @Test + public void test47304() throws IOException { + HWPFDocument doc = HWPFTestDataSamples.openSampleFile("47304.doc"); + assertNotNull(doc); + + WordExtractor extractor = new WordExtractor(doc); + String text = extractor.getText(); + //System.out.println(text); + assertTrue("Had: " + text, text.contains("Just a \u201Ctest\u201D")); + extractor.close(); + + WordDocument wordDoc = new WordDocument("test-data/document/47304.doc"); + + StringWriter docTextWriter = new StringWriter(); + PrintWriter out = new PrintWriter(docTextWriter); + try { + wordDoc.writeAllText(out); + } finally { + out.close(); + } + docTextWriter.close(); + + //System.out.println(docTextWriter.toString()); + assertTrue("Had: " + docTextWriter.toString(), docTextWriter.toString().contains("Just a \u201Ctest\u201D")); + } } diff --git a/test-data/document/47304.doc b/test-data/document/47304.doc new file mode 100644 index 0000000000..d59d8d7ee1 Binary files /dev/null and b/test-data/document/47304.doc differ