source.dussan.org Git - poi.git/commitdiff
Bug 47304: use fixed encoding when extracting text in WordDocument
author Dominik Stadler <centic@apache.org>
Sun, 22 Mar 2015 13:33:43 +0000 (13:33 +0000)
committer Dominik Stadler <centic@apache.org>
Sun, 22 Mar 2015 13:33:43 +0000 (13:33 +0000)
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1668367 13f79535-47bb-0310-9956-ffa450edef68

src/integrationtest/org/apache/poi/stress/HWPFFileHandler.java
src/scratchpad/src/org/apache/poi/hdf/extractor/WordDocument.java
src/scratchpad/testcases/org/apache/poi/hdf/extractor/TestWordDocument.java
test-data/document/47304.doc [new file with mode: 0644]

index 1b6d4646c70d7ffc8a7b2ed37e304e1f86151c6e..5f24337fbb722a33c150e6723a3a122c6f16e5fb 100644 (file)
@@ -18,12 +18,21 @@ package org.apache.poi.stress;
 
 import static org.junit.Assert.assertNotNull;
 
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.File;
 import java.io.FileInputStream;
+import java.io.IOException;
 import java.io.InputStream;
+import java.io.PrintWriter;
+import java.io.StringWriter;
 
+import org.apache.poi.hdf.extractor.WordDocument;
 import org.apache.poi.hwpf.HWPFDocument;
+import org.apache.poi.hwpf.extractor.WordExtractor;
 import org.junit.Test;
 
+@SuppressWarnings("deprecation")
 public class HWPFFileHandler extends POIFSFileHandler {
        @Override
        public void handleFile(InputStream stream) throws Exception {
@@ -33,16 +42,53 @@ public class HWPFFileHandler extends POIFSFileHandler {
                assertNotNull(doc.getEndnotes());
                
                handlePOIDocument(doc);
+               
+               // fails for many documents, but is deprecated anyway... 
+               // handleWordDocument(doc);
+       }
+
+       protected void handleWordDocument(HWPFDocument doc) throws IOException {
+               ByteArrayOutputStream outStream = new ByteArrayOutputStream();
+               doc.write(outStream);
+
+               WordDocument wordDoc = new WordDocument(new ByteArrayInputStream(outStream.toByteArray()));
+        
+        StringWriter docTextWriter = new StringWriter();
+        PrintWriter out = new PrintWriter(docTextWriter);
+        try {
+               wordDoc.writeAllText(out);
+        } finally {
+               out.close();
+        }
+        docTextWriter.close();
        }
 
+
+
        // a test-case to test this locally without executing the full TestAllFiles
        @Test
        public void test() throws Exception {
-               InputStream stream = new FileInputStream("test-data/document/HeaderFooterUnicode.doc");
+               File file = new File("test-data/document/47304.doc");
+
+               InputStream stream = new FileInputStream(file);
                try {
                        handleFile(stream);
                } finally {
                        stream.close();
                }
+               
+               handleExtracting(file);
+               
+               stream = new FileInputStream(file);
+               try {
+                       WordExtractor extractor = new WordExtractor(stream);
+                       try {
+                               assertNotNull(extractor.getText());
+                       } finally {
+                               extractor.close();
+                       }
+               } finally {
+                       stream.close();
+               }
        }
 }
\ No newline at end of file
index 929de311b728c4e6d880042602f9da76ebc24fa9..ff5330032172e97f03da004178ebd9934697ce45 100644 (file)
@@ -177,7 +177,7 @@ public final class WordDocument {
       }
       else
       {
-       String sText = new String(_header, start, end-start);
+       String sText = new String(_header, start, end-start, "windows-1252");
        out.write(sText);
       }
     }
index 1cf29f4376290eb1f7e4a582d35ef51431a58281..f0941674ffe20ea597ad4b0429b42b29274290c8 100644 (file)
 
 package org.apache.poi.hdf.extractor;
 
+import static org.junit.Assert.*;
+
+import java.io.IOException;
+import java.io.PrintWriter;
+import java.io.StringWriter;
+
+import org.apache.poi.hwpf.HWPFDocument;
+import org.apache.poi.hwpf.HWPFTestDataSamples;
+import org.apache.poi.hwpf.extractor.WordExtractor;
 import org.junit.Test;
 
 
@@ -31,4 +40,31 @@ public class TestWordDocument {
         //WordDocument.main(new String[] {"test-data/document/Word6.doc", "/tmp/test.doc"});
         WordDocument.main(new String[] {"test-data/document/53446.doc", "/tmp/test.doc"});
     }
+
+    @SuppressWarnings("deprecation")
+       @Test
+    public void test47304() throws IOException {
+       HWPFDocument doc = HWPFTestDataSamples.openSampleFile("47304.doc");
+       assertNotNull(doc);
+       
+       WordExtractor extractor = new WordExtractor(doc);
+        String text = extractor.getText();
+        //System.out.println(text);
+        assertTrue("Had: " + text, text.contains("Just  a \u201Ctest\u201D"));
+        extractor.close();
+        
+               WordDocument wordDoc = new WordDocument("test-data/document/47304.doc");
+        
+        StringWriter docTextWriter = new StringWriter();
+        PrintWriter out = new PrintWriter(docTextWriter);
+        try {
+               wordDoc.writeAllText(out);
+        } finally {
+               out.close();
+        }
+        docTextWriter.close();
+
+        //System.out.println(docTextWriter.toString());
+        assertTrue("Had: " + docTextWriter.toString(), docTextWriter.toString().contains("Just  a \u201Ctest\u201D"));
+    }
 }
diff --git a/test-data/document/47304.doc b/test-data/document/47304.doc
new file mode 100644 (file)
index 0000000..d59d8d7
Binary files /dev/null and b/test-data/document/47304.doc differ