import static org.junit.Assert.assertNotNull;
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.File;
import java.io.FileInputStream;
+import java.io.IOException;
import java.io.InputStream;
+import java.io.PrintWriter;
+import java.io.StringWriter;
+import org.apache.poi.hdf.extractor.WordDocument;
import org.apache.poi.hwpf.HWPFDocument;
+import org.apache.poi.hwpf.extractor.WordExtractor;
import org.junit.Test;
+@SuppressWarnings("deprecation")
public class HWPFFileHandler extends POIFSFileHandler {
/**
 * Handles one Word file: asserts that the document's endnotes are non-null and
 * then hands the document to {@code handlePOIDocument}.
 * NOTE(review): the statements that build {@code doc} from {@code stream} are
 * not part of this excerpt.
 */
@Override
public void handleFile(InputStream stream) throws Exception {
assertNotNull(doc.getEndnotes());
handlePOIDocument(doc);
+
+ // Intentionally disabled: handleWordDocument fails for many documents, and the code path is deprecated anyway.
+ // handleWordDocument(doc);
+ }
+
+ /**
+  * Serializes the given document into memory, re-reads those bytes with the
+  * HDF {@link WordDocument} and asks it to write all of its text.
+  * The extracted text is discarded; the method only exercises the code path.
+  *
+  * @param doc the document to serialize and re-read
+  * @throws IOException declared for the write/read calls made here
+  */
+ protected void handleWordDocument(HWPFDocument doc) throws IOException {
+     // write the document into an in-memory buffer
+     ByteArrayOutputStream serialized = new ByteArrayOutputStream();
+     doc.write(serialized);
+     byte[] docBytes = serialized.toByteArray();
+
+     WordDocument legacyDoc = new WordDocument(new ByteArrayInputStream(docBytes));
+
+     StringWriter textSink = new StringWriter();
+     PrintWriter printer = new PrintWriter(textSink);
+     try {
+         legacyDoc.writeAllText(printer);
+     } finally {
+         printer.close();
+     }
+     textSink.close();
}
+
+
// A test case for running this handler locally against a single sample file,
// without executing the full TestAllFiles run.
// The sample path is relative, so it is resolved against the current working directory.
@Test
public void test() throws Exception {
- InputStream stream = new FileInputStream("test-data/document/HeaderFooterUnicode.doc");
+ File file = new File("test-data/document/47304.doc");
+
+ InputStream stream = new FileInputStream(file);
try {
handleFile(stream);
} finally {
stream.close();
}
+
+ handleExtracting(file); // not defined in this excerpt -- presumably inherited; verify
+
+ stream = new FileInputStream(file); // reopen the file: the first stream was closed above
+ try {
+ WordExtractor extractor = new WordExtractor(stream);
+ try {
+ assertNotNull(extractor.getText());
+ } finally {
+ extractor.close();
+ }
+ } finally {
+ stream.close();
+ }
}
}
\ No newline at end of file
package org.apache.poi.hdf.extractor;
+import static org.junit.Assert.*;
+
+import java.io.IOException;
+import java.io.PrintWriter;
+import java.io.StringWriter;
+
+import org.apache.poi.hwpf.HWPFDocument;
+import org.apache.poi.hwpf.HWPFTestDataSamples;
+import org.apache.poi.hwpf.extractor.WordExtractor;
import org.junit.Test;
//WordDocument.main(new String[] {"test-data/document/Word6.doc", "/tmp/test.doc"});
WordDocument.main(new String[] {"test-data/document/53446.doc", "/tmp/test.doc"});
}
+
+ /**
+  * Checks that the sample file 47304.doc yields the expected curly-quoted
+  * phrase both through {@link WordExtractor} and through the HDF
+  * {@link WordDocument}.
+  *
+  * @throws IOException declared for the sample-file and extraction calls
+  */
+ @SuppressWarnings("deprecation")
+ @Test
+ public void test47304() throws IOException {
+     HWPFDocument doc = HWPFTestDataSamples.openSampleFile("47304.doc");
+     assertNotNull(doc);
+
+     WordExtractor extractor = new WordExtractor(doc);
+     try {
+         String text = extractor.getText();
+         assertTrue("Had: " + text, text.contains("Just a \u201Ctest\u201D"));
+     } finally {
+         // close the extractor even when getText() throws or the assertion fails
+         extractor.close();
+     }
+
+     WordDocument wordDoc = new WordDocument("test-data/document/47304.doc");
+
+     StringWriter docTextWriter = new StringWriter();
+     PrintWriter out = new PrintWriter(docTextWriter);
+     try {
+         wordDoc.writeAllText(out);
+     } finally {
+         // closing the PrintWriter also closes the wrapped StringWriter
+         out.close();
+     }
+
+     // read the collected text once and reuse it for message and check
+     String hdfText = docTextWriter.toString();
+     assertTrue("Had: " + hdfText, hdfText.contains("Just a \u201Ctest\u201D"));
+ }
}