source.dussan.org Git - poi.git/commitdiff
Enable Word6Extractor in ExtractorFactory
author: Nick Burch <nick@apache.org>
Wed, 30 Jun 2010 16:08:10 +0000 (16:08 +0000)
committer: Nick Burch <nick@apache.org>
Wed, 30 Jun 2010 16:08:10 +0000 (16:08 +0000)
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@959360 13f79535-47bb-0310-9956-ffa450edef68

src/documentation/content/xdocs/status.xml
src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java
src/ooxml/testcases/org/apache/poi/extractor/TestExtractorFactory.java
src/scratchpad/src/org/apache/poi/hwpf/HWPFDocument.java

index e3cbd0b1a1d3693dbee2be09fdf6e9742200e629..f55a8537106a96c8410eb017fca99b5b5c394298 100644 (file)
@@ -34,6 +34,7 @@
 
     <changes>
         <release version="3.7-beta2" date="2010-??-??">
+           <action dev="POI-DEVELOPERS" type="add">Text Extraction support for older Word 6 and Word 95 files via HWPF</action>
            <action dev="POI-DEVELOPERS" type="add">49508 - Allow the addition of paragraphs to XWPF Table Cells</action>
            <action dev="POI-DEVELOPERS" type="fix">49446 - Don't consider 17.16.23 field codes as properly part of the paragraph's text</action>
            <action dev="POI-DEVELOPERS" type="fix">XSLFSlideShow shouldn't break on .thmx (theme) files. Support for them is still very limited though</action>
index 6a2379e19a012a019394b6049deac6dc4ab622ab..2601f77aeae833baabd9e06ffb3089a736e75312 100644 (file)
@@ -38,6 +38,8 @@ import org.apache.poi.hsmf.datatypes.AttachmentChunks;
 import org.apache.poi.hsmf.extractor.OutlookTextExtactor;
 import org.apache.poi.hssf.extractor.EventBasedExcelExtractor;
 import org.apache.poi.hssf.extractor.ExcelExtractor;
+import org.apache.poi.hwpf.OldWordFileFormatException;
+import org.apache.poi.hwpf.extractor.Word6Extractor;
 import org.apache.poi.hwpf.extractor.WordExtractor;
 import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
 import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
@@ -218,7 +220,12 @@ public class ExtractorFactory {
                           }
                        }
                        if(entry.getName().equals("WordDocument")) {
-                               return new WordExtractor(poifsDir, fs);
+                           // Old or new style word document?
+                           try {
+                               return new WordExtractor(poifsDir, fs);
+                           } catch(OldWordFileFormatException e) {
+                               return new Word6Extractor(poifsDir, fs);
+                           }
                        }
                        if(entry.getName().equals("PowerPoint Document")) {
                                return new PowerPointExtractor(poifsDir, fs);
@@ -230,12 +237,12 @@ public class ExtractorFactory {
             return new PublisherTextExtractor(poifsDir, fs);
          }
                        if(
-                             entry.getName().equals("__substg1.0_1000001E") ||
-               entry.getName().equals("__substg1.0_1000001F") ||
-                             entry.getName().equals("__substg1.0_0047001E") ||
-               entry.getName().equals("__substg1.0_0047001F") ||
-                             entry.getName().equals("__substg1.0_0037001E") ||
-               entry.getName().equals("__substg1.0_0037001F")
+                entry.getName().equals("__substg1.0_1000001E") ||
+                entry.getName().equals("__substg1.0_1000001F") ||
+                entry.getName().equals("__substg1.0_0047001E") ||
+                entry.getName().equals("__substg1.0_0047001F") ||
+                entry.getName().equals("__substg1.0_0037001E") ||
+                entry.getName().equals("__substg1.0_0037001F")
                        ) {
                           return new OutlookTextExtactor(poifsDir, fs);
                        }
index f4f178f2272ebe49d38ca3a9ac7cd6c3a92eb998..0e4edeef9fa5aa01e6dd255616e8cd5b890f2d39 100644 (file)
@@ -29,6 +29,7 @@ import org.apache.poi.hslf.extractor.PowerPointExtractor;
 import org.apache.poi.hsmf.extractor.OutlookTextExtactor;
 import org.apache.poi.hssf.extractor.EventBasedExcelExtractor;
 import org.apache.poi.hssf.extractor.ExcelExtractor;
+import org.apache.poi.hwpf.extractor.Word6Extractor;
 import org.apache.poi.hwpf.extractor.WordExtractor;
 import org.apache.poi.poifs.filesystem.POIFSFileSystem;
 import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor;
@@ -54,6 +55,8 @@ public class TestExtractorFactory extends TestCase {
    private File xlsEmb;
 
    private File doc;
+   private File doc6;
+   private File doc95;
    private File docx;
    private File dotx;
    private File docEmb;
@@ -79,6 +82,8 @@ public class TestExtractorFactory extends TestCase {
 
       POIDataSamples wpTests = POIDataSamples.getDocumentInstance();
       doc = wpTests.getFile("SampleDoc.doc");
+      doc6 = wpTests.getFile("Word6.doc");
+      doc95 = wpTests.getFile("Word95.doc");
       docx = wpTests.getFile("SampleDoc.docx");
       dotx = wpTests.getFile("test.dotx");
       docEmb = wpTests.getFile("word_with_embeded.doc");
@@ -135,6 +140,23 @@ public class TestExtractorFactory extends TestCase {
             ExtractorFactory.createExtractor(doc).getText().length() > 120
       );
 
+      assertTrue(
+            ExtractorFactory.createExtractor(doc6)
+            instanceof Word6Extractor
+      );
+      assertTrue(
+            ExtractorFactory.createExtractor(doc6).getText().length() > 20
+      );
+      
+      assertTrue(
+            ExtractorFactory.createExtractor(doc95)
+            instanceof Word6Extractor
+      );
+      assertTrue(
+            ExtractorFactory.createExtractor(doc95).getText().length() > 120
+      );
+          
+        
       assertTrue(
             ExtractorFactory.createExtractor(docx)
             instanceof XWPFWordExtractor
@@ -231,6 +253,22 @@ public class TestExtractorFactory extends TestCase {
                                ExtractorFactory.createExtractor(new FileInputStream(doc)).getText().length() > 120
                );
                
+        assertTrue(
+                ExtractorFactory.createExtractor(new FileInputStream(doc6))
+                instanceof Word6Extractor
+        );
+        assertTrue(
+                ExtractorFactory.createExtractor(new FileInputStream(doc6)).getText().length() > 20
+        );
+        
+        assertTrue(
+                ExtractorFactory.createExtractor(new FileInputStream(doc95))
+                instanceof Word6Extractor
+        );
+        assertTrue(
+                ExtractorFactory.createExtractor(new FileInputStream(doc95)).getText().length() > 120
+        );
+        
                assertTrue(
                                ExtractorFactory.createExtractor(new FileInputStream(docx))
                                instanceof XWPFWordExtractor
@@ -311,6 +349,22 @@ public class TestExtractorFactory extends TestCase {
                                ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(doc))).getText().length() > 120
                );
                
+        assertTrue(
+                ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(doc6)))
+                instanceof Word6Extractor
+        );
+        assertTrue(
+                ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(doc6))).getText().length() > 20
+        );
+        
+        assertTrue(
+                ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(doc95)))
+                instanceof Word6Extractor
+        );
+        assertTrue(
+                ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(doc95))).getText().length() > 120
+        );
+        
                // PowerPoint
                assertTrue(
                                ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(ppt)))
index bd31f6253dfcce15d00708dababf22ac9db2fc0b..00d1162e9077b8455d109585cf4fdd15e0c11508 100644 (file)
@@ -169,7 +169,7 @@ public final class HWPFDocument extends HWPFDocumentCore
     
     // Is this document too old for us?
     if(_fib.getNFib() < 106) {
-        throw new OldWordFileFormatException("The document is too old (Word 95 or older) ");
+        throw new OldWordFileFormatException("The document is too old - Word 95 or older. Try HWPFOldDocument instead?");
     }
 
     // use the fib to determine the name of the table stream.