source.dussan.org Git - poi.git/commitdiff
Add PublisherTextExtractor support to ExtractorFactory
author Nick Burch <nick@apache.org>
Mon, 11 Jan 2010 14:55:43 +0000 (14:55 +0000)
committer Nick Burch <nick@apache.org>
Mon, 11 Jan 2010 14:55:43 +0000 (14:55 +0000)
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@897887 13f79535-47bb-0310-9956-ffa450edef68

src/documentation/content/xdocs/status.xml
src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java
src/ooxml/testcases/org/apache/poi/extractor/TestExtractorFactory.java
src/scratchpad/src/org/apache/poi/hpbf/extractor/PublisherTextExtractor.java

index 2925ace088fec0af17d3ddb560fe617422a74931..5ee7b31f2f9b7e088a625c2509156111f39e8be9 100644 (file)
@@ -34,6 +34,7 @@
 
     <changes>
         <release version="3.7-SNAPSHOT" date="2010-??-??">
+           <action dev="POI-DEVELOPERS" type="add">Add PublisherTextExtractor support to ExtractorFactory</action>
            <action dev="POI-DEVELOPERS" type="add">Add XSLF support for text extraction from tables</action>
            <action dev="POI-DEVELOPERS" type="add">Support attachments as embeded documents within the new OutlookTextExtractor</action>
            <action dev="POI-DEVELOPERS" type="add">Add a text extractor (OutlookTextExtractor) to HSMF for simpler extraction of text from .msg files</action>
index 55d8499f5e2e805660ad9d96a01d77dab963472b..203c596574db913bca6753222c7baf9dfbbca76b 100644 (file)
@@ -31,6 +31,7 @@ import org.apache.poi.POITextExtractor;
 import org.apache.poi.POIXMLDocument;
 import org.apache.poi.POIXMLTextExtractor;
 import org.apache.poi.hdgf.extractor.VisioTextExtractor;
+import org.apache.poi.hpbf.extractor.PublisherTextExtractor;
 import org.apache.poi.hslf.extractor.PowerPointExtractor;
 import org.apache.poi.hsmf.MAPIMessage;
 import org.apache.poi.hsmf.datatypes.AttachmentChunks;
@@ -142,6 +143,9 @@ public class ExtractorFactory {
                        if(entry.getName().equals("VisioDocument")) {
                                return new VisioTextExtractor(poifsDir, fs);
                        }
+         if(entry.getName().equals("Quill")) {
+            return new PublisherTextExtractor(poifsDir, fs);
+         }
                        if(
                              entry.getName().equals("__substg1.0_1000001E") ||
                entry.getName().equals("__substg1.0_1000001F") ||
index d327c55828b4fcbe00f1fa14ab45928d2dd667c0..c127427a23a5150716d529a19c334e64eea265a8 100644 (file)
@@ -24,6 +24,7 @@ import org.apache.poi.POIOLE2TextExtractor;
 import org.apache.poi.POITextExtractor;
 import org.apache.poi.POIDataSamples;
 import org.apache.poi.hdgf.extractor.VisioTextExtractor;
+import org.apache.poi.hpbf.extractor.PublisherTextExtractor;
 import org.apache.poi.hslf.extractor.PowerPointExtractor;
 import org.apache.poi.hsmf.extractor.OutlookTextExtactor;
 import org.apache.poi.hssf.extractor.ExcelExtractor;
@@ -62,6 +63,8 @@ public class TestExtractorFactory extends TestCase {
    private File msgEmb;
    
    private File vsd;
+   
+   private File pub;
 
    protected void setUp() throws Exception {
       super.setUp();
@@ -86,6 +89,9 @@ public class TestExtractorFactory extends TestCase {
       POIDataSamples dgTests = POIDataSamples.getDiagramInstance();
       vsd = dgTests.getFile("Test_Visio-Some_Random_Text.vsd");
       
+      POIDataSamples pubTests = POIDataSamples.getPublisherInstance();
+      pub = pubTests.getFile("Simple.pub");
+      
       POIDataSamples olTests = POIDataSamples.getHSMFInstance();
       msg = olTests.getFile("quick.msg");
       msgEmb = olTests.getFile("attachment_test_msg.msg");
@@ -169,6 +175,15 @@ public class TestExtractorFactory extends TestCase {
             ExtractorFactory.createExtractor(vsd).getText().length() > 50
       );
       
+      // Publisher
+      assertTrue(
+            ExtractorFactory.createExtractor(pub)
+            instanceof PublisherTextExtractor
+      );
+      assertTrue(
+            ExtractorFactory.createExtractor(pub).getText().length() > 50
+      );
+      
       // Outlook msg
       assertTrue(
             ExtractorFactory.createExtractor(msg)
@@ -248,6 +263,15 @@ public class TestExtractorFactory extends TestCase {
                                ExtractorFactory.createExtractor(new FileInputStream(vsd)).getText().length() > 50
                );
                
+      // Publisher
+      assertTrue(
+            ExtractorFactory.createExtractor(new FileInputStream(pub))
+            instanceof PublisherTextExtractor
+      );
+      assertTrue(
+            ExtractorFactory.createExtractor(new FileInputStream(pub)).getText().length() > 50
+      );
+      
                // Outlook msg
       assertTrue(
             ExtractorFactory.createExtractor(new FileInputStream(msg))
@@ -302,6 +326,15 @@ public class TestExtractorFactory extends TestCase {
                assertTrue(
                                ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(vsd))).getText().length() > 50
                );
+      
+      // Publisher
+      assertTrue(
+            ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(pub)))
+            instanceof PublisherTextExtractor
+      );
+      assertTrue(
+            ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(pub))).getText().length() > 50
+      );
                
       // Outlook msg
       assertTrue(
@@ -426,6 +459,7 @@ public class TestExtractorFactory extends TestCase {
       assertEquals(1, numWord);
 
       // TODO - PowerPoint
+      // TODO - Publisher
       // TODO - Visio
    }
 }
index 8187ab124993419819bd4271d4c90fe6c930ccb6..5a1c1f43d6c4b005c78ff18382721dabd3ba4616 100644 (file)
@@ -26,6 +26,7 @@ import org.apache.poi.hpbf.HPBFDocument;
 import org.apache.poi.hpbf.model.qcbits.QCBit;
 import org.apache.poi.hpbf.model.qcbits.QCTextBit;
 import org.apache.poi.hpbf.model.qcbits.QCPLCBit.Type12;
+import org.apache.poi.poifs.filesystem.DirectoryNode;
 import org.apache.poi.poifs.filesystem.POIFSFileSystem;
 
 /**
@@ -39,6 +40,9 @@ public final class PublisherTextExtractor extends POIOLE2TextExtractor {
                super(doc);
                this.doc = doc;
        }
+   public PublisherTextExtractor(DirectoryNode dir, POIFSFileSystem fs) throws IOException {
+      this(new HPBFDocument(dir, fs));
+   }
        public PublisherTextExtractor(POIFSFileSystem fs) throws IOException {
                this(new HPBFDocument(fs));
        }