]> source.dussan.org Git - poi.git/commitdiff
Add HPBF hyperlinks support to the extractor
author: Nick Burch <nick@apache.org>
Sun, 31 Aug 2008 16:58:29 +0000 (16:58 +0000)
committer: Nick Burch <nick@apache.org>
Sun, 31 Aug 2008 16:58:29 +0000 (16:58 +0000)
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@690729 13f79535-47bb-0310-9956-ffa450edef68

src/documentation/content/xdocs/changes.xml
src/documentation/content/xdocs/status.xml
src/scratchpad/src/org/apache/poi/hpbf/extractor/PublisherTextExtractor.java
src/scratchpad/src/org/apache/poi/hpbf/model/qcbits/QCPLCBit.java
src/scratchpad/testcases/org/apache/poi/hpbf/extractor/TextPublisherTextExtractor.java

index 7165203f63fd7966aefa343f667f383c5604a6b3..d4ad48c822da36d1e04a3f5c0ef10cb7f61ac687 100644 (file)
@@ -37,6 +37,7 @@
 
                <!-- Don't forget to update status.xml too! -->
         <release version="3.1.1-alpha1" date="2008-??-??">
+           <action dev="POI-DEVELOPERS" type="add">Support for HPBF Publisher hyperlinks, including during text extraction</action>
            <action dev="POI-DEVELOPERS" type="fix">26321 and 44958 - preserve position of ArrayRecords and TableRecords among cell value records</action>
            <action dev="POI-DEVELOPERS" type="fix">Improve empty header or footer handling in HWPF HeaderStories</action>
            <action dev="POI-DEVELOPERS" type="fix">Avoid NPE in hssf.usermodel.HeaderFooter when stripping fields out</action>
index 8fe08bfec04f0f46c61d3844109c63202ee8de2b..e1f1361745bee1443dfdbb8e80753ca8195e6a9e 100644 (file)
@@ -34,6 +34,7 @@
        <!-- Don't forget to update changes.xml too! -->
     <changes>
         <release version="3.1.1-alpha1" date="2008-??-??">
+           <action dev="POI-DEVELOPERS" type="add">Support for HPBF Publisher hyperlinks, including during text extraction</action>
            <action dev="POI-DEVELOPERS" type="fix">26321 and 44958 - preserve position of ArrayRecords and TableRecords among cell value records</action>
            <action dev="POI-DEVELOPERS" type="fix">Improve empty header or footer handling in HWPF HeaderStories</action>
            <action dev="POI-DEVELOPERS" type="fix">Avoid NPE in hssf.usermodel.HeaderFooter when stripping fields out</action>
index 2257283a0fff1c994180999bbfe7f60f4d1a128d..a28f16b7e8d636d6446b43d939acacf1b4d9e9d4 100644 (file)
@@ -24,6 +24,7 @@ import org.apache.poi.POIOLE2TextExtractor;
 import org.apache.poi.hpbf.HPBFDocument;
 import org.apache.poi.hpbf.model.qcbits.QCBit;
 import org.apache.poi.hpbf.model.qcbits.QCTextBit;
+import org.apache.poi.hpbf.model.qcbits.QCPLCBit.Type12;
 import org.apache.poi.poifs.filesystem.POIFSFileSystem;
 
 /**
@@ -31,6 +32,7 @@ import org.apache.poi.poifs.filesystem.POIFSFileSystem;
  */
 public class PublisherTextExtractor extends POIOLE2TextExtractor {
        private HPBFDocument doc;
+       private boolean hyperlinksByDefault = false;
        
        public PublisherTextExtractor(HPBFDocument doc) {
                super(doc);
@@ -43,6 +45,16 @@ public class PublisherTextExtractor extends POIOLE2TextExtractor {
                this(new POIFSFileSystem(is));
        }
        
+       /**
+        * Should a call to getText() also return the hyperlinks,
+        *  appended after the text (not yet placed inline)?
+        * Default is no
+        */
+       public void setHyperlinksByDefault(boolean hyperlinksByDefault) {
+               this.hyperlinksByDefault = hyperlinksByDefault;
+       }
+
+       
        public String getText() {
                StringBuffer text = new StringBuffer();
                
@@ -55,6 +67,24 @@ public class PublisherTextExtractor extends POIOLE2TextExtractor {
                        }
                }
                
+               // If requested, add in the hyperlinks
+               // Ideally, we'd do these inline, but the hyperlink
+               //  positions are relative to the text area the
+               //  hyperlink is in, and we have yet to figure out
+               //  how to tie that together.
+               if(hyperlinksByDefault) {
+                       for(int i=0; i<bits.length; i++) {
+                               if(bits[i] != null && bits[i] instanceof Type12) {
+                                       Type12 hyperlinks = (Type12)bits[i];
+                                       for(int j=0; j<hyperlinks.getNumberOfHyperlinks(); j++) {
+                                               text.append("<");
+                                               text.append(hyperlinks.getHyperlink(j));
+                                               text.append(">\n");
+                                       }
+                               }
+                       }
+               }
+               
                // Get more text
                // TODO
                
index a2eadc52d94c603b75d329824f00b3b3c5a17b0a..4bd57d514ac4a7c795cd75f0f7fc805c50293024 100644 (file)
@@ -167,6 +167,10 @@ public class QCPLCBit extends QCBit {
        
        /**
         * Type 12 holds hyperlinks, and is very complex.
+        * There is normally one of these for each text
+        *  area that contains at least one hyperlink.
+        * The character offsets are relative to the start
+        *  of the text area that this applies to.
         */
        public static class Type12 extends QCPLCBit {
                private String[] hyperlinks;
@@ -249,6 +253,8 @@ public class QCPLCBit extends QCBit {
                 * Returns where in the text (in characters) the
                 *  hyperlink at the given index starts 
                 *  applying to.
+                * This position is relative to the text area that this
+                *  PLCBit applies to.
                 * @param number The hyperlink number, zero based
                 */
                public int getTextStartAt(int number) {
@@ -258,6 +264,8 @@ public class QCPLCBit extends QCBit {
                 * Returns where in the text that this block
                 *  of hyperlinks stops applying to. Normally,
                 *  but not always the end of the text.
+                * This position is relative to the text area that this
+                *  PLCBit applies to.
                 */
                public int getAllTextEndAt() {
                        return preData[numberOfPLCs+1];
index d5b4712227b2a8f1b6948b476ec69d3b818f830d..1289882254ddbb32f068adfcb8d96be9754fc977 100644 (file)
@@ -134,4 +134,41 @@ public class TextPublisherTextExtractor extends TestCase {
                assertEquals(s2007, s2000);
                assertEquals(s2007, s98);
        }
+       
+       /**
+        * Test that the hyperlink extraction stuff works as well
+        *  as we can hope it to.
+        */
+       public void testWithHyperlinks() throws Exception {
+               File f = new File(dir, "LinkAt10.pub");
+               HPBFDocument doc = new HPBFDocument(
+                               new FileInputStream(f)
+               );
+
+               PublisherTextExtractor ext = 
+                       new PublisherTextExtractor(doc);
+               ext.getText();
+               
+               // Default is no hyperlinks
+               assertEquals("1234567890LINK\n", ext.getText());
+               
+               // Turn on
+               ext.setHyperlinksByDefault(true);
+               assertEquals("1234567890LINK\n<http://poi.apache.org/>\n", ext.getText());
+               
+               
+               // Now a much more complex document
+               f = new File(dir, "Sample.pub");
+               ext = new PublisherTextExtractor(new FileInputStream(f));
+               ext.setHyperlinksByDefault(true);
+               String text = ext.getText();
+               
+               assertTrue(text.endsWith(
+                               "<http://poi.apache.org/>\n" +
+                               "<C:\\Documents and Settings\\Nick\\My Documents\\Booleans.xlsx>\n" +
+                               "<>\n" +
+                               "<mailto:dev@poi.apache.org?subject=HPBF>\n" +
+                               "<mailto:dev@poi.apache.org?subject=HPBF>\n"
+               ));
+       }
 }