Add HPBF hyperlinks support to the extractor

author Nick Burch <nick@apache.org>

Sun, 31 Aug 2008 16:58:29 +0000 (16:58 +0000)

committer Nick Burch <nick@apache.org>

Sun, 31 Aug 2008 16:58:29 +0000 (16:58 +0000)
author Nick Burch <nick@apache.org>
Sun, 31 Aug 2008 16:58:29 +0000 (16:58 +0000)
committer Nick Burch <nick@apache.org>
Sun, 31 Aug 2008 16:58:29 +0000 (16:58 +0000)
diff --git a/src/documentation/content/xdocs/changes.xml b/src/documentation/content/xdocs/changes.xml

index 7165203f63fd7966aefa343f667f383c5604a6b3..d4ad48c822da36d1e04a3f5c0ef10cb7f61ac687 100644 (file)
--- a/src/documentation/content/xdocs/changes.xml
+++ b/src/documentation/content/xdocs/changes.xml
@@ -37,6 +37,7 @@
  
                 <!-- Don't forget to update status.xml too! -->
          <release version="3.1.1-alpha1" date="2008-??-??">
+           <action dev="POI-DEVELOPERS" type="add">Support for HPBF Publisher hyperlinks, including during text extraction</action>
             <action dev="POI-DEVELOPERS" type="fix">26321 and 44958 - preserve position of ArrayRecords and TableRecords among cell value records</action>
             <action dev="POI-DEVELOPERS" type="fix">Impove empty header or footer handling in HWPF HeaderStories</action>
             <action dev="POI-DEVELOPERS" type="fix">Avoid NPE in hssf.usermodel.HeaderFooter when stripping fields out</action>
diff --git a/src/documentation/content/xdocs/status.xml b/src/documentation/content/xdocs/status.xml

index 8fe08bfec04f0f46c61d3844109c63202ee8de2b..e1f1361745bee1443dfdbb8e80753ca8195e6a9e 100644 (file)
--- a/src/documentation/content/xdocs/status.xml
+++ b/src/documentation/content/xdocs/status.xml
@@ -34,6 +34,7 @@
         <!-- Don't forget to update changes.xml too! -->
      <changes>
          <release version="3.1.1-alpha1" date="2008-??-??">
+           <action dev="POI-DEVELOPERS" type="add">Support for HPBF Publisher hyperlinks, including during text extraction</action>
             <action dev="POI-DEVELOPERS" type="fix">26321 and 44958 - preserve position of ArrayRecords and TableRecords among cell value records</action>
             <action dev="POI-DEVELOPERS" type="fix">Impove empty header or footer handling in HWPF HeaderStories</action>
             <action dev="POI-DEVELOPERS" type="fix">Avoid NPE in hssf.usermodel.HeaderFooter when stripping fields out</action>
diff --git a/src/scratchpad/src/org/apache/poi/hpbf/extractor/PublisherTextExtractor.java b/src/scratchpad/src/org/apache/poi/hpbf/extractor/PublisherTextExtractor.java

index 2257283a0fff1c994180999bbfe7f60f4d1a128d..a28f16b7e8d636d6446b43d939acacf1b4d9e9d4 100644 (file)
--- a/src/scratchpad/src/org/apache/poi/hpbf/extractor/PublisherTextExtractor.java
+++ b/src/scratchpad/src/org/apache/poi/hpbf/extractor/PublisherTextExtractor.java
@@ -24,6 +24,7 @@ import org.apache.poi.POIOLE2TextExtractor;
  import org.apache.poi.hpbf.HPBFDocument;
  import org.apache.poi.hpbf.model.qcbits.QCBit;
  import org.apache.poi.hpbf.model.qcbits.QCTextBit;
+import org.apache.poi.hpbf.model.qcbits.QCPLCBit.Type12;
  import org.apache.poi.poifs.filesystem.POIFSFileSystem;
  
  /**
@@ -31,6 +32,7 @@ import org.apache.poi.poifs.filesystem.POIFSFileSystem;
   */
  public class PublisherTextExtractor extends POIOLE2TextExtractor {
         private HPBFDocument doc;
+       private boolean hyperlinksByDefault = false;
         
         public PublisherTextExtractor(HPBFDocument doc) {
                 super(doc);
@@ -43,6 +45,16 @@ public class PublisherTextExtractor extends POIOLE2TextExtractor {
                 this(new POIFSFileSystem(is));
         }
         
+       /**
+        * Should a call to getText() also return the hyperlinks?
+        * They are appended after the text, not placed inline.
+        * Default is no
+        */
+       public void setHyperlinksByDefault(boolean hyperlinksByDefault) {
+               this.hyperlinksByDefault = hyperlinksByDefault;
+       }
+
+       
         public String getText() {
                 StringBuffer text = new StringBuffer();
                 
@@ -55,6 +67,24 @@ public class PublisherTextExtractor extends POIOLE2TextExtractor {
                         }
                 }
                 
+               // If requested, add in the hyperlinks
+               // Ideally, we'd do these inline, but the hyperlink
+               //  positions are relative to the text area the
+               //  hyperlink is in, and we have yet to figure out
+               //  how to tie that together.
+               if(hyperlinksByDefault) {
+                       for(int i=0; i<bits.length; i++) {
+                               if(bits[i] != null && bits[i] instanceof Type12) {
+                                       Type12 hyperlinks = (Type12)bits[i];
+                                       for(int j=0; j<hyperlinks.getNumberOfHyperlinks(); j++) {
+                                               text.append("<");
+                                               text.append(hyperlinks.getHyperlink(j));
+                                               text.append(">\n");
+                                       }
+                               }
+                       }
+               }
+               
                 // Get more text
                 // TODO
                 
diff --git a/src/scratchpad/src/org/apache/poi/hpbf/model/qcbits/QCPLCBit.java b/src/scratchpad/src/org/apache/poi/hpbf/model/qcbits/QCPLCBit.java

index a2eadc52d94c603b75d329824f00b3b3c5a17b0a..4bd57d514ac4a7c795cd75f0f7fc805c50293024 100644 (file)
--- a/src/scratchpad/src/org/apache/poi/hpbf/model/qcbits/QCPLCBit.java
+++ b/src/scratchpad/src/org/apache/poi/hpbf/model/qcbits/QCPLCBit.java
@@ -167,6 +167,10 @@ public class QCPLCBit extends QCBit {
         
         /**
          * Type 12 holds hyperlinks, and is very complex.
+        * There is normally one of these for each text
+        *  area that contains at least one hyperlink.
+        * The character offsets are relative to the start
+        *  of the text area that this applies to.
          */
         public static class Type12 extends QCPLCBit {
                 private String[] hyperlinks;
@@ -249,6 +253,8 @@ public class QCPLCBit extends QCBit {
                  * Returns where in the text (in characters) the
                  *  hyperlink at the given index starts 
                  *  applying to.
+                * This position is relative to the text area that this
+                *  PLCBit applies to.
                  * @param number The hyperlink number, zero based
                  */
                 public int getTextStartAt(int number) {
@@ -258,6 +264,8 @@ public class QCPLCBit extends QCBit {
                  * Returns where in the text that this block
                  *  of hyperlinks stops applying to. Normally,
                  *  but not always the end of the text.
+                * This position is relative to the text area that this
+                *  PLCBit applies to.
                  */
                 public int getAllTextEndAt() {
                         return preData[numberOfPLCs+1];
diff --git a/src/scratchpad/testcases/org/apache/poi/hpbf/extractor/TextPublisherTextExtractor.java b/src/scratchpad/testcases/org/apache/poi/hpbf/extractor/TextPublisherTextExtractor.java

index d5b4712227b2a8f1b6948b476ec69d3b818f830d..1289882254ddbb32f068adfcb8d96be9754fc977 100644 (file)
--- a/src/scratchpad/testcases/org/apache/poi/hpbf/extractor/TextPublisherTextExtractor.java
+++ b/src/scratchpad/testcases/org/apache/poi/hpbf/extractor/TextPublisherTextExtractor.java
@@ -134,4 +134,41 @@ public class TextPublisherTextExtractor extends TestCase {
                 assertEquals(s2007, s2000);
                 assertEquals(s2007, s98);
         }
+       
+       /**
+        * Test that the hyperlink extraction stuff works as well
+        *  as we can hope it to.
+        */
+       public void testWithHyperlinks() throws Exception {
+               File f = new File(dir, "LinkAt10.pub");
+               HPBFDocument doc = new HPBFDocument(
+                               new FileInputStream(f)
+               );
+
+               PublisherTextExtractor ext = 
+                       new PublisherTextExtractor(doc);
+               ext.getText();
+               
+               // Default is no hyperlinks
+               assertEquals("1234567890LINK\n", ext.getText());
+               
+               // Turn on
+               ext.setHyperlinksByDefault(true);
+               assertEquals("1234567890LINK\n<http://poi.apache.org/>\n", ext.getText());
+               
+               
+               // Now a much more complex document
+               f = new File(dir, "Sample.pub");
+               ext = new PublisherTextExtractor(new FileInputStream(f));
+               ext.setHyperlinksByDefault(true);
+               String text = ext.getText();
+               
+               assertTrue(text.endsWith(
+                               "<http://poi.apache.org/>\n" +
+                               "<C:\\Documents and Settings\\Nick\\My Documents\\Booleans.xlsx>\n" +
+                               "<>\n" +
+                               "<mailto:dev@poi.apache.org?subject=HPBF>\n" +
+                               "<mailto:dev@poi.apache.org?subject=HPBF>\n"
+               ));
+       }
  }
author	Nick Burch <nick@apache.org>
	Sun, 31 Aug 2008 16:58:29 +0000 (16:58 +0000)
committer	Nick Burch <nick@apache.org>
	Sun, 31 Aug 2008 16:58:29 +0000 (16:58 +0000)
src/documentation/content/xdocs/changes.xml		patch \| blob \| history
src/documentation/content/xdocs/status.xml		patch \| blob \| history
src/scratchpad/src/org/apache/poi/hpbf/extractor/PublisherTextExtractor.java		patch \| blob \| history
src/scratchpad/src/org/apache/poi/hpbf/model/qcbits/QCPLCBit.java		patch \| blob \| history
src/scratchpad/testcases/org/apache/poi/hpbf/extractor/TextPublisherTextExtractor.java		patch \| blob \| history