<!-- Don't forget to update status.xml too! -->
<release version="3.1.1-alpha1" date="2008-??-??">
+ <action dev="POI-DEVELOPERS" type="add">Support for HPBF Publisher hyperlinks, including during text extraction</action>
<action dev="POI-DEVELOPERS" type="fix">26321 and 44958 - preserve position of ArrayRecords and TableRecords among cell value records</action>
<action dev="POI-DEVELOPERS" type="fix">Improve empty header or footer handling in HWPF HeaderStories</action>
<action dev="POI-DEVELOPERS" type="fix">Avoid NPE in hssf.usermodel.HeaderFooter when stripping fields out</action>
<!-- Don't forget to update changes.xml too! -->
<changes>
<release version="3.1.1-alpha1" date="2008-??-??">
+ <action dev="POI-DEVELOPERS" type="add">Support for HPBF Publisher hyperlinks, including during text extraction</action>
<action dev="POI-DEVELOPERS" type="fix">26321 and 44958 - preserve position of ArrayRecords and TableRecords among cell value records</action>
<action dev="POI-DEVELOPERS" type="fix">Improve empty header or footer handling in HWPF HeaderStories</action>
<action dev="POI-DEVELOPERS" type="fix">Avoid NPE in hssf.usermodel.HeaderFooter when stripping fields out</action>
import org.apache.poi.hpbf.HPBFDocument;
import org.apache.poi.hpbf.model.qcbits.QCBit;
import org.apache.poi.hpbf.model.qcbits.QCTextBit;
+import org.apache.poi.hpbf.model.qcbits.QCPLCBit.Type12;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
/**
*/
public class PublisherTextExtractor extends POIOLE2TextExtractor {
private HPBFDocument doc;
+ private boolean hyperlinksByDefault = false;
public PublisherTextExtractor(HPBFDocument doc) {
super(doc);
this(new POIFSFileSystem(is));
}
+ /**
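+ * Should a call to getText() also return the hyperlinks
+ * found in the document? When enabled, each hyperlink is
+ * appended after the text, wrapped in angle brackets and
+ * on its own line, rather than inline at its position.
+ * Default is no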
+ */
+ public void setHyperlinksByDefault(boolean hyperlinksByDefault) {
+ this.hyperlinksByDefault = hyperlinksByDefault;
+ }
+
+
public String getText() {
StringBuffer text = new StringBuffer();
}
}
+ // If requested, add in the hyperlinks
+ // Ideally, we'd do these inline, but the hyperlink
+ // positions are relative to the text area the
+ // hyperlink is in, and we have yet to figure out
+ // how to tie that together.
+ if(hyperlinksByDefault) {
+ for(int i=0; i<bits.length; i++) {
+ if(bits[i] != null && bits[i] instanceof Type12) {
+ Type12 hyperlinks = (Type12)bits[i];
+ for(int j=0; j<hyperlinks.getNumberOfHyperlinks(); j++) {
+ text.append("<");
+ text.append(hyperlinks.getHyperlink(j));
+ text.append(">\n");
+ }
+ }
+ }
+ }
+
// Get more text
// TODO
/**
* Type 12 holds hyperlinks, and is very complex.
+ * There is normally one of these for each text
+ * area that contains at least one hyperlink.
+ * The character offsets are relative to the start
+ * of the text area that this applies to.
*/
public static class Type12 extends QCPLCBit {
private String[] hyperlinks;
* Returns where in the text (in characters) the
* hyperlink at the given index starts
* applying to.
+ * This position is relative to the text area that this
+ * PLCBit applies to.
* @param number The hyperlink number, zero based
*/
public int getTextStartAt(int number) {
* Returns where in the text that this block
* of hyperlinks stops applying to. Normally,
* but not always, the end of the text.
+ * This position is relative to the text area that this
+ * PLCBit applies to.
*/
public int getAllTextEndAt() {
return preData[numberOfPLCs+1];
assertEquals(s2007, s2000);
assertEquals(s2007, s98);
}
+
+ /**
+ * Test that the hyperlink extraction stuff works as well
+ * as we can hope it to.
+ * First checks LinkAt10.pub: with the default settings
+ * getText() returns only the text, and once
+ * setHyperlinksByDefault(true) has been called the
+ * hyperlink is appended after the text, wrapped in angle
+ * brackets and followed by a newline.
+ * Then checks Sample.pub, where only the trailing list of
+ * hyperlinks is verified, not the text before it.
+ */
+ public void testWithHyperlinks() throws Exception {
+ File f = new File(dir, "LinkAt10.pub");
+ HPBFDocument doc = new HPBFDocument(
+ new FileInputStream(f)
+ );
+
+ PublisherTextExtractor ext =
+ new PublisherTextExtractor(doc);
+ ext.getText();
+
+ // By default, no hyperlinks are included in the output
+ assertEquals("1234567890LINK\n", ext.getText());
+
+ // Turn hyperlink output on; the link is appended after the text
+ ext.setHyperlinksByDefault(true);
+ assertEquals("1234567890LINK\n<http://poi.apache.org/>\n", ext.getText());
+
+
+ // Now a much more complex document; only the hyperlinks at the end of the output are checked
+ f = new File(dir, "Sample.pub");
+ ext = new PublisherTextExtractor(new FileInputStream(f));
+ ext.setHyperlinksByDefault(true);
+ String text = ext.getText();
+
+ assertTrue(text.endsWith(
+ "<http://poi.apache.org/>\n" +
+ "<C:\\Documents and Settings\\Nick\\My Documents\\Booleans.xlsx>\n" +
+ "<>\n" +
+ "<mailto:dev@poi.apache.org?subject=HPBF>\n" +
+ "<mailto:dev@poi.apache.org?subject=HPBF>\n"
+ ));
+ }
}