<changes>
<release version="3.7-beta3" date="2010-??-??">
+ <action dev="poi-developers" type="fix">Improve handling of Hyperlinks inside XWPFParagraph objects through XWPFHyperlinkRun</action>
+ <action dev="poi-developers" type="fix">Make XWPFParagraph rely more on XWPFRun, and less on internal StringBuffers</action>
<action dev="poi-developers" type="add">Add a getBodyElements() method to XWPF IBody, to make access to embedded paragraphs and tables easier</action>
<action dev="poi-developers" type="add">More XSLFRelation entries for common .pptx file parts</action>
<action dev="poi-developers" type="fix">49872 - avoid exception in XSSFFormulaEvaluator.evaluateInCell when evaluating shared formulas</action>
import org.apache.poi.xwpf.model.XWPFHyperlinkDecorator;
import org.apache.poi.xwpf.model.XWPFParagraphDecorator;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
+import org.apache.poi.xwpf.usermodel.XWPFHyperlink;
+import org.apache.poi.xwpf.usermodel.XWPFHyperlinkRun;
import org.apache.poi.xwpf.usermodel.XWPFParagraph;
import org.apache.poi.xwpf.usermodel.XWPFRelation;
+import org.apache.poi.xwpf.usermodel.XWPFRun;
import org.apache.poi.xwpf.usermodel.XWPFTable;
import org.apache.xmlbeans.XmlException;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSectPr;
extractHeaders(text, headerFooterPolicy);
}
- XWPFParagraphDecorator decorator = new XWPFCommentsDecorator(
- new XWPFHyperlinkDecorator(paragraph, null, fetchHyperlinks));
- text.append(decorator.getText()).append('\n');
+ // Do the paragraph text
+ for(XWPFRun run : paragraph.getRuns()) {
+ text.append(run.toString());
+ if(run instanceof XWPFHyperlinkRun && fetchHyperlinks) {
+ XWPFHyperlink link = ((XWPFHyperlinkRun)run).getHyperlink(document);
+ if(link != null)
+ text.append(" <" + link.getURL() + ">");
+ }
+ }
+
+ // Add comments
+ XWPFCommentsDecorator decorator = new XWPFCommentsDecorator(paragraph, null);
+ text.append(decorator.getCommentText()).append('\n');
+
+ // Do endnotes, footnotes and pictures
+ for(String str : new String[] {
+ paragraph.getFootnoteText(), paragraph.getPictureText()
+ }) {
+ if(str != null && str.length() > 0) {
+ text.append(str + "\n");
+ }
+ }
if (ctSectPr!=null) {
extractFooters(text, headerFooterPolicy);
}
}
+ /**
+  * Returns the contents of commentText on its own, without the
+  * super.getText() text that getText() puts in front of it.
+  */
+ public String getCommentText() {
+ return commentText.toString();
+ }
+
public String getText() {
return super.getText() + commentText;
}
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTHyperlink;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTR;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTText;
+import org.apache.poi.xwpf.usermodel.XWPFHyperlinkRun;
import org.apache.poi.xwpf.usermodel.XWPFParagraph;
/**
* Decorator class for XWPFParagraph allowing to add hyperlinks
* found in paragraph to its text.
*
- * TODO - add the hyperlink text in the right place, and not just
- * at the end
+ * Note - adds the hyperlink at the end, not in the right place...
+ *
+ * @deprecated Use {@link XWPFHyperlinkRun} instead
*/
+@Deprecated
public class XWPFHyperlinkDecorator extends XWPFParagraphDecorator {
private StringBuffer hyperlinkText;
--- /dev/null
+/* ====================================================================
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+==================================================================== */
+package org.apache.poi.xwpf.usermodel;
+
+import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTHyperlink;
+import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTR;
+
+/**
+ * A run of text with a Hyperlink applied to it.
+ * Any given Hyperlink may be made up of multiple of these.
+ */
+public class XWPFHyperlinkRun extends XWPFRun
+{
+    /** The hyperlink element this run was created with; never reassigned. */
+    private final CTHyperlink hyperlink;
+
+    /**
+     * Creates a run that remembers the hyperlink element it was built for.
+     *
+     * @param hyperlink the hyperlink element associated with this run
+     * @param run the underlying run, handed on to {@link XWPFRun}
+     * @param p the paragraph, handed on to {@link XWPFRun}
+     */
+    public XWPFHyperlinkRun(CTHyperlink hyperlink, CTR run, XWPFParagraph p) {
+        super(run, p);
+        this.hyperlink = hyperlink;
+    }
+
+    /**
+     * Returns the hyperlink element passed to the constructor.
+     */
+    public CTHyperlink getCTHyperlink() {
+        return hyperlink;
+    }
+
+    /**
+     * Returns the anchor value of the hyperlink element,
+     * exactly as the element reports it.
+     */
+    public String getAnchor() {
+        return hyperlink.getAnchor();
+    }
+
+    /**
+     * Returns the ID of the hyperlink, if one is set.
+     */
+    public String getHyperlinkId() {
+        return hyperlink.getId();
+    }
+
+    /**
+     * Sets the ID on the hyperlink element.
+     *
+     * @param id the new ID value
+     */
+    public void setHyperlinkId(String id) {
+        hyperlink.setId(id);
+    }
+
+    /**
+     * If this Hyperlink is an external reference hyperlink,
+     * return the object for it.
+     *
+     * @param document the document to look the hyperlink ID up in
+     * @return the result of {@code document.getHyperlinkByID(id)}, or
+     *         <code>null</code> if this run has no hyperlink ID or no
+     *         document was supplied
+     */
+    public XWPFHyperlink getHyperlink(XWPFDocument document) {
+        String id = getHyperlinkId();
+        // Without an ID or a document there is nothing to look up;
+        // report "no hyperlink" rather than failing on a null document
+        if (id == null || document == null) {
+            return null;
+        }
+
+        return document.getHyperlinkByID(id);
+    }
+}
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
-import java.util.Arrays;
import org.apache.poi.util.Internal;
import org.apache.xmlbeans.XmlCursor;
import org.apache.xmlbeans.XmlObject;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTBorder;
-import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTEmpty;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTFtnEdnRef;
+import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTHyperlink;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTInd;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTJc;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTOnOff;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTP;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTPBdr;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTPPr;
-import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTPTab;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTPicture;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTProofErr;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTR;
protected XWPFDocument document;
protected List<XWPFRun> runs;
- /**
- * TODO - replace with RichText String
- */
- private StringBuffer text = new StringBuffer();
private StringBuffer pictureText = new StringBuffer();
private StringBuffer footnoteText = new StringBuffer();
}
runs = new ArrayList<XWPFRun>();
- if (prgrph.getRList().size() > 0) {
- for(CTR ctRun : prgrph.getRList()) {
- runs.add(new XWPFRun(ctRun, this));
- }
- }
-
- if (!isEmpty()) {
- readNewText();
- }
- }
-
- protected String readNewText() {
- StringBuffer text = new StringBuffer();
-
- // All the runs to loop over
- // TODO - replace this with some sort of XPath expression
- // to directly find all the CTRs, in the right order
- ArrayList<CTR> rs = new ArrayList<CTR>();
- rs.addAll( paragraph.getRList() );
-
- for (CTSdtRun sdt : paragraph.getSdtList()) {
- CTSdtContentRun run = sdt.getSdtContent();
- rs.addAll( run.getRList() );
- }
- for (CTRunTrackChange c : paragraph.getDelList()) {
- rs.addAll( c.getRList() );
- }
- for (CTRunTrackChange c : paragraph.getInsList()) {
- rs.addAll( c.getRList() );
- }
- for (CTSimpleField f : paragraph.getFldSimpleList()) {
- rs.addAll( f.getRList() );
- }
- // Get text of the paragraph
- for (int j = 0; j < rs.size(); j++) {
- // Grab the text and tabs of the paragraph
- // Do so in a way that preserves the ordering
- XmlCursor c = rs.get(j).newCursor();
- c.selectPath("./*");
- while (c.toNextSelection()) {
- XmlObject o = c.getObject();
- if (o instanceof CTText) {
- String tagName = o.getDomNode().getNodeName();
- // Field Codes (w:instrText, defined in spec sec. 17.16.23)
- // come up as instances of CTText, but we don't want them
- // in the normal text output
- if (!"w:instrText".equals(tagName)) {
- text.append(((CTText) o).getStringValue());
- }
- }
- if (o instanceof CTPTab) {
- text.append("\t");
- }
- if (o instanceof CTEmpty) {
- // Some inline text elements get returned not as
- // themselves, but as CTEmpty, owing to some odd
- // definitions around line 5642 of the XSDs
- String tagName = o.getDomNode().getNodeName();
- if ("w:tab".equals(tagName)) {
- text.append("\t");
- }
- if ("w:cr".equals(tagName)) {
- text.append("\n");
- }
- }
-
- // Check for bits that only apply when
- // attached to a core document
- if(document != null) {
- //got a reference to a footnote
- if (o instanceof CTFtnEdnRef) {
- CTFtnEdnRef ftn = (CTFtnEdnRef) o;
- footnoteText.append("[").append(ftn.getId()).append(": ");
- XWPFFootnote footnote =
- ftn.getDomNode().getLocalName().equals("footnoteReference") ?
- document.getFootnoteByID(ftn.getId().intValue()) :
- document.getEndnoteByID(ftn.getId().intValue());
+ // Get all our child nodes in order, and process them
+ // into XWPFRuns where we can
+ XmlCursor c = paragraph.newCursor();
+ c.selectPath("child::*");
+ while (c.toNextSelection()) {
+ XmlObject o = c.getObject();
+ if(o instanceof CTR) {
+ runs.add(new XWPFRun((CTR)o, this));
+ }
+ if(o instanceof CTHyperlink) {
+ CTHyperlink link = (CTHyperlink)o;
+ for(CTR r : link.getRList()) {
+ runs.add(new XWPFHyperlinkRun(link, r, this));
+ }
+ }
+ if(o instanceof CTSdtRun) {
+ CTSdtContentRun run = ((CTSdtRun)o).getSdtContent();
+ for(CTR r : run.getRList()) {
+ runs.add(new XWPFRun(r, this));
+ }
+ }
+ if(o instanceof CTRunTrackChange) {
+ for(CTR r : ((CTRunTrackChange)o).getRList()) {
+ runs.add(new XWPFRun(r, this));
+ }
+ }
+ if(o instanceof CTSimpleField) {
+ for(CTR r : ((CTSimpleField)o).getRList()) {
+ runs.add(new XWPFRun(r, this));
+ }
+ }
+ }
+
+ // Look for bits associated with the runs
+ for(XWPFRun run : runs) {
+ CTR r = run.getCTR();
+
+ // Check for bits that only apply when
+ // attached to a core document
+ if(document != null) {
+ c = r.newCursor();
+ c.selectPath("child::*");
+ while (c.toNextSelection()) {
+ XmlObject o = c.getObject();
+ if(o instanceof CTFtnEdnRef) {
+ CTFtnEdnRef ftn = (CTFtnEdnRef)o;
+ footnoteText.append("[").append(ftn.getId()).append(": ");
+ XWPFFootnote footnote =
+ ftn.getDomNode().getLocalName().equals("footnoteReference") ?
+ document.getFootnoteByID(ftn.getId().intValue()) :
+ document.getEndnoteByID(ftn.getId().intValue());
- boolean first = true;
- for (XWPFParagraph p : footnote.getParagraphs()) {
- if (!first) {
- footnoteText.append("\n");
- first = false;
- }
- footnoteText.append(p.getText());
- }
+ boolean first = true;
+ for (XWPFParagraph p : footnote.getParagraphs()) {
+ if (!first) {
+ footnoteText.append("\n");
+ first = false;
+ }
+ footnoteText.append(p.getText());
+ }
- footnoteText.append("]");
- }
- }
+ footnoteText.append("]");
+ }
+ }
}
// Loop over pictures inside our
// paragraph, looking for text in them
- for(CTPicture pict : rs.get(j).getPictList()) {
+ for(CTPicture pict : r.getPictList()) {
XmlObject[] t = pict
.selectPath("declare namespace w='http://schemas.openxmlformats.org/wordprocessingml/2006/main' .//w:t");
for (int m = 0; m < t.length; m++) {
}
}
}
-
- this.text = text;
- return text.toString();
}
@Internal
*/
public String getText() {
StringBuffer out = new StringBuffer();
- out.append(text).append(footnoteText).append(pictureText);
+ for(XWPFRun run : runs) { // text of each run first, in list order
+ out.append(run.toString());
+ }
+ out.append(footnoteText).append(pictureText); // footnote text, then picture text, go last
return out.toString();
}
* paragraph
*/
public String getParagraphText() {
- return text.toString();
+ StringBuffer out = new StringBuffer(); // run text only; footnoteText and pictureText are not appended here
+ for(XWPFRun run : runs) {
+ out.append(run.toString());
+ }
+ return out.toString();
}
/**
pos = paragraph.getRList().size();
paragraph.addNewR();
paragraph.setRArray(pos, run);
- for (CTText ctText: paragraph.getRArray(pos).getTList()) {
- this.text.append(ctText.getStringValue());
- }
}
/**
import java.math.BigInteger;
import org.apache.poi.util.Internal;
+import org.apache.xmlbeans.XmlObject;
import org.apache.xmlbeans.XmlString;
import org.apache.xmlbeans.XmlCursor;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTBr;
+import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTEmpty;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTFonts;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTHpsMeasure;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTOnOff;
+import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTPTab;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTR;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTRPr;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSignedHpsMeasure;
}
}
+ /**
+  * Returns the string version of the text, with tabs and
+  * carriage returns in place of their xml equivalents.
+  * Field codes (w:instrText) are left out of the result.
+  */
+ @Override
+ public String toString() {
+     StringBuffer text = new StringBuffer();
+
+     // Grab the text and tabs of the text run
+     // Do so in a way that preserves the ordering
+     XmlCursor c = run.newCursor();
+     try {
+         c.selectPath("./*");
+         while (c.toNextSelection()) {
+             XmlObject o = c.getObject();
+             if (o instanceof CTText) {
+                 String tagName = o.getDomNode().getNodeName();
+                 // Field Codes (w:instrText, defined in spec sec. 17.16.23)
+                 // come up as instances of CTText, but we don't want them
+                 // in the normal text output
+                 if (!"w:instrText".equals(tagName)) {
+                     text.append(((CTText) o).getStringValue());
+                 }
+             }
+             if (o instanceof CTPTab) {
+                 text.append("\t");
+             }
+             if (o instanceof CTEmpty) {
+                 // Some inline text elements get returned not as
+                 // themselves, but as CTEmpty, owing to some odd
+                 // definitions around line 5642 of the XSDs
+                 String tagName = o.getDomNode().getNodeName();
+                 if ("w:tab".equals(tagName)) {
+                     text.append("\t");
+                 }
+                 if ("w:cr".equals(tagName)) {
+                     text.append("\n");
+                 }
+             }
+         }
+     } finally {
+         // Always release the cursor, even if walking the run fails,
+         // so it does not linger until garbage collection
+         c.dispose();
+     }
+
+     return text.toString();
+ }
}
public String getText(){
StringBuffer text = new StringBuffer();
for (XWPFParagraph p : paragraphs) {
- text.append(p.readNewText());
+ text.append(p.getText()); // paragraphs are appended back to back, with no separator added between them
}
return text.toString();
}
XWPFWordExtractor extractor = new XWPFWordExtractor(doc);
// Now check contents
- // TODO - fix once correctly handling contents
extractor.setFetchHyperlinks(false);
assertEquals(
-// "This is a test document\nThis bit is in bold and italic\n" +
-// "Back to normal\nWe have a hyperlink here, and another.\n",
- "This is a test document\nThis bit is in bold and italic\n" +
- "Back to normal\nWe have a here, and .hyperlinkanother\n",
+ "This is a test document\nThis bit is in bold and italic\n" +
+ "Back to normal\nWe have a hyperlink here, and another.\n",
extractor.getText()
);
+ // One hyperlink is a real one, one is just to the top of page
extractor.setFetchHyperlinks(true);
assertEquals(
-// "This is a test document\nThis bit is in bold and italic\n" +
-// "Back to normal\nWe have a hyperlink here, and another.\n",
- "This is a test document\nThis bit is in bold and italic\n" +
- "Back to normal\nWe have a here, and .hyperlink <http://poi.apache.org/>another\n",
+ "This is a test document\nThis bit is in bold and italic\n" +
+ "Back to normal\nWe have a hyperlink <http://poi.apache.org/> here, and another.\n",
extractor.getText()
);
}
policy = oddEven.getHeaderFooterPolicy();
assertEquals(
- "[]ODD Page Header text\n\n",
+ "[ODD Page Header text]\n\n",
policy.getDefaultHeader().getText()
);
assertEquals(