git-svn-id: https://svn.apache.org/repos/asf/poi/branches/ooxml@684273 13f79535-47bb-0310-9956-ffa450edef68tags/REL_3_5_BETA2
@@ -37,6 +37,8 @@ | |||
<!-- Don't forget to update status.xml too! --> | |||
<release version="3.5.1-beta2" date="2008-??-??"> | |||
<action dev="POI-DEVELOPERS" type="fix">Improve how XWPF handles paragraph text</action> | |||
<action dev="POI-DEVELOPERS" type="add">Support in XWPF for headers and footers</action> | |||
<action dev="POI-DEVELOPERS" type="add">45592 - Improve XWPF text extraction to include tables always, and picture text where possible</action> | |||
<action dev="POI-DEVELOPERS" type="add">45545 - Improve XSLF usermodel support, and include XSLF comments in extracted text</action> | |||
<action dev="POI-DEVELOPERS" type="add">45540 - Fix XSSF header and footer support, and include headers and footers in the output of XSSFExcelExtractor</action> |
@@ -34,6 +34,8 @@ | |||
<!-- Don't forget to update changes.xml too! --> | |||
<changes> | |||
<release version="3.5.1-beta2" date="2008-??-??"> | |||
<action dev="POI-DEVELOPERS" type="fix">Improve how XWPF handles paragraph text</action> | |||
<action dev="POI-DEVELOPERS" type="add">Support in XWPF for headers and footers</action> | |||
<action dev="POI-DEVELOPERS" type="add">45592 - Improve XWPF text extraction to include tables always, and picture text where possible</action> | |||
<action dev="POI-DEVELOPERS" type="add">45545 - Improve XSLF usermodel support, and include XSLF comments in extracted text</action> | |||
<action dev="POI-DEVELOPERS" type="add">45540 - Fix XSSF header and footer support, and include headers and footers in the output of XSSFExcelExtractor</action> |
@@ -39,7 +39,8 @@ public abstract class XWPFHeaderFooter { | |||
* Returns the paragraph(s) that holds | |||
* the text of the header or footer. | |||
* Normally there is only the one paragraph, but | |||
* there could be more in certain cases. | |||
* there could be more in certain cases, or | |||
* a table. | |||
*/ | |||
public XWPFParagraph[] getParagraphs() { | |||
XWPFParagraph[] paras = | |||
@@ -51,6 +52,24 @@ public abstract class XWPFHeaderFooter { | |||
} | |||
return paras; | |||
} | |||
/** | |||
* Return the table(s) that holds the text | |||
* of the header or footer, for complex cases | |||
* where a paragraph isn't used. | |||
* Normally there's just one paragraph, but some | |||
* complex headers/footers have a table or two | |||
* in addition. | |||
*/ | |||
public XWPFTable[] getTables() { | |||
XWPFTable[] tables = | |||
new XWPFTable[headerFooter.getTblArray().length]; | |||
for(int i=0; i<tables.length; i++) { | |||
tables[i] = new XWPFTable( | |||
headerFooter.getTblArray(i) | |||
); | |||
} | |||
return tables; | |||
} | |||
/** | |||
* Returns the textual content of the header/footer, | |||
@@ -58,11 +77,21 @@ public abstract class XWPFHeaderFooter { | |||
*/ | |||
public String getText() { | |||
StringBuffer t = new StringBuffer(); | |||
XWPFParagraph[] paras = getParagraphs(); | |||
for (int i = 0; i < paras.length; i++) { | |||
t.append(paras[i].getText()); | |||
// Paragraphs come first: each non-empty paragraph adds
//  its text followed by a newline, empty ones add nothing
for(int i=0; i<paras.length; i++) { | |||
if(! paras[i].isEmpty()) { | |||
t.append(paras[i].getText()); | |||
t.append('\n'); | |||
} | |||
} | |||
// Then the tables: every table adds its text followed
//  by a newline, with no emptiness check
XWPFTable[] tables = getTables(); | |||
for(int i=0; i<tables.length; i++) { | |||
t.append(tables[i].getText()); | |||
t.append('\n'); | |||
} | |||
return t.toString(); | |||
} | |||
} |
@@ -16,6 +16,8 @@ | |||
==================================================================== */ | |||
package org.apache.poi.xwpf.usermodel; | |||
import java.util.ArrayList; | |||
import org.apache.poi.xwpf.XWPFDocument; | |||
import org.apache.poi.xwpf.model.XMLParagraph; | |||
import org.apache.xmlbeans.XmlCursor; | |||
@@ -24,6 +26,10 @@ import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTP; | |||
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTPTab; | |||
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTPicture; | |||
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTR; | |||
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTRPr; | |||
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSdtBlock; | |||
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSdtContentRun; | |||
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSdtRun; | |||
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTText; | |||
import org.w3c.dom.NodeList; | |||
import org.w3c.dom.Text; | |||
@@ -43,15 +49,37 @@ public class XWPFParagraph extends XMLParagraph | |||
public XWPFParagraph(CTP prgrph, XWPFDocument docRef) | |||
{ | |||
super(prgrph); | |||
this.docRef = docRef; | |||
// All the runs to loop over | |||
// TODO - replace this with some sort of XPath expression | |||
// to directly find all the CTRs, in the right order | |||
ArrayList<CTR> rs = new ArrayList<CTR>(); | |||
CTR[] tmp; | |||
// Get the main text runs | |||
tmp = paragraph.getRArray(); | |||
for(int i=0; i<tmp.length; i++) { | |||
rs.add(tmp[i]); | |||
} | |||
this.docRef = docRef; | |||
CTR[] rs = paragraph.getRArray(); | |||
// Not sure quite what these are, but they hold | |||
// more text runs | |||
CTSdtRun[] sdts = paragraph.getSdtArray(); | |||
for(int i=0; i<sdts.length; i++) { | |||
CTSdtContentRun run = sdts[i].getSdtContent(); | |||
tmp = run.getRArray(); | |||
for(int j=0; j<tmp.length; j++) { | |||
rs.add(tmp[j]); | |||
} | |||
} | |||
// Get text of the paragraph | |||
for (int j = 0; j < rs.length; j++) { | |||
for (int j = 0; j < rs.size(); j++) { | |||
// Grab the text and tabs of the paragraph | |||
// Do so in a way that preserves the ordering | |||
XmlCursor c = rs[j].newCursor(); | |||
XmlCursor c = rs.get(j).newCursor(); | |||
c.selectPath( "./*" ); | |||
while(c.toNextSelection()) { | |||
XmlObject o = c.getObject(); | |||
@@ -65,7 +93,7 @@ public class XWPFParagraph extends XMLParagraph | |||
// Loop over pictures inside our | |||
// paragraph, looking for text in them | |||
CTPicture[] picts = rs[j].getPictArray(); | |||
CTPicture[] picts = rs.get(j).getPictArray(); | |||
for (int k = 0; k < picts.length; k++) { | |||
XmlObject[] t = picts[k].selectPath("declare namespace w='http://schemas.openxmlformats.org/wordprocessingml/2006/main' .//w:t"); | |||
for (int m = 0; m < t.length; m++) { |
@@ -32,24 +32,26 @@ public class XWPFTable | |||
{ | |||
protected StringBuffer text=new StringBuffer(); | |||
/**
 * Builds the text of the table by walking each row,
 *  each cell of the row, and each paragraph of the cell,
 *  accumulating the result in {@code text}.
 *
 * @param table the underlying table to read the text from
 */
public XWPFTable(CTTbl table) | |||
{ | |||
for(CTRow row : table.getTrArray()) | |||
{ | |||
for(CTTc cell : row.getTcArray()) | |||
{ | |||
for(CTP ctp : cell.getPArray()) | |||
{ | |||
public XWPFTable(CTTbl table) { | |||
for(CTRow row : table.getTrArray()) { | |||
// Text of the current row, collected paragraph by paragraph
StringBuffer rowText = new StringBuffer(); | |||
for(CTTc cell : row.getTcArray()) { | |||
for(CTP ctp : cell.getPArray()) { | |||
XWPFParagraph p = new XWPFParagraph(ctp); | |||
this.text.append(p.getText()+"\t"); | |||
// A tab separator is added only once the row already
//  holds some text, so paragraphs that come before the
//  first non-empty one add no tab
if(rowText.length() > 0) { | |||
rowText.append('\t'); | |||
} | |||
rowText.append(p.getText()); | |||
} | |||
} | |||
this.text.append("\n"); | |||
// Append the collected row text plus a newline,
//  skipping rows that collected no text at all
if(rowText.length() > 0) { | |||
this.text.append(rowText); | |||
this.text.append('\n'); | |||
} | |||
} | |||
} | |||
/**
 * Returns the contents of the {@code text} buffer
 *  as a single string.
 */
public String getText() | |||
{ | |||
public String getText() { | |||
return text.toString(); | |||
} | |||
} |
@@ -111,7 +111,7 @@ public class TestXWPFWordExtractor extends TestCase { | |||
assertTrue(text.length() > 0); | |||
char euro = '\u20ac'; | |||
// System.err.println("'"+text.substring(text.length() - 20) + "'"); | |||
// System.err.println("'"+text.substring(text.length() - 40) + "'"); | |||
// Check contents | |||
assertTrue(text.startsWith( | |||
@@ -121,7 +121,7 @@ public class TestXWPFWordExtractor extends TestCase { | |||
"As well as gaining "+euro+"90 from child benefit increases, he will also receive the early childhood supplement of "+euro+"250 per quarter for Vincent for the full four quarters of the year.\n\n\n\n \n\n\n" | |||
)); | |||
assertTrue(text.endsWith( | |||
"11.4%\t\t90\t\t\t\t\t250\t\t1,310\t\t\n\n" | |||
"11.4%\t\t90\t\t\t\t\t250\t\t1,310\t\n\n" | |||
)); | |||
// Check number of paragraphs |
@@ -165,7 +165,7 @@ public class TestXWPFHeaderFooterPolicy extends TestCase { | |||
public void testContents() throws Exception { | |||
XWPFHeaderFooterPolicy policy; | |||
// Just test a few bits | |||
// Test a few simple bits off a simple header | |||
policy = diffFirst.getHeaderFooterPolicy(); | |||
assertEquals( | |||
@@ -176,5 +176,18 @@ public class TestXWPFHeaderFooterPolicy extends TestCase { | |||
"First header column!\tMid header\tRight header!\n", | |||
policy.getDefaultHeader().getText() | |||
); | |||
// And a few bits off a more complex header | |||
policy = oddEven.getHeaderFooterPolicy(); | |||
assertEquals( | |||
"\n[]ODD Page Header text\n\n", | |||
policy.getDefaultHeader().getText() | |||
); | |||
assertEquals( | |||
"\n[This is an Even Page, with a Header]\n\n", | |||
policy.getEvenPageHeader().getText() | |||
); | |||
} | |||
} |