Browse Source

XSLF: text extraction from tables

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@897875 13f79535-47bb-0310-9956-ffa450edef68
tags/REL_3_7_BETA1
Maxim Valyanskiy 14 years ago
parent
commit
b1c8c26708

+ 14
- 44
src/ooxml/java/org/apache/poi/xslf/extractor/XSLFPowerPointExtractor.java View File

@@ -16,28 +16,18 @@
==================================================================== */
package org.apache.poi.xslf.extractor;

import java.io.IOException;

import org.apache.poi.POIXMLTextExtractor;
import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.xslf.XSLFSlideShow;
import org.apache.poi.xslf.usermodel.DrawingParagraph;
import org.apache.poi.xslf.usermodel.XMLSlideShow;
import org.apache.poi.xslf.usermodel.XSLFCommonSlideData;
import org.apache.poi.xslf.usermodel.XSLFSlide;
import org.apache.xmlbeans.XmlException;
import org.apache.xmlbeans.XmlObject;
import org.apache.xmlbeans.XmlCursor;
import org.openxmlformats.schemas.drawingml.x2006.main.CTRegularTextRun;
import org.openxmlformats.schemas.drawingml.x2006.main.CTTextBody;
import org.openxmlformats.schemas.drawingml.x2006.main.CTTextParagraph;
import org.openxmlformats.schemas.drawingml.x2006.main.CTTextLineBreak;
import org.openxmlformats.schemas.presentationml.x2006.main.CTComment;
import org.openxmlformats.schemas.presentationml.x2006.main.CTCommentList;
import org.openxmlformats.schemas.presentationml.x2006.main.CTGroupShape;
import org.openxmlformats.schemas.presentationml.x2006.main.CTNotesSlide;
import org.openxmlformats.schemas.presentationml.x2006.main.CTShape;
import org.openxmlformats.schemas.presentationml.x2006.main.CTSlide;
import org.openxmlformats.schemas.presentationml.x2006.main.CTSlideIdListEntry;
import org.openxmlformats.schemas.presentationml.x2006.main.*;

import java.io.IOException;

public class XSLFPowerPointExtractor extends POIXMLTextExtractor {
private XMLSlideShow slideshow;
@@ -110,7 +100,7 @@ public class XSLFPowerPointExtractor extends POIXMLTextExtractor {
slideshow._getXSLFSlideShow().getSlideComments(slideId);
if(slideText) {
extractText(rawSlide.getCSld().getSpTree(), text);
extractText(slides[i].getCommonSlideData(), text);
// Comments too for the slide
if(comments != null) {
@@ -123,8 +113,9 @@ public class XSLFPowerPointExtractor extends POIXMLTextExtractor {
}
}
}

if(notesText && notes != null) {
extractText(notes.getCSld().getSpTree(), text);
extractText(new XSLFCommonSlideData(notes.getCSld()), text);
}
} catch(Exception e) {
throw new RuntimeException(e);
@@ -134,31 +125,10 @@ public class XSLFPowerPointExtractor extends POIXMLTextExtractor {
return text.toString();
}
/**
 * Appends the text of every shape that sits directly in the given group
 * shape to {@code text}.
 *
 * <p>For each shape that has a text body, every paragraph is walked child by
 * child: regular text runs contribute their text, line breaks contribute a
 * newline, and any other child element is ignored. Each paragraph is
 * terminated with a newline.
 *
 * @param gs   the group shape whose direct {@code sp} children are read
 * @param text the buffer the extracted text is appended to
 */
private void extractText(CTGroupShape gs, StringBuffer text) {
    CTShape[] shapes = gs.getSpArray();
    for (int i = 0; i < shapes.length; i++) {
        CTTextBody textBody = shapes[i].getTxBody();
        if (textBody == null) {
            // Shape carries no text at all
            continue;
        }

        CTTextParagraph[] paras = textBody.getPArray();
        for (int j = 0; j < paras.length; j++) {
            XmlCursor c = paras[j].newCursor();
            try {
                c.selectPath("./*");
                while (c.toNextSelection()) {
                    XmlObject o = c.getObject();
                    if (o instanceof CTRegularTextRun) {
                        CTRegularTextRun txrun = (CTRegularTextRun) o;
                        text.append(txrun.getT());
                    } else if (o instanceof CTTextLineBreak) {
                        text.append('\n');
                    }
                }
            } finally {
                // Cursors hold resources in the underlying XML store and
                // must be released explicitly once we are done with them.
                c.dispose();
            }
            // End each paragraph with a new line
            text.append("\n");
        }
    }
}
/**
 * Appends every paragraph reported by {@code data.getText()} to
 * {@code text}, each followed by a newline.
 *
 * @param data the common slide data to read paragraphs from
 * @param text the buffer the extracted text is appended to
 */
private void extractText(XSLFCommonSlideData data, StringBuffer text) {
    for (DrawingParagraph paragraph : data.getText()) {
        // One output line per paragraph
        text.append(paragraph.getText()).append("\n");
    }
}
}

+ 33
- 0
src/ooxml/java/org/apache/poi/xslf/usermodel/DrawingParagraph.java View File

@@ -0,0 +1,33 @@
package org.apache.poi.xslf.usermodel;

import org.openxmlformats.schemas.drawingml.x2006.main.CTTextParagraph;
import org.openxmlformats.schemas.drawingml.x2006.main.CTRegularTextRun;
import org.openxmlformats.schemas.drawingml.x2006.main.CTTextLineBreak;
import org.apache.xmlbeans.XmlCursor;
import org.apache.xmlbeans.XmlObject;

/**
 * Read-only wrapper around a DrawingML {@link CTTextParagraph} that exposes
 * the paragraph's plain text.
 */
public class DrawingParagraph {
    private final CTTextParagraph p;

    /**
     * @param p the XML paragraph bean to wrap
     */
    public DrawingParagraph(CTTextParagraph p) {
        this.p = p;
    }

    /**
     * Builds the plain text of this paragraph by visiting its direct child
     * elements in document order.
     *
     * <p>Regular text runs contribute their text and line breaks contribute
     * a single {@code '\n'}; every other child element is skipped. No
     * trailing newline is added for the paragraph itself.
     *
     * @return the paragraph text (a freshly built {@link StringBuilder})
     */
    public CharSequence getText() {
        StringBuilder text = new StringBuilder();

        XmlCursor c = p.newCursor();
        try {
            c.selectPath("./*");
            while (c.toNextSelection()) {
                XmlObject o = c.getObject();
                if (o instanceof CTRegularTextRun) {
                    CTRegularTextRun txrun = (CTRegularTextRun) o;
                    text.append(txrun.getT());
                } else if (o instanceof CTTextLineBreak) {
                    text.append('\n');
                }
            }
        } finally {
            // Cursors hold resources in the underlying XML store and must
            // be released explicitly, even if traversal fails.
            c.dispose();
        }

        return text;
    }
}

+ 23
- 0
src/ooxml/java/org/apache/poi/xslf/usermodel/DrawingTable.java View File

@@ -0,0 +1,23 @@
package org.apache.poi.xslf.usermodel;

import org.openxmlformats.schemas.drawingml.x2006.main.CTTable;
import org.openxmlformats.schemas.drawingml.x2006.main.CTTableRow;

/**
 * Read-only wrapper around a DrawingML {@link CTTable} that exposes its rows
 * as {@link DrawingTableRow} objects.
 */
public class DrawingTable {
    private final CTTable table;

    /**
     * @param table the XML table bean to wrap
     */
    public DrawingTable(CTTable table) {
        this.table = table;
    }

    /**
     * Wraps every {@code tr} element of the table, preserving order.
     *
     * @return a new array with one wrapper per table row
     */
    public DrawingTableRow[] getRows() {
        CTTableRow[] xmlRows = table.getTrArray();
        DrawingTableRow[] rows = new DrawingTableRow[xmlRows.length];

        int idx = 0;
        for (CTTableRow xmlRow : xmlRows) {
            rows[idx++] = new DrawingTableRow(xmlRow);
        }

        return rows;
    }
}

+ 17
- 0
src/ooxml/java/org/apache/poi/xslf/usermodel/DrawingTableCell.java View File

@@ -0,0 +1,17 @@
package org.apache.poi.xslf.usermodel;

import org.openxmlformats.schemas.drawingml.x2006.main.CTTableCell;

/**
 * Read-only wrapper around a DrawingML {@link CTTableCell} that exposes the
 * cell's text body.
 */
public class DrawingTableCell {
    private final CTTableCell cell;
    private final DrawingTextBody drawingTextBody;

    /**
     * Wraps the cell and, eagerly, whatever {@code cell.getTxBody()} returns.
     *
     * @param cell the XML table cell bean to wrap
     */
    public DrawingTableCell(CTTableCell cell) {
        this.cell = cell;
        this.drawingTextBody = new DrawingTextBody(cell.getTxBody());
    }

    /**
     * @return the text body wrapper created when this cell was constructed
     */
    public DrawingTextBody getTextBody() {
        return this.drawingTextBody;
    }
}

+ 23
- 0
src/ooxml/java/org/apache/poi/xslf/usermodel/DrawingTableRow.java View File

@@ -0,0 +1,23 @@
package org.apache.poi.xslf.usermodel;

import org.openxmlformats.schemas.drawingml.x2006.main.CTTableRow;
import org.openxmlformats.schemas.drawingml.x2006.main.CTTableCell;

/**
 * Read-only wrapper around a DrawingML {@link CTTableRow} that exposes its
 * cells as {@link DrawingTableCell} objects.
 */
public class DrawingTableRow {
    private final CTTableRow row;

    /**
     * @param row the XML table row bean to wrap
     */
    public DrawingTableRow(CTTableRow row) {
        this.row = row;
    }

    /**
     * Wraps every {@code tc} element of the row, preserving order.
     *
     * @return a new array with one wrapper per table cell
     */
    public DrawingTableCell[] getCells() {
        CTTableCell[] xmlCells = row.getTcArray();
        DrawingTableCell[] cells = new DrawingTableCell[xmlCells.length];

        int idx = 0;
        for (CTTableCell xmlCell : xmlCells) {
            cells[idx++] = new DrawingTableCell(xmlCell);
        }

        return cells;
    }
}

+ 23
- 0
src/ooxml/java/org/apache/poi/xslf/usermodel/DrawingTextBody.java View File

@@ -0,0 +1,23 @@
package org.apache.poi.xslf.usermodel;

import org.openxmlformats.schemas.drawingml.x2006.main.CTTextBody;
import org.openxmlformats.schemas.drawingml.x2006.main.CTTextParagraph;

/**
 * Read-only wrapper around a DrawingML {@link CTTextBody} that exposes its
 * paragraphs as {@link DrawingParagraph} objects.
 */
public class DrawingTextBody {
    private final CTTextBody textBody;

    /**
     * @param textBody the XML text body bean to wrap; may be {@code null}
     *                 when the owning element has no text body, in which
     *                 case {@link #getParagraphs()} reports no paragraphs
     */
    public DrawingTextBody(CTTextBody textBody) {
        this.textBody = textBody;
    }

    /**
     * Wraps every {@code p} element of the text body, preserving order.
     *
     * @return a new array with one wrapper per paragraph; empty (never
     *         {@code null}) when there is no underlying text body
     */
    public DrawingParagraph[] getParagraphs() {
        // DrawingTableCell hands over cell.getTxBody() unchecked, and that
        // getter yields null for a cell without a txBody element. Treat a
        // missing body as "no paragraphs" instead of failing with an NPE.
        if (textBody == null) {
            return new DrawingParagraph[0];
        }

        CTTextParagraph[] pArray = textBody.getPArray();
        DrawingParagraph[] o = new DrawingParagraph[pArray.length];

        for (int i = 0; i < o.length; i++) {
            o[i] = new DrawingParagraph(pArray[i]);
        }

        return o;
    }
}

+ 67
- 0
src/ooxml/java/org/apache/poi/xslf/usermodel/XSLFCommonSlideData.java View File

@@ -0,0 +1,67 @@
package org.apache.poi.xslf.usermodel;

import org.apache.xmlbeans.XmlCursor;
import org.apache.xmlbeans.XmlObject;
import org.openxmlformats.schemas.drawingml.x2006.main.CTGraphicalObjectData;
import org.openxmlformats.schemas.drawingml.x2006.main.CTTable;
import org.openxmlformats.schemas.drawingml.x2006.main.CTTextBody;
import org.openxmlformats.schemas.presentationml.x2006.main.CTCommonSlideData;
import org.openxmlformats.schemas.presentationml.x2006.main.CTGraphicalObjectFrame;
import org.openxmlformats.schemas.presentationml.x2006.main.CTGroupShape;
import org.openxmlformats.schemas.presentationml.x2006.main.CTShape;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

/**
 * Read-only wrapper around a PresentationML {@link CTCommonSlideData} that
 * collects the text paragraphs found in its shape tree.
 */
public class XSLFCommonSlideData {
    private final CTCommonSlideData data;

    /**
     * @param data the XML common slide data bean to wrap
     */
    public XSLFCommonSlideData(CTCommonSlideData data) {
        this.data = data;
    }

    /**
     * Collects the paragraphs of the slide's shape tree.
     *
     * <p>The result holds, in this order: the paragraphs of every direct
     * {@code sp} shape that has a text body, followed by the paragraphs of
     * every cell of every table found as a direct child of the graphic data
     * of each direct graphic frame. Only direct children of the shape tree
     * are inspected.
     *
     * @return a new, mutable list of paragraphs; possibly empty
     */
    public List<DrawingParagraph> getText() {
        CTGroupShape gs = data.getSpTree();

        List<DrawingParagraph> out = new ArrayList<DrawingParagraph>();

        // Plain shapes: take the paragraphs of each text body, if present
        for (CTShape shape : gs.getSpArray()) {
            CTTextBody ctTextBody = shape.getTxBody();
            if (ctTextBody == null) {
                continue;
            }

            DrawingTextBody textBody = new DrawingTextBody(ctTextBody);
            out.addAll(Arrays.asList(textBody.getParagraphs()));
        }

        // Graphic frames: look for tables inside the frame's graphic data
        for (CTGraphicalObjectFrame frame : gs.getGraphicFrameArray()) {
            // Named graphicData so the 'data' field is not shadowed
            CTGraphicalObjectData graphicData = frame.getGraphic().getGraphicData();
            XmlCursor c = graphicData.newCursor();
            try {
                c.selectPath("./*");
                while (c.toNextSelection()) {
                    XmlObject o = c.getObject();
                    if (o instanceof CTTable) {
                        addTableText((CTTable) o, out);
                    }
                }
            } finally {
                // Cursors hold resources in the underlying XML store and
                // must be released explicitly, even if traversal fails.
                c.dispose();
            }
        }

        return out;
    }

    /** Appends the paragraphs of every cell of the table, row by row, to {@code out}. */
    private static void addTableText(CTTable ctTable, List<DrawingParagraph> out) {
        DrawingTable table = new DrawingTable(ctTable);

        for (DrawingTableRow row : table.getRows()) {
            for (DrawingTableCell cell : row.getCells()) {
                DrawingTextBody textBody = cell.getTextBody();
                out.addAll(Arrays.asList(textBody.getParagraphs()));
            }
        }
    }

}

+ 6
- 0
src/ooxml/java/org/apache/poi/xslf/usermodel/XSLFSlide.java View File

@@ -26,11 +26,13 @@ import org.openxmlformats.schemas.presentationml.x2006.main.CTSlideIdListEntry;
public class XSLFSlide extends XSLFSheet implements Slide {
private CTSlide slide;
private CTSlideIdListEntry slideId;
private XSLFCommonSlideData data;
/**
 * Creates a slide backed by the given XML beans.
 *
 * @param slide   the slide XML bean
 * @param slideId the slide's entry in the slide id list
 * @param parent  the owning slide show, handed to the superclass
 */
public XSLFSlide(CTSlide slide, CTSlideIdListEntry slideId, SlideShow parent) {
    super(parent);
    this.slideId = slideId;
    this.slide = slide;
    // Wrap the common slide data once so getCommonSlideData() can hand it out
    this.data = new XSLFCommonSlideData(slide.getCSld());
}
/**
@@ -88,4 +90,8 @@ public class XSLFSlide extends XSLFSheet implements Slide {
// TODO Auto-generated method stub

}

/**
 * @return the wrapper around this slide's common slide data that was
 *         created in the constructor
 */
public XSLFCommonSlideData getCommonSlideData() {
    return this.data;
}
}

+ 13
- 0
src/ooxml/testcases/org/apache/poi/xslf/extractor/TestXSLFPowerPointExtractor.java View File

@@ -113,4 +113,17 @@ public class TestXSLFPowerPointExtractor extends TestCase {
// Check comments are there
assertTrue("Unable to find expected word in text\n" + text, text.contains("testdoc"));
}

/**
 * Extracts the text of the {@code present1.pptx} sample and checks that the
 * word "TEST" is part of it (presumably held in a table of that sample,
 * going by the test name).
 */
public void testTable() throws Exception {
    POIDataSamples samples = POIDataSamples.getSlideShowInstance();
    OPCPackage pkg = OPCPackage.open(samples.openResourceAsStream("present1.pptx"));
    xmlA = new XSLFSlideShow(pkg);
    XSLFPowerPointExtractor extractor = new XSLFPowerPointExtractor(xmlA);

    String text = extractor.getText();
    assertTrue(text.length() > 0);

    // Check the expected word made it into the extracted text
    assertTrue("Unable to find expected word in text\n" + text, text.contains("TEST"));
}
}

BIN
test-data/slideshow/present1.pptx View File


Loading…
Cancel
Save