Browse Source

55347 - integrate textbox text extraction with Excel extractors

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1511789 13f79535-47bb-0310-9956-ffa450edef68
tags/REL_3_10_BETA2
Tim Allison 11 years ago
parent
commit
8f68884358

+ 33
- 0
src/ooxml/java/org/apache/poi/xssf/eventusermodel/XSSFReader.java View File

@@ -21,6 +21,8 @@ import java.io.InputStream;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;

import org.apache.poi.POIXMLException;
@@ -37,7 +39,9 @@ import org.apache.poi.xssf.model.CommentsTable;
import org.apache.poi.xssf.model.SharedStringsTable;
import org.apache.poi.xssf.model.StylesTable;
import org.apache.poi.xssf.model.ThemesTable;
import org.apache.poi.xssf.usermodel.XSSFDrawing;
import org.apache.poi.xssf.usermodel.XSSFRelation;
import org.apache.poi.xssf.usermodel.XSSFShape;
import org.apache.xmlbeans.XmlException;
import org.openxmlformats.schemas.spreadsheetml.x2006.main.CTSheet;
import org.openxmlformats.schemas.spreadsheetml.x2006.main.CTWorkbook;
@@ -273,6 +277,35 @@ public class XSSFReader {
return null;
}
/**
 * Returns the shapes found in all drawing parts related to the current sheet.
 *
 * @return the collected shapes (empty if the sheet has no drawing
 *         relationships), or <code>null</code> if a drawing part could not
 *         be located or parsed
 */
public List<XSSFShape> getShapes() {
    PackagePart sheetPkg = getSheetPart();
    List<XSSFShape> shapes = new ArrayList<XSSFShape>();
    try {
        // Walk every drawing relationship of the sheet part and gather the shapes of each drawing
        PackageRelationshipCollection drawingsList = sheetPkg.getRelationshipsByType(XSSFRelation.DRAWINGS.getRelation());
        for (int i = 0; i < drawingsList.size(); i++) {
            PackageRelationship drawings = drawingsList.getRelationship(i);
            PackagePartName drawingsName = PackagingURIHelper.createPartName(drawings.getTargetURI());
            PackagePart drawingsPart = sheetPkg.getPackage().getPart(drawingsName);
            XSSFDrawing drawing = new XSSFDrawing(drawingsPart, drawings);
            shapes.addAll(drawing.getShapes());
        }
    } catch (XmlException e) {
        // Documented contract: callers receive null when the drawing cannot be read
        return null;
    } catch (InvalidFormatException e) {
        return null;
    } catch (IOException e) {
        return null;
    }
    return shapes;
}
public PackagePart getSheetPart() {
String sheetId = ctSheet.getId();
return sheetMap.get(sheetId);

+ 29
- 1
src/ooxml/java/org/apache/poi/xssf/extractor/XSSFEventBasedExcelExtractor.java View File

@@ -18,6 +18,7 @@ package org.apache.poi.xssf.extractor;

import java.io.IOException;
import java.io.InputStream;
import java.util.List;
import java.util.Locale;

import javax.xml.parsers.ParserConfigurationException;
@@ -37,6 +38,8 @@ import org.apache.poi.xssf.eventusermodel.XSSFReader;
import org.apache.poi.xssf.eventusermodel.XSSFSheetXMLHandler;
import org.apache.poi.xssf.eventusermodel.XSSFSheetXMLHandler.SheetContentsHandler;
import org.apache.poi.xssf.model.StylesTable;
import org.apache.poi.xssf.usermodel.XSSFShape;
import org.apache.poi.xssf.usermodel.XSSFSimpleShape;
import org.apache.xmlbeans.XmlException;
import org.xml.sax.ContentHandler;
import org.xml.sax.InputSource;
@@ -54,6 +57,7 @@ public class XSSFEventBasedExcelExtractor extends POIXMLTextExtractor {
private Locale locale;
private boolean includeSheetNames = true;
private boolean formulasNotResults = false;
private boolean includeTextBoxes = true;

public XSSFEventBasedExcelExtractor(String path) throws XmlException, OpenXML4JException, IOException {
this(OPCPackage.open(path));
@@ -89,6 +93,14 @@ public class XSSFEventBasedExcelExtractor extends POIXMLTextExtractor {
public void setFormulasNotResults(boolean formulasNotResults) {
this.formulasNotResults = formulasNotResults;
}

/**
 * Controls whether the text of textboxes is added to the extracted text.
 * The extractor includes it unless this is switched off.
 *
 * @param includeTextBoxes <code>true</code> to include textbox text,
 *        <code>false</code> to leave it out
 */
public void setIncludeTextBoxes(boolean includeTextBoxes) {
    this.includeTextBoxes = includeTextBoxes;
}
public void setLocale(Locale locale) {
this.locale = locale;
@@ -175,6 +187,9 @@ public class XSSFEventBasedExcelExtractor extends POIXMLTextExtractor {
text.append('\n');
}
processSheet(sheetExtractor, styles, strings, stream);
if (includeTextBoxes){
processShapes(iter.getShapes(), text);
}
stream.close();
}
@@ -191,7 +206,20 @@ public class XSSFEventBasedExcelExtractor extends POIXMLTextExtractor {
}
}
@Override
/**
 * Appends the text of each simple shape to the output buffer, one shape per
 * line. Shapes of other types, and shapes with null or empty text, are
 * skipped. A <code>null</code> shape list is silently ignored.
 *
 * @param shapes the shapes to read text from, may be <code>null</code>
 * @param text the buffer that receives the shape text
 */
private void processShapes(List<XSSFShape> shapes, StringBuffer text) {
    if (shapes != null) {
        for (XSSFShape candidate : shapes) {
            if (!(candidate instanceof XSSFSimpleShape)) {
                continue;
            }
            String content = ((XSSFSimpleShape) candidate).getText();
            if (content == null || content.length() == 0) {
                continue;
            }
            text.append(content);
            text.append('\n');
        }
    }
}
@Override
public void close() throws IOException {
if (container != null) {
container.close();

+ 25
- 1
src/ooxml/java/org/apache/poi/xssf/extractor/XSSFExcelExtractor.java View File

@@ -31,8 +31,11 @@ import org.apache.poi.ss.usermodel.DataFormatter;
import org.apache.poi.ss.usermodel.HeaderFooter;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.xssf.usermodel.XSSFCell;
import org.apache.poi.xssf.usermodel.XSSFDrawing;
import org.apache.poi.xssf.usermodel.XSSFRelation;
import org.apache.poi.xssf.usermodel.XSSFShape;
import org.apache.poi.xssf.usermodel.XSSFSheet;
import org.apache.poi.xssf.usermodel.XSSFSimpleShape;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.apache.xmlbeans.XmlException;

@@ -52,6 +55,7 @@ public class XSSFExcelExtractor extends POIXMLTextExtractor implements org.apach
private boolean formulasNotResults = false;
private boolean includeCellComments = false;
private boolean includeHeadersFooters = true;
private boolean includeTextBoxes = true;

/**
* @deprecated Use {@link #XSSFExcelExtractor(org.apache.poi.openxml4j.opc.OPCPackage)} instead.
@@ -103,6 +107,13 @@ public class XSSFExcelExtractor extends POIXMLTextExtractor implements org.apach
public void setIncludeHeadersFooters(boolean includeHeadersFooters) {
this.includeHeadersFooters = includeHeadersFooters;
}
/**
 * Controls whether text found inside textboxes is added to the extracted
 * text. The extractor includes it unless this is switched off.
 *
 * @param includeTextBoxes <code>true</code> to include textbox text,
 *        <code>false</code> to leave it out
 */
public void setIncludeTextBoxes(boolean includeTextBoxes) {
    this.includeTextBoxes = includeTextBoxes;
}
/**
* What Locale should be used for formatting numbers (based
* on the styles applied to the cells)
@@ -180,7 +191,20 @@ public class XSSFExcelExtractor extends POIXMLTextExtractor implements org.apach
}
text.append("\n");
}

// add textboxes
if (includeTextBoxes){
XSSFDrawing drawing = sheet.createDrawingPatriarch();
for (XSSFShape shape : drawing.getShapes()){
if (shape instanceof XSSFSimpleShape){
String boxText = ((XSSFSimpleShape)shape).getText();
if (boxText.length() > 0){
text.append(boxText);
text.append('\n');
}
}
}
}
// Finally footer(s), if present
if(includeHeadersFooters) {
text.append(

+ 1
- 1
src/ooxml/java/org/apache/poi/xssf/usermodel/XSSFDrawing.java View File

@@ -76,7 +76,7 @@ public final class XSSFDrawing extends POIXMLDocumentPart implements Drawing {
* @param rel the package relationship holding this drawing,
* the relationship type must be http://schemas.openxmlformats.org/officeDocument/2006/relationships/drawing
*/
protected XSSFDrawing(PackagePart part, PackageRelationship rel) throws IOException, XmlException {
public XSSFDrawing(PackagePart part, PackageRelationship rel) throws IOException, XmlException {
super(part, rel);
XmlOptions options = new XmlOptions(DEFAULT_XML_OPTIONS);
//Removing root element

+ 32
- 0
src/ooxml/testcases/org/apache/poi/xssf/eventusermodel/TestXSSFReader.java View File

@@ -19,6 +19,7 @@ package org.apache.poi.xssf.eventusermodel;

import java.io.InputStream;
import java.util.Iterator;
import java.util.List;

import junit.framework.TestCase;

@@ -27,6 +28,8 @@ import org.apache.poi.util.IOUtils;
import org.apache.poi.xssf.XSSFTestDataSamples;
import org.apache.poi.xssf.model.CommentsTable;
import org.apache.poi.xssf.usermodel.XSSFRichTextString;
import org.apache.poi.xssf.usermodel.XSSFShape;
import org.apache.poi.xssf.usermodel.XSSFSimpleShape;
import org.apache.poi.POIDataSamples;

/**
@@ -164,4 +167,33 @@ public final class TestXSSFReader extends TestCase {
stream.close();
}
}
/**
 * Test text extraction from text box using getShapes()
 *
 * @throws Exception if the sample package cannot be opened or read
 */
public void testShapes() throws Exception {
    OPCPackage pkg = XSSFTestDataSamples.openSamplePackage("WithTextBox.xlsx");
    XSSFReader r = new XSSFReader(pkg);
    XSSFReader.SheetIterator it = (XSSFReader.SheetIterator) r.getSheetsData();

    StringBuilder sb = new StringBuilder();
    while (it.hasNext()) {
        // next() hands back the sheet's stream; release it once the sheet's shapes are read
        InputStream stream = it.next();
        try {
            List<XSSFShape> shapes = it.getShapes();
            if (shapes != null) {
                for (XSSFShape shape : shapes) {
                    if (shape instanceof XSSFSimpleShape) {
                        String t = ((XSSFSimpleShape) shape).getText();
                        sb.append(t).append('\n');
                    }
                }
            }
        } finally {
            stream.close();
        }
    }

    String text = sb.toString();
    assertTrue(text.indexOf("Line 1") > -1);
    assertTrue(text.indexOf("Line 2") > -1);
    assertTrue(text.indexOf("Line 3") > -1);
}
}

+ 20
- 0
src/ooxml/testcases/org/apache/poi/xssf/extractor/TestXSSFEventBasedExcelExtractor.java View File

@@ -17,6 +17,7 @@

package org.apache.poi.xssf.extractor;

import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

@@ -25,7 +26,11 @@ import junit.framework.TestCase;
import org.apache.poi.POITextExtractor;
import org.apache.poi.hssf.HSSFTestDataSamples;
import org.apache.poi.hssf.extractor.ExcelExtractor;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.xssf.XSSFTestDataSamples;
import org.apache.poi.xssf.eventusermodel.XSSFReader;
import org.apache.poi.xssf.usermodel.XSSFShape;
import org.apache.poi.xssf.usermodel.XSSFSimpleShape;

/**
* Tests for {@link XSSFEventBasedExcelExtractor}
@@ -167,4 +172,19 @@ public class TestXSSFEventBasedExcelExtractor extends TestCase {
ole2Extractor.close();
ooxmlExtractor.close();
}
/**
 * Test that text box text is part of the text returned by the
 * event based extractor
 *
 * @throws Exception if the sample file cannot be opened or read
 */
public void testShapes() throws Exception {
    XSSFEventBasedExcelExtractor ooxmlExtractor = getExtractor("WithTextBox.xlsx");
    try {
        String text = ooxmlExtractor.getText();

        assertTrue(text.indexOf("Line 1") > -1);
        assertTrue(text.indexOf("Line 2") > -1);
        assertTrue(text.indexOf("Line 3") > -1);
    } finally {
        // Release the extractor even when an assertion fails, as the other tests here do
        ooxmlExtractor.close();
    }
}
}

+ 12
- 0
src/ooxml/testcases/org/apache/poi/xssf/extractor/TestXSSFExcelExtractor.java View File

@@ -211,4 +211,16 @@ public class TestXSSFExcelExtractor extends TestCase {
extractor.close();
}
/**
 * Simple test for text box text
 *
 * @throws IOException if the extractor cannot be closed
 */
public void testTextBoxes() throws IOException {
    XSSFExcelExtractor extractor = getExtractor("WithTextBox.xlsx");
    try {
        extractor.setFormulasNotResults(true);
        String text = extractor.getText();
        assertTrue(text.indexOf("Line 1") > -1);
        assertTrue(text.indexOf("Line 2") > -1);
        assertTrue(text.indexOf("Line 3") > -1);
    } finally {
        // Release the extractor even when an assertion fails, as the other tests here do
        extractor.close();
    }
}
}

Loading…
Cancel
Save