import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
+import java.util.LinkedList;
+import java.util.List;
import java.util.Map;
import org.apache.poi.POIXMLException;
import org.apache.poi.xssf.model.SharedStringsTable;
import org.apache.poi.xssf.model.StylesTable;
import org.apache.poi.xssf.model.ThemesTable;
+import org.apache.poi.xssf.usermodel.XSSFDrawing;
import org.apache.poi.xssf.usermodel.XSSFRelation;
+import org.apache.poi.xssf.usermodel.XSSFShape;
import org.apache.xmlbeans.XmlException;
import org.openxmlformats.schemas.spreadsheetml.x2006.main.CTSheet;
import org.openxmlformats.schemas.spreadsheetml.x2006.main.CTWorkbook;
return null;
}
+ /**
+ * Returns the shapes associated with this sheet,
+ * an empty list or null if there is an exception
+ */
+ public List<XSSFShape> getShapes() {
+ PackagePart sheetPkg = getSheetPart();
+ List<XSSFShape> shapes= new LinkedList<XSSFShape>();
+ // Do we have a comments relationship? (Only ever one if so)
+ try {
+ PackageRelationshipCollection drawingsList = sheetPkg.getRelationshipsByType(XSSFRelation.DRAWINGS.getRelation());
+ for (int i = 0; i < drawingsList.size(); i++){
+ PackageRelationship drawings = drawingsList.getRelationship(i);
+ PackagePartName drawingsName = PackagingURIHelper.createPartName(drawings.getTargetURI());
+ PackagePart drawingsPart = sheetPkg.getPackage().getPart(drawingsName);
+ XSSFDrawing drawing = new XSSFDrawing(drawingsPart, drawings);
+ for (XSSFShape shape : drawing.getShapes()){
+ shapes.add(shape);
+ }
+ }
+ } catch (XmlException e){
+ return null;
+ } catch (InvalidFormatException e) {
+ return null;
+ } catch (IOException e) {
+ return null;
+ }
+ return shapes;
+ }
+
public PackagePart getSheetPart() {
String sheetId = ctSheet.getId();
return sheetMap.get(sheetId);
import java.io.IOException;
import java.io.InputStream;
+import java.util.List;
import java.util.Locale;
import javax.xml.parsers.ParserConfigurationException;
import org.apache.poi.xssf.eventusermodel.XSSFSheetXMLHandler;
import org.apache.poi.xssf.eventusermodel.XSSFSheetXMLHandler.SheetContentsHandler;
import org.apache.poi.xssf.model.StylesTable;
+import org.apache.poi.xssf.usermodel.XSSFShape;
+import org.apache.poi.xssf.usermodel.XSSFSimpleShape;
import org.apache.xmlbeans.XmlException;
import org.xml.sax.ContentHandler;
import org.xml.sax.InputSource;
private Locale locale;
private boolean includeSheetNames = true;
private boolean formulasNotResults = false;
+ private boolean includeTextBoxes = true;
public XSSFEventBasedExcelExtractor(String path) throws XmlException, OpenXML4JException, IOException {
this(OPCPackage.open(path));
public void setFormulasNotResults(boolean formulasNotResults) {
this.formulasNotResults = formulasNotResults;
}
+
+ /**
+ * Should text from textboxes be included? Default is true.
+ * When enabled, the shapes of each sheet are handed to the
+ * shape processing step once the sheet itself has been processed.
+ * @param includeTextBoxes <code>true</code> to include textbox text,
+ *  <code>false</code> to leave it out
+ */
+
+ public void setIncludeTextBoxes(boolean includeTextBoxes) {
+ this.includeTextBoxes = includeTextBoxes;
+ }
public void setLocale(Locale locale) {
this.locale = locale;
text.append('\n');
}
processSheet(sheetExtractor, styles, strings, stream);
+ if (includeTextBoxes){
+ processShapes(iter.getShapes(), text);
+ }
stream.close();
}
}
}
- @Override
+ /**
+  * Appends the text of every {@link XSSFSimpleShape} in the given list to
+  * the buffer, each followed by a newline. Shapes of other types, and
+  * shapes whose text is <code>null</code> or empty, are skipped.
+  *
+  * @param shapes the shapes to read; nothing is appended when this is <code>null</code>
+  * @param text the buffer receiving the shape text
+  */
+ private void processShapes(List<XSSFShape> shapes, StringBuffer text) {
+     if (shapes != null) {
+         for (XSSFShape candidate : shapes) {
+             if (!(candidate instanceof XSSFSimpleShape)) {
+                 continue;
+             }
+             String content = ((XSSFSimpleShape) candidate).getText();
+             if (content == null || content.length() == 0) {
+                 continue;
+             }
+             text.append(content);
+             text.append('\n');
+         }
+     }
+ }
+ @Override
public void close() throws IOException {
if (container != null) {
container.close();
import org.apache.poi.ss.usermodel.HeaderFooter;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.xssf.usermodel.XSSFCell;
+import org.apache.poi.xssf.usermodel.XSSFDrawing;
import org.apache.poi.xssf.usermodel.XSSFRelation;
+import org.apache.poi.xssf.usermodel.XSSFShape;
import org.apache.poi.xssf.usermodel.XSSFSheet;
+import org.apache.poi.xssf.usermodel.XSSFSimpleShape;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.apache.xmlbeans.XmlException;
private boolean formulasNotResults = false;
private boolean includeCellComments = false;
private boolean includeHeadersFooters = true;
+ private boolean includeTextBoxes = true;
/**
* @deprecated Use {@link #XSSFExcelExtractor(org.apache.poi.openxml4j.opc.OPCPackage)} instead.
public void setIncludeHeadersFooters(boolean includeHeadersFooters) {
this.includeHeadersFooters = includeHeadersFooters;
}
+ /**
+ * Should text within textboxes be included? Default is true
+ * @param includeTextBoxes <code>true</code> to add the text of each
+ *  sheet's simple shapes to the extracted text, <code>false</code>
+ *  to skip the shapes entirely
+ */
+ public void setIncludeTextBoxes(boolean includeTextBoxes){
+ this.includeTextBoxes = includeTextBoxes;
+ }
/**
* What Locale should be used for formatting numbers (based
* on the styles applied to the cells)
}
text.append("\n");
}
-
+
+ // add textboxes
+ if (includeTextBoxes){
+     // NOTE(review): createDrawingPatriarch() presumably creates a drawing when the
+     // sheet has none - confirm that text extraction does not modify the workbook
+     XSSFDrawing drawing = sheet.createDrawingPatriarch();
+     for (XSSFShape shape : drawing.getShapes()){
+         if (shape instanceof XSSFSimpleShape){
+             String boxText = ((XSSFSimpleShape)shape).getText();
+             // Guard against shapes that carry no text at all before measuring it
+             if (boxText != null && boxText.length() > 0){
+                 text.append(boxText);
+                 text.append('\n');
+             }
+         }
+     }
+ }
// Finally footer(s), if present
if(includeHeadersFooters) {
text.append(
* @param rel the package relationship holding this drawing,
* the relationship type must be http://schemas.openxmlformats.org/officeDocument/2006/relationships/drawing
*/
- protected XSSFDrawing(PackagePart part, PackageRelationship rel) throws IOException, XmlException {
+ public XSSFDrawing(PackagePart part, PackageRelationship rel) throws IOException, XmlException {
super(part, rel);
XmlOptions options = new XmlOptions(DEFAULT_XML_OPTIONS);
//Removing root element
import java.io.InputStream;
import java.util.Iterator;
+import java.util.List;
import junit.framework.TestCase;
import org.apache.poi.xssf.XSSFTestDataSamples;
import org.apache.poi.xssf.model.CommentsTable;
import org.apache.poi.xssf.usermodel.XSSFRichTextString;
+import org.apache.poi.xssf.usermodel.XSSFShape;
+import org.apache.poi.xssf.usermodel.XSSFSimpleShape;
import org.apache.poi.POIDataSamples;
/**
stream.close();
}
}
+ /**
+  * Checks that the text of a text box can be read from the shapes
+  * returned by the sheet iterator's getShapes()
+  * @throws Exception if the sample package cannot be opened or read
+  */
+ public void testShapes() throws Exception {
+     OPCPackage pkg = XSSFTestDataSamples.openSamplePackage("WithTextBox.xlsx");
+     XSSFReader reader = new XSSFReader(pkg);
+     XSSFReader.SheetIterator sheets = (XSSFReader.SheetIterator) reader.getSheetsData();
+
+     // Gather the text of all simple shapes, one shape per line
+     StringBuilder collected = new StringBuilder();
+     while (sheets.hasNext()) {
+         sheets.next();
+         List<XSSFShape> sheetShapes = sheets.getShapes();
+         if (sheetShapes == null) {
+             continue;
+         }
+         for (XSSFShape candidate : sheetShapes) {
+             if (candidate instanceof XSSFSimpleShape) {
+                 collected.append(((XSSFSimpleShape) candidate).getText()).append('\n');
+             }
+         }
+     }
+
+     String text = collected.toString();
+     assertTrue(text.contains("Line 1"));
+     assertTrue(text.contains("Line 2"));
+     assertTrue(text.contains("Line 3"));
+ }
}
package org.apache.poi.xssf.extractor;
+import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.poi.POITextExtractor;
import org.apache.poi.hssf.HSSFTestDataSamples;
import org.apache.poi.hssf.extractor.ExcelExtractor;
+import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.xssf.XSSFTestDataSamples;
+import org.apache.poi.xssf.eventusermodel.XSSFReader;
+import org.apache.poi.xssf.usermodel.XSSFShape;
+import org.apache.poi.xssf.usermodel.XSSFSimpleShape;
/**
* Tests for {@link XSSFEventBasedExcelExtractor}
ole2Extractor.close();
ooxmlExtractor.close();
}
+
+ /**
+  * Test text extraction from text boxes through the event based extractor
+  * @throws Exception if the sample file cannot be opened or read
+  */
+ public void testShapes() throws Exception{
+     XSSFEventBasedExcelExtractor ooxmlExtractor = getExtractor("WithTextBox.xlsx");
+     try {
+         String text = ooxmlExtractor.getText();
+
+         assertTrue(text.indexOf("Line 1") > -1);
+         assertTrue(text.indexOf("Line 2") > -1);
+         assertTrue(text.indexOf("Line 3") > -1);
+     } finally {
+         // Close the extractor even when an assertion fails
+         ooxmlExtractor.close();
+     }
+ }
}
extractor.close();
}
+ /**
+  * Simple test for text box text
+  * @throws IOException if the sample file cannot be read or the extractor cannot be closed
+  */
+ public void testTextBoxes() throws IOException {
+     XSSFExcelExtractor extractor = getExtractor("WithTextBox.xlsx");
+     try {
+         extractor.setFormulasNotResults(true);
+         String text = extractor.getText();
+         assertTrue(text.indexOf("Line 1") > -1);
+         assertTrue(text.indexOf("Line 2") > -1);
+         assertTrue(text.indexOf("Line 3") > -1);
+     } finally {
+         // Close the extractor even when an assertion fails
+         extractor.close();
+     }
+ }
}