<document>
<header>
- <title>Busy Developers' Guide to HSSF Features</title>
+ <title>Busy Developers' Guide to HSSF and XSSF Features</title>
<authors>
<person email="user@poi.apache.org" name="Glen Stampoultzis" id="CO"/>
<person email="user@poi.apache.org" name="Yegor Kozlov" id="YK"/>
<body>
<section><title>Busy Developers' Guide to Features</title>
<p>
- Want to use HSSF read and write spreadsheets in a hurry? This guide is for you. If you're after
- more in-depth coverage of the HSSF user-API please consult the <link href="how-to.html">HOWTO</link>
+ Want to use HSSF and XSSF to read and write spreadsheets in a hurry? This
+ guide is for you. If you're after more in-depth coverage of the HSSF and
+ XSSF user-APIs, please consult the <link href="how-to.html">HOWTO</link>
guide as it contains actual descriptions of how to use this stuff.
</p>
<section><title>Index of Features</title>
protected POIXMLDocument() {}
+ /**
+ * Creates a document on top of an already opened package and
+ * looks up the part referenced by its core document relationship.
+ *
+ * @param pkg the opened package to read from
+ * @throws IOException if the package raises an OpenXML4JException
+ * while the core relationship or core part is fetched; the original
+ * exception is attached as the cause
+ */
+ protected POIXMLDocument(Package pkg) throws IOException {
+ try {
+ this.pkg = pkg;
+
+ PackageRelationship coreDocRelationship = this.pkg.getRelationshipsByType(
+ PackageRelationshipTypes.CORE_DOCUMENT).getRelationship(0);
+
+ // Get core part
+ this.corePart = this.pkg.getPart(coreDocRelationship);
+ } catch (OpenXML4JException e) {
+ // Keep the original exception as the cause so that its
+ // stack trace is not lost when it is converted
+ IOException ioe = new IOException(e.toString());
+ ioe.initCause(e);
+ throw ioe;
+ }
+ }
+ /**
+ * Creates a document from the package stored at the given path.
+ * The package is opened with openPackage and then handed to the
+ * Package based constructor.
+ */
protected POIXMLDocument(String path) throws IOException {
+ this(openPackage(path));
+ }
+
+ /**
+ * Opens the package at the given path, throwing an IOException
+ * in the event of a problem.
+ * Exists as a static helper because Java does not allow a
+ * try/catch block around a this() constructor call.
+ *
+ * @param path location of the package to open
+ * @return the opened package
+ * @throws IOException if Package.open fails with an InvalidFormatException
+ */
+ protected static Package openPackage(String path) throws IOException {
try {
- this.pkg = Package.open(path);
- PackageRelationship coreDocRelationship = this.pkg.getRelationshipsByType(
- PackageRelationshipTypes.CORE_DOCUMENT).getRelationship(0);
-
- // Get core part
- this.corePart = this.pkg.getPart(coreDocRelationship);
+ return Package.open(path);
} catch (InvalidFormatException e) {
throw new IOException(e.toString());
- } catch (OpenXML4JException e) {
- throw new IOException(e.toString());
}
}
import java.io.IOException;
import org.apache.poi.POIXMLTextExtractor;
+import org.apache.poi.ss.usermodel.Cell;
+import org.apache.poi.ss.usermodel.Row;
+import org.apache.poi.ss.usermodel.Sheet;
+import org.apache.poi.ss.usermodel.Workbook;
+import org.apache.poi.xssf.usermodel.XSSFSheet;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.apache.xmlbeans.XmlException;
import org.openxml4j.exceptions.OpenXML4JException;
* Helper class to extract text from an OOXML Excel file
*/
public class XSSFExcelExtractor extends POIXMLTextExtractor {
- private XSSFWorkbook workbook;
+ private Workbook workbook;
private boolean includeSheetNames = true;
private boolean formulasNotResults = false;
+ /**
+ * Creates an extractor for the workbook stored at the given path.
+ * A new XSSFWorkbook is built from the path and passed on to the
+ * workbook based constructor.
+ */
+ public XSSFExcelExtractor(String path) throws XmlException, OpenXML4JException, IOException {
+ this(new XSSFWorkbook(path));
+ }
+ /**
+ * Creates an extractor for the workbook held in the given package.
+ * A new XSSFWorkbook is built from the package and passed on to the
+ * workbook based constructor.
+ */
public XSSFExcelExtractor(Package container) throws XmlException, OpenXML4JException, IOException {
this(new XSSFWorkbook(container));
}
System.exit(1);
}
POIXMLTextExtractor extractor =
- new HXFExcelExtractor(HXFDocument.openPackage(
- new File(args[0])
- ));
+ new XSSFExcelExtractor(args[0]);
System.out.println(extractor.getText());
}
public String getText() {
StringBuffer text = new StringBuffer();
- CTSheet[] sheetRefs =
- workbook._getHSSFXML().getSheetReferences().getSheetArray();
- for(int i=0; i<sheetRefs.length; i++) {
- try {
- CTWorksheet sheet =
- workbook._getHSSFXML().getSheet(sheetRefs[i]);
- CTRow[] rows =
- sheet.getSheetData().getRowArray();
-
- if(i > 0) {
- text.append("\n");
- }
- if(includeSheetNames) {
- text.append(sheetRefs[i].getName() + "\n");
- }
-
- for(int j=0; j<rows.length; j++) {
- CTCell[] cells = rows[j].getCArray();
- for(int k=0; k<cells.length; k++) {
- CTCell cell = cells[k];
- if(k > 0) {
- text.append("\t");
- }
-
- boolean done = false;
-
- // Is it a formula one?
- if(cell.getF() != null) {
- if(formulasNotResults) {
- text.append(cell.getF().getStringValue());
- done = true;
- }
- }
- if(!done) {
- HSSFXMLCell uCell = new HSSFXMLCell(cell, workbook);
- text.append(uCell.getStringValue());
- }
+ for(int i=0; i<workbook.getNumberOfSheets(); i++) {
+ Sheet sheet = workbook.getSheetAt(i);
+
+ // Separate consecutive sheets with a blank line, as the
+ // replaced implementation did and as the unit tests expect
+ if(i > 0) {
+ text.append("\n");
+ }
+ if(includeSheetNames) {
+ text.append(workbook.getSheetName(i) + "\n");
+ }
+
+ for (Object rawR : sheet) {
+ Row row = (Row)rawR;
+ boolean firstCell = true;
+ for (Object rawC: row) {
+ Cell cell = (Cell)rawC;
+
+ // Cells of a row are tab separated, with no
+ // separator after the last cell of the row
+ if(!firstCell) {
+ text.append("\t");
+ }
+ firstCell = false;
+
+ // Is it a formula one?
+ if(cell.getCellType() == Cell.CELL_TYPE_FORMULA && formulasNotResults) {
+ text.append(cell.getCellFormula());
+ } else {
+ text.append(cell.toString());
}
- text.append("\n");
}
- } catch(Exception e) {
- throw new RuntimeException(e);
+ text.append("\n");
}
}
}
public XSSFWorkbook(String path) throws IOException {
- super(path);
+ // Open the package first, then delegate to the Package based constructor
+ this(openPackage(path));
+ }
+ public XSSFWorkbook(Package pkg) throws IOException {
+ super(pkg);
try {
WorkbookDocument doc = WorkbookDocument.Factory.parse(getCorePart().getInputStream());
this.workbook = doc.getWorkbook();
--- /dev/null
+/* ====================================================================
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+==================================================================== */
+package org.apache.poi.xssf.extractor;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import junit.framework.TestCase;
+
+import org.apache.poi.POITextExtractor;
+import org.apache.poi.hssf.extractor.ExcelExtractor;
+import org.apache.poi.hssf.usermodel.HSSFWorkbook;
+import org.apache.poi.xssf.usermodel.XSSFWorkbook;
+
+/**
+ * Tests for XSSFExcelExtractor
+ */
+public class TestXSSFExcelExtractor extends TestCase {
+ /**
+ * Matches text in which the number 13 (optionally written with a
+ * decimal part made of zeros) is followed by whitespace and "Sheet3".
+ * Compiled once here instead of on every loop iteration.
+ */
+ private static final Pattern THIRTEEN_THEN_SHEET3 =
+ Pattern.compile(".*13(\\.0+)?\\s+Sheet3.*", Pattern.DOTALL);
+
+ /**
+ * A very simple file
+ */
+ private XSSFWorkbook xmlA;
+ private File fileA;
+ /**
+ * A fairly complex file
+ */
+ private XSSFWorkbook xmlB;
+
+ /**
+ * A fairly simple file - ooxml
+ */
+ private XSSFWorkbook simpleXLSX;
+ /**
+ * A fairly simple file - ole2
+ */
+ private HSSFWorkbook simpleXLS;
+
+ /**
+ * Loads the sample workbooks from the directory named by the
+ * HSSF.testdata.path system property.
+ */
+ protected void setUp() throws Exception {
+ super.setUp();
+
+ String dataDir = System.getProperty("HSSF.testdata.path");
+
+ fileA = new File(dataDir + File.separator + "sample.xlsx");
+ File fileB = new File(dataDir + File.separator + "AverageTaxRates.xlsx");
+
+ File fileSOOXML = new File(dataDir + File.separator + "SampleSS.xlsx");
+ File fileSOLE2 = new File(dataDir + File.separator + "SampleSS.xls");
+
+ xmlA = new XSSFWorkbook(fileA.toString());
+ xmlB = new XSSFWorkbook(fileB.toString());
+
+ simpleXLSX = new XSSFWorkbook(fileSOOXML.toString());
+
+ // Close the stream once the workbook has been built, so that
+ // each test does not leak an open file handle
+ FileInputStream ole2Stream = new FileInputStream(fileSOLE2);
+ try {
+ simpleXLS = new HSSFWorkbook(ole2Stream);
+ } finally {
+ ole2Stream.close();
+ }
+ }
+
+ /**
+ * Get text out of the simple file
+ */
+ public void testGetSimpleText() throws Exception {
+ // Both constructors should accept the simple file
+ new XSSFExcelExtractor(fileA.toString());
+ new XSSFExcelExtractor(xmlA);
+
+ XSSFExcelExtractor extractor =
+ new XSSFExcelExtractor(xmlA);
+ extractor.getText();
+
+ String text = extractor.getText();
+ assertTrue(text.length() > 0);
+
+ // Check sheet names
+ assertTrue(text.startsWith("Sheet1"));
+ assertTrue(text.endsWith("Sheet3\n"));
+
+ // Now without, will have text
+ extractor.setIncludeSheetNames(false);
+ text = extractor.getText();
+ assertEquals(
+ "Lorem\t111\n" +
+ "ipsum\t222\n" +
+ "dolor\t333\n" +
+ "sit\t444\n" +
+ "amet\t555\n" +
+ "consectetuer\t666\n" +
+ "adipiscing\t777\n" +
+ "elit\t888\n" +
+ "Nunc\t999\n" +
+ "at\t4995\n" +
+ "\n\n", text);
+
+ // Now get formulas not their values
+ extractor.setFormulasNotResults(true);
+ text = extractor.getText();
+ assertEquals(
+ "Lorem\t111\n" +
+ "ipsum\t222\n" +
+ "dolor\t333\n" +
+ "sit\t444\n" +
+ "amet\t555\n" +
+ "consectetuer\t666\n" +
+ "adipiscing\t777\n" +
+ "elit\t888\n" +
+ "Nunc\t999\n" +
+ "at\tSUM(B1:B9)\n" +
+ "\n\n", text);
+
+ // With sheet names too
+ extractor.setIncludeSheetNames(true);
+ text = extractor.getText();
+ assertEquals(
+ "Sheet1\n" +
+ "Lorem\t111\n" +
+ "ipsum\t222\n" +
+ "dolor\t333\n" +
+ "sit\t444\n" +
+ "amet\t555\n" +
+ "consectetuer\t666\n" +
+ "adipiscing\t777\n" +
+ "elit\t888\n" +
+ "Nunc\t999\n" +
+ "at\tSUM(B1:B9)\n\n" +
+ "Sheet2\n\n" +
+ "Sheet3\n"
+ , text);
+ }
+
+ /**
+ * Get text out of the more complex file, checking only how
+ * the extracted text begins
+ */
+ public void testGetComplexText() throws Exception {
+ new XSSFExcelExtractor(xmlB);
+
+ XSSFExcelExtractor extractor =
+ new XSSFExcelExtractor(xmlB);
+ extractor.getText();
+
+ String text = extractor.getText();
+ assertTrue(text.length() > 0);
+
+ // Might not have all formatting it should do!
+ // TODO decide if we should really have the "null" in there
+ assertTrue(text.startsWith(
+ "Avgtxfull\n" +
+ "null\t(iii) AVERAGE TAX RATES ON ANNUAL"
+ ));
+ }
+
+ /**
+ * Test that we return pretty much the same as
+ * ExcelExtractor does, when we're both passed
+ * the same file, just saved as xls and xlsx
+ */
+ public void testComparedToOLE2() throws Exception {
+ XSSFExcelExtractor ooxmlExtractor =
+ new XSSFExcelExtractor(simpleXLSX);
+ ExcelExtractor ole2Extractor =
+ new ExcelExtractor(simpleXLS);
+
+ POITextExtractor[] extractors =
+ new POITextExtractor[] { ooxmlExtractor, ole2Extractor };
+ for (int i = 0; i < extractors.length; i++) {
+ POITextExtractor extractor = extractors[i];
+
+ // Carriage returns and tabs are dropped before comparing
+ String text = extractor.getText().replaceAll("[\r\t]", "");
+ //System.out.println(text.length());
+ //System.out.println(text);
+ assertTrue(text.startsWith("First Sheet\nTest spreadsheet\n2nd row2nd row 2nd column\n"));
+ Matcher m = THIRTEEN_THEN_SHEET3.matcher(text);
+ assertTrue(m.matches());
+ }
+ }
+}
+++ /dev/null
-/* ====================================================================
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-==================================================================== */
-package org.apache.poi.hssf.extractor;
-
-import java.io.File;
-import java.io.FileInputStream;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
-
-import junit.framework.TestCase;
-
-import org.apache.poi.POITextExtractor;
-import org.apache.poi.hssf.HSSFXML;
-import org.apache.poi.hssf.usermodel.HSSFWorkbook;
-import org.apache.poi.hssf.usermodel.HSSFXMLWorkbook;
-import org.apache.poi.hxf.HXFDocument;
-
-/**
- * Tests for HXFExcelExtractor
- */
-public class TestHXFExcelExtractor extends TestCase {
- /**
- * A very simple file
- */
- private HSSFXML xmlA;
- /**
- * A fairly complex file
- */
- private HSSFXML xmlB;
-
- /**
- * A fairly simple file - ooxml
- */
- private HSSFXML simpleXLSX;
- /**
- * A fairly simple file - ole2
- */
- private HSSFWorkbook simpleXLS;
-
- protected void setUp() throws Exception {
- super.setUp();
-
- File fileA = new File(
- System.getProperty("HSSF.testdata.path") +
- File.separator + "sample.xlsx"
- );
- File fileB = new File(
- System.getProperty("HSSF.testdata.path") +
- File.separator + "AverageTaxRates.xlsx"
- );
-
- File fileSOOXML = new File(
- System.getProperty("HSSF.testdata.path") +
- File.separator + "SampleSS.xlsx"
- );
- File fileSOLE2 = new File(
- System.getProperty("HSSF.testdata.path") +
- File.separator + "SampleSS.xls"
- );
-
- xmlA = new HSSFXML(HXFDocument.openPackage(fileA));
- xmlB = new HSSFXML(HXFDocument.openPackage(fileB));
-
- simpleXLSX = new HSSFXML(HXFDocument.openPackage(fileSOOXML));
- simpleXLS = new HSSFWorkbook(new FileInputStream(fileSOLE2));
- }
-
- /**
- * Get text out of the simple file
- */
- public void testGetSimpleText() throws Exception {
- new HXFExcelExtractor(xmlA.getPackage());
- new HXFExcelExtractor(new HSSFXMLWorkbook(xmlA));
-
- HXFExcelExtractor extractor =
- new HXFExcelExtractor(xmlA.getPackage());
- extractor.getText();
-
- String text = extractor.getText();
- assertTrue(text.length() > 0);
-
- // Check sheet names
- assertTrue(text.startsWith("Sheet1"));
- assertTrue(text.endsWith("Sheet3\n"));
-
- // Now without, will have text
- extractor.setIncludeSheetNames(false);
- text = extractor.getText();
- assertEquals(
- "Lorem\t111\n" +
- "ipsum\t222\n" +
- "dolor\t333\n" +
- "sit\t444\n" +
- "amet\t555\n" +
- "consectetuer\t666\n" +
- "adipiscing\t777\n" +
- "elit\t888\n" +
- "Nunc\t999\n" +
- "at\t4995\n" +
- "\n\n", text);
-
- // Now get formulas not their values
- extractor.setFormulasNotResults(true);
- text = extractor.getText();
- assertEquals(
- "Lorem\t111\n" +
- "ipsum\t222\n" +
- "dolor\t333\n" +
- "sit\t444\n" +
- "amet\t555\n" +
- "consectetuer\t666\n" +
- "adipiscing\t777\n" +
- "elit\t888\n" +
- "Nunc\t999\n" +
- "at\tSUM(B1:B9)\n" +
- "\n\n", text);
-
- // With sheet names too
- extractor.setIncludeSheetNames(true);
- text = extractor.getText();
- assertEquals(
- "Sheet1\n" +
- "Lorem\t111\n" +
- "ipsum\t222\n" +
- "dolor\t333\n" +
- "sit\t444\n" +
- "amet\t555\n" +
- "consectetuer\t666\n" +
- "adipiscing\t777\n" +
- "elit\t888\n" +
- "Nunc\t999\n" +
- "at\tSUM(B1:B9)\n\n" +
- "Sheet2\n\n" +
- "Sheet3\n"
- , text);
- }
-
- public void testGetComplexText() throws Exception {
- new HXFExcelExtractor(xmlB.getPackage());
- new HXFExcelExtractor(new HSSFXMLWorkbook(xmlB));
-
- HXFExcelExtractor extractor =
- new HXFExcelExtractor(xmlB.getPackage());
- extractor.getText();
-
- String text = extractor.getText();
- assertTrue(text.length() > 0);
-
- // Might not have all formatting it should do!
- // TODO decide if we should really have the "null" in there
- assertTrue(text.startsWith(
- "Avgtxfull\n" +
- "null\t(iii) AVERAGE TAX RATES ON ANNUAL"
- ));
- }
-
- /**
- * Test that we return pretty much the same as
- * ExcelExtractor does, when we're both passed
- * the same file, just saved as xls and xlsx
- */
- public void testComparedToOLE2() throws Exception {
- HXFExcelExtractor ooxmlExtractor =
- new HXFExcelExtractor(simpleXLSX.getPackage());
- ExcelExtractor ole2Extractor =
- new ExcelExtractor(simpleXLS);
-
- POITextExtractor[] extractors =
- new POITextExtractor[] { ooxmlExtractor, ole2Extractor };
- for (int i = 0; i < extractors.length; i++) {
- POITextExtractor extractor = extractors[i];
-
- String text = extractor.getText().replaceAll("[\r\t]", "");
- //System.out.println(text.length());
- //System.out.println(text);
- assertTrue(text.startsWith("First Sheet\nTest spreadsheet\n2nd row2nd row 2nd column\n"));
- Pattern pattern = Pattern.compile(".*13(\\.0+)?\\s+Sheet3.*", Pattern.DOTALL);
- Matcher m = pattern.matcher(text);
- assertTrue(m.matches());
- }
- }
-}