diff options
author | Nick Burch <nick@apache.org> | 2008-03-08 17:21:29 +0000 |
---|---|---|
committer | Nick Burch <nick@apache.org> | 2008-03-08 17:21:29 +0000 |
commit | 2cc22cb45b19b057f6061feac08d18fdc0504d31 (patch) | |
tree | 510ce2c0248e7d22d0b8e576395859237c954ba2 /src/ooxml | |
parent | 4eb4e8eeef6eb96b869943ee5bb2ef4f89dd89df (diff) | |
download | poi-2cc22cb45b19b057f6061feac08d18fdc0504d31.tar.gz poi-2cc22cb45b19b057f6061feac08d18fdc0504d31.zip |
More shuffling of things out of src/scratchpad/ooxml-*
git-svn-id: https://svn.apache.org/repos/asf/poi/branches/ooxml@635021 13f79535-47bb-0310-9956-ffa450edef68
Diffstat (limited to 'src/ooxml')
4 files changed, 393 insertions, 0 deletions
diff --git a/src/ooxml/java/org/apache/poi/POIXMLDocument.java b/src/ooxml/java/org/apache/poi/POIXMLDocument.java index a9cceb525a..54b92e32de 100644 --- a/src/ooxml/java/org/apache/poi/POIXMLDocument.java +++ b/src/ooxml/java/org/apache/poi/POIXMLDocument.java @@ -17,7 +17,11 @@ package org.apache.poi; import java.io.IOException; +import java.io.InputStream; +import java.io.PushbackInputStream; +import org.apache.poi.poifs.common.POIFSConstants; +import org.apache.poi.util.IOUtils; import org.openxml4j.exceptions.InvalidFormatException; import org.openxml4j.exceptions.OpenXML4JException; import org.openxml4j.opc.Package; diff --git a/src/ooxml/java/org/apache/poi/xssf/extractor/XSSFExcelExtractor.java b/src/ooxml/java/org/apache/poi/xssf/extractor/XSSFExcelExtractor.java new file mode 100644 index 0000000000..69361e7b4b --- /dev/null +++ b/src/ooxml/java/org/apache/poi/xssf/extractor/XSSFExcelExtractor.java @@ -0,0 +1,128 @@ +/* ==================================================================== + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +==================================================================== */ +package org.apache.poi.xssf.extractor; + +import java.io.File; +import java.io.IOException; + +import org.apache.poi.POIXMLTextExtractor; +import org.apache.poi.xssf.usermodel.XSSFWorkbook; +import org.apache.xmlbeans.XmlException; +import org.openxml4j.exceptions.OpenXML4JException; +import org.openxml4j.opc.Package; +import org.openxmlformats.schemas.spreadsheetml.x2006.main.CTCell; +import org.openxmlformats.schemas.spreadsheetml.x2006.main.CTRow; +import org.openxmlformats.schemas.spreadsheetml.x2006.main.CTSheet; +import org.openxmlformats.schemas.spreadsheetml.x2006.main.CTWorksheet; + +/** + * Helper class to extract text from an OOXML Excel file + */ +public class XSSFExcelExtractor extends POIXMLTextExtractor { + private XSSFWorkbook workbook; + private boolean includeSheetNames = true; + private boolean formulasNotResults = false; + + public XSSFExcelExtractor(Package container) throws XmlException, OpenXML4JException, IOException { + this(new XSSFWorkbook(container)); + } + public XSSFExcelExtractor(XSSFWorkbook workbook) { + super(workbook); + this.workbook = workbook; + } + + public static void main(String[] args) throws Exception { + if(args.length < 1) { + System.err.println("Use:"); + System.err.println(" HXFExcelExtractor <filename.xlsx>"); + System.exit(1); + } + POIXMLTextExtractor extractor = + new HXFExcelExtractor(HXFDocument.openPackage( + new File(args[0]) + )); + System.out.println(extractor.getText()); + } + + /** + * Should sheet names be included? Default is true + */ + public void setIncludeSheetNames(boolean includeSheetNames) { + this.includeSheetNames = includeSheetNames; + } + /** + * Should we return the formula itself, and not + * the result it produces? Default is false + */ + public void setFormulasNotResults(boolean formulasNotResults) { + this.formulasNotResults = formulasNotResults; + } + + /** + * Retreives the text contents of the file + */ + public String getText() { + StringBuffer text = new StringBuffer(); + + CTSheet[] sheetRefs = + workbook._getHSSFXML().getSheetReferences().getSheetArray(); + for(int i=0; i<sheetRefs.length; i++) { + try { + CTWorksheet sheet = + workbook._getHSSFXML().getSheet(sheetRefs[i]); + CTRow[] rows = + sheet.getSheetData().getRowArray(); + + if(i > 0) { + text.append("\n"); + } + if(includeSheetNames) { + text.append(sheetRefs[i].getName() + "\n"); + } + + for(int j=0; j<rows.length; j++) { + CTCell[] cells = rows[j].getCArray(); + for(int k=0; k<cells.length; k++) { + CTCell cell = cells[k]; + if(k > 0) { + text.append("\t"); + } + + boolean done = false; + + // Is it a formula one? + if(cell.getF() != null) { + if(formulasNotResults) { + text.append(cell.getF().getStringValue()); + done = true; + } + } + if(!done) { + HSSFXMLCell uCell = new HSSFXMLCell(cell, workbook); + text.append(uCell.getStringValue()); + } + } + text.append("\n"); + } + } catch(Exception e) { + throw new RuntimeException(e); + } + } + + return text.toString(); + } +} diff --git a/src/ooxml/testcases/org/apache/poi/TestDetectAsOOXML.java b/src/ooxml/testcases/org/apache/poi/TestDetectAsOOXML.java new file mode 100644 index 0000000000..36adb497cd --- /dev/null +++ b/src/ooxml/testcases/org/apache/poi/TestDetectAsOOXML.java @@ -0,0 +1,65 @@ + +/* ==================================================================== + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +==================================================================== */ + + +package org.apache.poi.hxf; + +import junit.framework.TestCase; +import java.io.*; + +/** + * Class to test that HXF correctly detects OOXML + * documents + */ +public class TestDetectAsOOXML extends TestCase +{ + public String dirname; + + public void setUp() { + dirname = System.getProperty("HSSF.testdata.path"); + } + + public void testOpensProperly() throws Exception + { + File f = new File(dirname + "/sample.xlsx"); + + HXFDocument.openPackage(f); + } + + public void testDetectAsPOIFS() throws Exception { + InputStream in; + + // ooxml file is + in = new PushbackInputStream( + new FileInputStream(dirname + "/SampleSS.xlsx"), 10 + ); + assertTrue(HXFDocument.hasOOXMLHeader(in)); + + // xls file isn't + in = new PushbackInputStream( + new FileInputStream(dirname + "/SampleSS.xls"), 10 + ); + assertFalse(HXFDocument.hasOOXMLHeader(in)); + + // text file isn't + in = new PushbackInputStream( + new FileInputStream(dirname + "/SampleSS.txt"), 10 + ); + assertFalse(HXFDocument.hasOOXMLHeader(in)); + } +} diff --git a/src/ooxml/testcases/org/apache/poi/xssf/extractor/TextXSSFExcelExtractor.java b/src/ooxml/testcases/org/apache/poi/xssf/extractor/TextXSSFExcelExtractor.java new file mode 100644 index 0000000000..a73b60bf72 --- /dev/null +++ b/src/ooxml/testcases/org/apache/poi/xssf/extractor/TextXSSFExcelExtractor.java @@ -0,0 +1,196 @@ +/* ==================================================================== + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +==================================================================== */ +package org.apache.poi.hssf.extractor; + +import java.io.File; +import java.io.FileInputStream; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import junit.framework.TestCase; + +import org.apache.poi.POITextExtractor; +import org.apache.poi.hssf.HSSFXML; +import org.apache.poi.hssf.usermodel.HSSFWorkbook; +import org.apache.poi.hssf.usermodel.HSSFXMLWorkbook; +import org.apache.poi.hxf.HXFDocument; + +/** + * Tests for HXFExcelExtractor + */ +public class TestHXFExcelExtractor extends TestCase { + /** + * A very simple file + */ + private HSSFXML xmlA; + /** + * A fairly complex file + */ + private HSSFXML xmlB; + + /** + * A fairly simple file - ooxml + */ + private HSSFXML simpleXLSX; + /** + * A fairly simple file - ole2 + */ + private HSSFWorkbook simpleXLS; + + protected void setUp() throws Exception { + super.setUp(); + + File fileA = new File( + System.getProperty("HSSF.testdata.path") + + File.separator + "sample.xlsx" + ); + File fileB = new File( + System.getProperty("HSSF.testdata.path") + + File.separator + "AverageTaxRates.xlsx" + ); + + File fileSOOXML = new File( + System.getProperty("HSSF.testdata.path") + + File.separator + "SampleSS.xlsx" + ); + File fileSOLE2 = new File( + System.getProperty("HSSF.testdata.path") + + File.separator + "SampleSS.xls" + ); + + xmlA = new HSSFXML(HXFDocument.openPackage(fileA)); + xmlB = new HSSFXML(HXFDocument.openPackage(fileB)); + + simpleXLSX = new HSSFXML(HXFDocument.openPackage(fileSOOXML)); + simpleXLS = new HSSFWorkbook(new FileInputStream(fileSOLE2)); + } + + /** + * Get text out of the simple file + */ + public void testGetSimpleText() throws Exception { + new HXFExcelExtractor(xmlA.getPackage()); + new HXFExcelExtractor(new HSSFXMLWorkbook(xmlA)); + + HXFExcelExtractor extractor = + new HXFExcelExtractor(xmlA.getPackage()); + extractor.getText(); + + String text = extractor.getText(); + assertTrue(text.length() > 0); + + // Check sheet names + assertTrue(text.startsWith("Sheet1")); + assertTrue(text.endsWith("Sheet3\n")); + + // Now without, will have text + extractor.setIncludeSheetNames(false); + text = extractor.getText(); + assertEquals( + "Lorem\t111\n" + + "ipsum\t222\n" + + "dolor\t333\n" + + "sit\t444\n" + + "amet\t555\n" + + "consectetuer\t666\n" + + "adipiscing\t777\n" + + "elit\t888\n" + + "Nunc\t999\n" + + "at\t4995\n" + + "\n\n", text); + + // Now get formulas not their values + extractor.setFormulasNotResults(true); + text = extractor.getText(); + assertEquals( + "Lorem\t111\n" + + "ipsum\t222\n" + + "dolor\t333\n" + + "sit\t444\n" + + "amet\t555\n" + + "consectetuer\t666\n" + + "adipiscing\t777\n" + + "elit\t888\n" + + "Nunc\t999\n" + + "at\tSUM(B1:B9)\n" + + "\n\n", text); + + // With sheet names too + extractor.setIncludeSheetNames(true); + text = extractor.getText(); + assertEquals( + "Sheet1\n" + + "Lorem\t111\n" + + "ipsum\t222\n" + + "dolor\t333\n" + + "sit\t444\n" + + "amet\t555\n" + + "consectetuer\t666\n" + + "adipiscing\t777\n" + + "elit\t888\n" + + "Nunc\t999\n" + + "at\tSUM(B1:B9)\n\n" + + "Sheet2\n\n" + + "Sheet3\n" + , text); + } + + public void testGetComplexText() throws Exception { + new HXFExcelExtractor(xmlB.getPackage()); + new HXFExcelExtractor(new HSSFXMLWorkbook(xmlB)); + + HXFExcelExtractor extractor = + new HXFExcelExtractor(xmlB.getPackage()); + extractor.getText(); + + String text = extractor.getText(); + assertTrue(text.length() > 0); + + // Might not have all formatting it should do! + // TODO decide if we should really have the "null" in there + assertTrue(text.startsWith( + "Avgtxfull\n" + + "null\t(iii) AVERAGE TAX RATES ON ANNUAL" + )); + } + + /** + * Test that we return pretty much the same as + * ExcelExtractor does, when we're both passed + * the same file, just saved as xls and xlsx + */ + public void testComparedToOLE2() throws Exception { + HXFExcelExtractor ooxmlExtractor = + new HXFExcelExtractor(simpleXLSX.getPackage()); + ExcelExtractor ole2Extractor = + new ExcelExtractor(simpleXLS); + + POITextExtractor[] extractors = + new POITextExtractor[] { ooxmlExtractor, ole2Extractor }; + for (int i = 0; i < extractors.length; i++) { + POITextExtractor extractor = extractors[i]; + + String text = extractor.getText().replaceAll("[\r\t]", ""); + //System.out.println(text.length()); + //System.out.println(text); + assertTrue(text.startsWith("First Sheet\nTest spreadsheet\n2nd row2nd row 2nd column\n")); + Pattern pattern = Pattern.compile(".*13(\\.0+)?\\s+Sheet3.*", Pattern.DOTALL); + Matcher m = pattern.matcher(text); + assertTrue(m.matches()); + } + } +} |