From 5bbf6f70f623117e757548794b094c7274893b39 Mon Sep 17 00:00:00 2001 From: Nick Burch Date: Thu, 27 Dec 2007 12:40:05 +0000 Subject: [PATCH] Make a start on a text extractor for xlsx files git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@607058 13f79535-47bb-0310-9956-ffa450edef68 --- .../org/apache/poi/POIXMLDocument.java | 8 +- .../org/apache/poi/POIXMLTextExtractor.java | 31 +++++ .../poi/hssf/extractor/HXFExcelExtractor.java | 113 ++++++++++++++++++ .../poi/hssf/usermodel/HSSFXMLWorkbook.java | 33 +++++ .../hssf/extractor/TestHXFExcelExtractor.java | 75 ++++++++++++ 5 files changed, 259 insertions(+), 1 deletion(-) create mode 100644 src/scratchpad/ooxml-src/org/apache/poi/POIXMLTextExtractor.java create mode 100644 src/scratchpad/ooxml-src/org/apache/poi/hssf/extractor/HXFExcelExtractor.java create mode 100644 src/scratchpad/ooxml-src/org/apache/poi/hssf/usermodel/HSSFXMLWorkbook.java create mode 100644 src/scratchpad/ooxml-testcases/org/apache/poi/hssf/extractor/TestHXFExcelExtractor.java diff --git a/src/scratchpad/ooxml-src/org/apache/poi/POIXMLDocument.java b/src/scratchpad/ooxml-src/org/apache/poi/POIXMLDocument.java index a070e9f080..36a2e8ffb3 100644 --- a/src/scratchpad/ooxml-src/org/apache/poi/POIXMLDocument.java +++ b/src/scratchpad/ooxml-src/org/apache/poi/POIXMLDocument.java @@ -16,6 +16,8 @@ ==================================================================== */ package org.apache.poi; +import org.apache.poi.hxf.HXFDocument; + /** * Parent class of all UserModel POI XML (ooxml) * implementations. @@ -23,5 +25,9 @@ package org.apache.poi; * for the XML based classes. 
*/ public abstract class POIXMLDocument { - // TODO + private HXFDocument document; + + protected POIXMLDocument(HXFDocument document) { + this.document = document; + } } diff --git a/src/scratchpad/ooxml-src/org/apache/poi/POIXMLTextExtractor.java b/src/scratchpad/ooxml-src/org/apache/poi/POIXMLTextExtractor.java new file mode 100644 index 0000000000..c28eba49da --- /dev/null +++ b/src/scratchpad/ooxml-src/org/apache/poi/POIXMLTextExtractor.java @@ -0,0 +1,31 @@ +/* ==================================================================== + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+==================================================================== */ +package org.apache.poi; + +public abstract class POIXMLTextExtractor extends POITextExtractor { + /** The POIXMLDocument that's open */ + protected POIXMLDocument document; + + /** + * Creates a new text extractor for the given document + */ + public POIXMLTextExtractor(POIXMLDocument document) { + super(null); + + this.document = document; + } +} diff --git a/src/scratchpad/ooxml-src/org/apache/poi/hssf/extractor/HXFExcelExtractor.java b/src/scratchpad/ooxml-src/org/apache/poi/hssf/extractor/HXFExcelExtractor.java new file mode 100644 index 0000000000..29dcc21176 --- /dev/null +++ b/src/scratchpad/ooxml-src/org/apache/poi/hssf/extractor/HXFExcelExtractor.java @@ -0,0 +1,113 @@ +/* ==================================================================== + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+==================================================================== */ +package org.apache.poi.hssf.extractor; + +import java.io.IOException; + +import org.apache.poi.POIXMLTextExtractor; +import org.apache.poi.hssf.HSSFXML; +import org.apache.poi.hssf.usermodel.HSSFXMLWorkbook; +import org.apache.xmlbeans.XmlException; +import org.openxml4j.exceptions.OpenXML4JException; +import org.openxml4j.opc.Package; +import org.openxmlformats.schemas.spreadsheetml.x2006.main.CTCell; +import org.openxmlformats.schemas.spreadsheetml.x2006.main.CTCellFormula; +import org.openxmlformats.schemas.spreadsheetml.x2006.main.CTRow; +import org.openxmlformats.schemas.spreadsheetml.x2006.main.CTSheet; +import org.openxmlformats.schemas.spreadsheetml.x2006.main.CTWorksheet; + +public class HXFExcelExtractor extends POIXMLTextExtractor { + private HSSFXMLWorkbook workbook; + private boolean includeSheetNames = true; + private boolean formulasNotResults = false; + + public HXFExcelExtractor(Package container) throws XmlException, OpenXML4JException, IOException { + this(new HSSFXMLWorkbook( + new HSSFXML(container) + )); + } + public HXFExcelExtractor(HSSFXMLWorkbook workbook) { + super(workbook); + this.workbook = workbook; + } + + /** + * Should sheet names be included? Default is true + */ + public void setIncludeSheetNames(boolean includeSheetNames) { + this.includeSheetNames = includeSheetNames; + } + /** + * Should we return the formula itself, and not + * the result it produces? 
Default is false + */ + public void setFormulasNotResults(boolean formulasNotResults) { + this.formulasNotResults = formulasNotResults; + } + + /** + * Retrieves the text contents of the file + */ + public String getText() { + StringBuffer text = new StringBuffer(); + + CTSheet[] sheetRefs = + workbook._getHSSFXML().getSheetReferences().getSheetArray(); + for(int i=0; i 0) { + text.append("\n"); + } + if(includeSheetNames) { + text.append(sheetRefs[i].getName() + "\n"); + } + + for(int j=0; j 0) { + text.append("\t"); + } + + // Is it a formula one? + if(cell.getF() != null) { + if(formulasNotResults) { + text.append(cell.getF().getStringValue()); + } else { + text.append(cell.getV()); + } + } else { + // Probably just want the v value + text.append(cell.getV()); + } + } + text.append("\n"); + } + } catch(Exception e) { + throw new RuntimeException(e); + } + } + + return text.toString(); + } +} diff --git a/src/scratchpad/ooxml-src/org/apache/poi/hssf/usermodel/HSSFXMLWorkbook.java b/src/scratchpad/ooxml-src/org/apache/poi/hssf/usermodel/HSSFXMLWorkbook.java new file mode 100644 index 0000000000..0e85e96791 --- /dev/null +++ b/src/scratchpad/ooxml-src/org/apache/poi/hssf/usermodel/HSSFXMLWorkbook.java @@ -0,0 +1,33 @@ +/* ==================================================================== + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ See the License for the specific language governing permissions and + limitations under the License. +==================================================================== */ +package org.apache.poi.hssf.usermodel; + +import org.apache.poi.POIXMLDocument; +import org.apache.poi.hssf.HSSFXML; + +public class HSSFXMLWorkbook extends POIXMLDocument { + private HSSFXML hssfXML; + + public HSSFXMLWorkbook(HSSFXML xml) { + super(xml); + this.hssfXML = xml; + } + + public HSSFXML _getHSSFXML() { + return hssfXML; + } +} diff --git a/src/scratchpad/ooxml-testcases/org/apache/poi/hssf/extractor/TestHXFExcelExtractor.java b/src/scratchpad/ooxml-testcases/org/apache/poi/hssf/extractor/TestHXFExcelExtractor.java new file mode 100644 index 0000000000..fafca345e0 --- /dev/null +++ b/src/scratchpad/ooxml-testcases/org/apache/poi/hssf/extractor/TestHXFExcelExtractor.java @@ -0,0 +1,75 @@ +/* ==================================================================== + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+==================================================================== */ +package org.apache.poi.hssf.extractor; + +import java.io.File; + +import org.apache.poi.hssf.HSSFXML; +import org.apache.poi.hssf.usermodel.HSSFXMLWorkbook; +import org.apache.poi.hxf.HXFDocument; + +import junit.framework.TestCase; + +/** + * Tests for HXFExcelExtractor + */ +public class TestHXFExcelExtractor extends TestCase { + /** + * A very simple file + */ + private HSSFXML xmlA; + /** + * A fairly complex file + */ + private HSSFXML xmlB; + + protected void setUp() throws Exception { + super.setUp(); + + File fileA = new File( + System.getProperty("HSSF.testdata.path") + + File.separator + "sample.xlsx" + ); + File fileB = new File( + System.getProperty("HSSF.testdata.path") + + File.separator + "AverageTaxRates.xlsx" + ); + + xmlA = new HSSFXML(HXFDocument.openPackage(fileA)); + xmlB = new HSSFXML(HXFDocument.openPackage(fileB)); + } + + /** + * Get text out of the simple file + */ + public void testGetSimpleText() throws Exception { + new HXFExcelExtractor(xmlA.getPackage()); + new HXFExcelExtractor(new HSSFXMLWorkbook(xmlA)); + + HXFExcelExtractor extractor = + new HXFExcelExtractor(xmlA.getPackage()); + extractor.getText(); + + String text = extractor.getText(); + assertTrue(text.length() > 0); + System.err.println(text); + + // Check sheet names + assertTrue(text.startsWith("Sheet1")); + assertTrue(text.endsWith("Sheet3\n")); + } +} -- 2.39.5