source.dussan.org Git - poi.git/commitdiff
Make a start on a text extractor for xlsx files
author Nick Burch <nick@apache.org>
Thu, 27 Dec 2007 12:40:05 +0000 (12:40 +0000)
committer Nick Burch <nick@apache.org>
Thu, 27 Dec 2007 12:40:05 +0000 (12:40 +0000)
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@607058 13f79535-47bb-0310-9956-ffa450edef68

src/scratchpad/ooxml-src/org/apache/poi/POIXMLDocument.java
src/scratchpad/ooxml-src/org/apache/poi/POIXMLTextExtractor.java [new file with mode: 0644]
src/scratchpad/ooxml-src/org/apache/poi/hssf/extractor/HXFExcelExtractor.java [new file with mode: 0644]
src/scratchpad/ooxml-src/org/apache/poi/hssf/usermodel/HSSFXMLWorkbook.java [new file with mode: 0644]
src/scratchpad/ooxml-testcases/org/apache/poi/hssf/extractor/TestHXFExcelExtractor.java [new file with mode: 0644]

index a070e9f080013c755630f1fbec6c9b0f37afc720..36a2e8ffb317c2b4d79762d260755d4ebbf4a809 100644 (file)
@@ -16,6 +16,8 @@
 ==================================================================== */
 package org.apache.poi;
 
+import org.apache.poi.hxf.HXFDocument;
+
 /** 
  * Parent class of all UserModel POI XML (ooxml) 
  *  implementations.
@@ -23,5 +25,9 @@ package org.apache.poi;
  *  for the XML based classes.
  */
 public abstract class POIXMLDocument {
-       // TODO
+       private HXFDocument document;
+
+       protected POIXMLDocument(HXFDocument document) {
+               this.document = document;
+       }
 }
diff --git a/src/scratchpad/ooxml-src/org/apache/poi/POIXMLTextExtractor.java b/src/scratchpad/ooxml-src/org/apache/poi/POIXMLTextExtractor.java
new file mode 100644 (file)
index 0000000..c28eba4
--- /dev/null
@@ -0,0 +1,31 @@
+/* ====================================================================
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+==================================================================== */
+package org.apache.poi;
+
+public abstract class POIXMLTextExtractor extends POITextExtractor {
+       /** The POIXMLDocument that's open */
+       protected POIXMLDocument document;
+
+       /**
+        * Creates a new text extractor for the given document
+        */
+       public POIXMLTextExtractor(POIXMLDocument document) {
+               super(null);
+               
+               this.document = document;
+       }
+}
diff --git a/src/scratchpad/ooxml-src/org/apache/poi/hssf/extractor/HXFExcelExtractor.java b/src/scratchpad/ooxml-src/org/apache/poi/hssf/extractor/HXFExcelExtractor.java
new file mode 100644 (file)
index 0000000..29dcc21
--- /dev/null
@@ -0,0 +1,113 @@
+/* ====================================================================
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+==================================================================== */
+package org.apache.poi.hssf.extractor;
+
+import java.io.IOException;
+
+import org.apache.poi.POIXMLTextExtractor;
+import org.apache.poi.hssf.HSSFXML;
+import org.apache.poi.hssf.usermodel.HSSFXMLWorkbook;
+import org.apache.xmlbeans.XmlException;
+import org.openxml4j.exceptions.OpenXML4JException;
+import org.openxml4j.opc.Package;
+import org.openxmlformats.schemas.spreadsheetml.x2006.main.CTCell;
+import org.openxmlformats.schemas.spreadsheetml.x2006.main.CTCellFormula;
+import org.openxmlformats.schemas.spreadsheetml.x2006.main.CTRow;
+import org.openxmlformats.schemas.spreadsheetml.x2006.main.CTSheet;
+import org.openxmlformats.schemas.spreadsheetml.x2006.main.CTWorksheet;
+
+public class HXFExcelExtractor extends POIXMLTextExtractor {
+       private HSSFXMLWorkbook workbook;
+       private boolean includeSheetNames = true;
+       private boolean formulasNotResults = false;
+       
+       public HXFExcelExtractor(Package container) throws XmlException, OpenXML4JException, IOException {
+               this(new HSSFXMLWorkbook(
+                               new HSSFXML(container)
+               ));
+       }
+       public HXFExcelExtractor(HSSFXMLWorkbook workbook) {
+               super(workbook);
+               this.workbook = workbook;
+       }
+
+       /**
+        * Should sheet names be included? Default is true
+        */
+       public void setIncludeSheetNames(boolean includeSheetNames) {
+               this.includeSheetNames = includeSheetNames;
+       }
+       /**
+        * Should we return the formula itself, and not
+        *  the result it produces? Default is false
+        */
+       public void setFormulasNotResults(boolean formulasNotResults) {
+               this.formulasNotResults = formulasNotResults;
+       }
+       
+       /**
+        * Retrieves the text contents of the file
+        */
+       public String getText() {
+               StringBuffer text = new StringBuffer();
+               
+               CTSheet[] sheetRefs =
+                       workbook._getHSSFXML().getSheetReferences().getSheetArray();
+               for(int i=0; i<sheetRefs.length; i++) {
+                       try {
+                               CTWorksheet sheet =
+                                       workbook._getHSSFXML().getSheet(sheetRefs[i]);
+                               CTRow[] rows =
+                                       sheet.getSheetData().getRowArray();
+                               
+                               if(i > 0) {
+                                       text.append("\n");
+                               }
+                               if(includeSheetNames) {
+                                       text.append(sheetRefs[i].getName() + "\n");
+                               }
+                               
+                               for(int j=0; j<rows.length; j++) {
+                                       CTCell[] cells = rows[j].getCArray();
+                                       for(int k=0; k<cells.length; k++) {
+                                               CTCell cell = cells[k];
+                                               if(k > 0) {
+                                                       text.append("\t");
+                                               }
+                                               
+                                               // Is it a formula one?
+                                               if(cell.getF() != null) {
+                                                       if(formulasNotResults) {
+                                                               text.append(cell.getF().getStringValue());
+                                                       } else {
+                                                               text.append(cell.getV());
+                                                       }
+                                               } else {
+                                                       // Probably just want the v value
+                                                       text.append(cell.getV());
+                                               }
+                                       }
+                                       text.append("\n");
+                               }
+                       } catch(Exception e) {
+                               throw new RuntimeException(e);
+                       }
+               }
+               
+               return text.toString();
+       }
+}
diff --git a/src/scratchpad/ooxml-src/org/apache/poi/hssf/usermodel/HSSFXMLWorkbook.java b/src/scratchpad/ooxml-src/org/apache/poi/hssf/usermodel/HSSFXMLWorkbook.java
new file mode 100644 (file)
index 0000000..0e85e96
--- /dev/null
@@ -0,0 +1,33 @@
+/* ====================================================================
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+==================================================================== */
+package org.apache.poi.hssf.usermodel;
+
+import org.apache.poi.POIXMLDocument;
+import org.apache.poi.hssf.HSSFXML;
+
+public class HSSFXMLWorkbook extends POIXMLDocument {
+       private HSSFXML hssfXML;
+       
+       public HSSFXMLWorkbook(HSSFXML xml) {
+               super(xml);
+               this.hssfXML = xml;
+       }
+       
+       public HSSFXML _getHSSFXML() {
+               return hssfXML;
+       }
+}
diff --git a/src/scratchpad/ooxml-testcases/org/apache/poi/hssf/extractor/TestHXFExcelExtractor.java b/src/scratchpad/ooxml-testcases/org/apache/poi/hssf/extractor/TestHXFExcelExtractor.java
new file mode 100644 (file)
index 0000000..fafca34
--- /dev/null
@@ -0,0 +1,75 @@
+/* ====================================================================
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+==================================================================== */
+package org.apache.poi.hssf.extractor;
+
+import java.io.File;
+
+import org.apache.poi.hssf.HSSFXML;
+import org.apache.poi.hssf.usermodel.HSSFXMLWorkbook;
+import org.apache.poi.hxf.HXFDocument;
+
+import junit.framework.TestCase;
+
+/**
+ * Tests for HXFExcelExtractor
+ */
+public class TestHXFExcelExtractor extends TestCase {
+       /**
+        * A very simple file
+        */
+       private HSSFXML xmlA;
+       /**
+        * A fairly complex file
+        */
+       private HSSFXML xmlB;
+
+       protected void setUp() throws Exception {
+               super.setUp();
+               
+               File fileA = new File(
+                               System.getProperty("HSSF.testdata.path") +
+                               File.separator + "sample.xlsx"
+               );
+               File fileB = new File(
+                               System.getProperty("HSSF.testdata.path") +
+                               File.separator + "AverageTaxRates.xlsx"
+               );
+               
+               xmlA = new HSSFXML(HXFDocument.openPackage(fileA));
+               xmlB = new HSSFXML(HXFDocument.openPackage(fileB));
+       }
+
+       /**
+        * Get text out of the simple file
+        */
+       public void testGetSimpleText() throws Exception {
+               new HXFExcelExtractor(xmlA.getPackage());
+               new HXFExcelExtractor(new HSSFXMLWorkbook(xmlA));
+               
+               HXFExcelExtractor extractor = 
+                       new HXFExcelExtractor(xmlA.getPackage());
+               extractor.getText();
+               
+               String text = extractor.getText();
+               assertTrue(text.length() > 0);
+               System.err.println(text);
+               
+               // Check sheet names
+               assertTrue(text.startsWith("Sheet1"));
+               assertTrue(text.endsWith("Sheet3\n"));
+       }
+}