aboutsummaryrefslogtreecommitdiffstats
path: root/src/scratchpad/ooxml-src/org/apache/poi
diff options
context:
space:
mode:
authorNick Burch <nick@apache.org>2007-12-30 16:53:42 +0000
committerNick Burch <nick@apache.org>2007-12-30 16:53:42 +0000
commit699dabdaf8d253e6d32938eac621794452664eae (patch)
tree8b9fe536be28ce19653901d04af01c94cbd2b52d /src/scratchpad/ooxml-src/org/apache/poi
parent9b96000af0c686768e57dacc77358f6e7be1e0c2 (diff)
downloadpoi-699dabdaf8d253e6d32938eac621794452664eae.tar.gz
poi-699dabdaf8d253e6d32938eac621794452664eae.zip
Support for extracting text from ooxml word documents
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@607560 13f79535-47bb-0310-9956-ffa450edef68
Diffstat (limited to 'src/scratchpad/ooxml-src/org/apache/poi')
-rw-r--r--src/scratchpad/ooxml-src/org/apache/poi/hssf/extractor/HXFExcelExtractor.java18
-rw-r--r--src/scratchpad/ooxml-src/org/apache/poi/hssf/usermodel/HSSFXMLWorkbook.java6
-rw-r--r--src/scratchpad/ooxml-src/org/apache/poi/hwpf/extractor/HXFWordExtractor.java87
-rw-r--r--src/scratchpad/ooxml-src/org/apache/poi/hwpf/usermodel/HWPFXMLDocument.java36
4 files changed, 147 insertions, 0 deletions
diff --git a/src/scratchpad/ooxml-src/org/apache/poi/hssf/extractor/HXFExcelExtractor.java b/src/scratchpad/ooxml-src/org/apache/poi/hssf/extractor/HXFExcelExtractor.java
index d91f049364..59f83d6d04 100644
--- a/src/scratchpad/ooxml-src/org/apache/poi/hssf/extractor/HXFExcelExtractor.java
+++ b/src/scratchpad/ooxml-src/org/apache/poi/hssf/extractor/HXFExcelExtractor.java
@@ -16,12 +16,14 @@
==================================================================== */
package org.apache.poi.hssf.extractor;
+import java.io.File;
import java.io.IOException;
import org.apache.poi.POIXMLTextExtractor;
import org.apache.poi.hssf.HSSFXML;
import org.apache.poi.hssf.usermodel.HSSFXMLCell;
import org.apache.poi.hssf.usermodel.HSSFXMLWorkbook;
+import org.apache.poi.hxf.HXFDocument;
import org.apache.xmlbeans.XmlException;
import org.openxml4j.exceptions.OpenXML4JException;
import org.openxml4j.opc.Package;
@@ -30,6 +32,9 @@ import org.openxmlformats.schemas.spreadsheetml.x2006.main.CTRow;
import org.openxmlformats.schemas.spreadsheetml.x2006.main.CTSheet;
import org.openxmlformats.schemas.spreadsheetml.x2006.main.CTWorksheet;
+/**
+ * Helper class to extract text from an OOXML Excel file
+ */
public class HXFExcelExtractor extends POIXMLTextExtractor {
private HSSFXMLWorkbook workbook;
private boolean includeSheetNames = true;
@@ -44,6 +49,19 @@ public class HXFExcelExtractor extends POIXMLTextExtractor {
super(workbook);
this.workbook = workbook;
}
+
+ /**
+  * Command line entry point. Prints the text of the OOXML Excel
+  * file named by the first argument to standard output.
+  * When no argument is supplied, a usage message is written to
+  * standard error and the JVM exits with status 1.
+  */
+ public static void main(String[] args) throws Exception {
+     if(args.length == 0) {
+         System.err.println("Use:");
+         System.err.println(" HXFExcelExtractor <filename.xlsx>");
+         System.exit(1);
+     }
+     File input = new File(args[0]);
+     POIXMLTextExtractor extractor =
+         new HXFExcelExtractor(HXFDocument.openPackage(input));
+     String text = extractor.getText();
+     System.out.println(text);
+ }
/**
* Should sheet names be included? Default is true
diff --git a/src/scratchpad/ooxml-src/org/apache/poi/hssf/usermodel/HSSFXMLWorkbook.java b/src/scratchpad/ooxml-src/org/apache/poi/hssf/usermodel/HSSFXMLWorkbook.java
index 0e85e96791..16b93f61f4 100644
--- a/src/scratchpad/ooxml-src/org/apache/poi/hssf/usermodel/HSSFXMLWorkbook.java
+++ b/src/scratchpad/ooxml-src/org/apache/poi/hssf/usermodel/HSSFXMLWorkbook.java
@@ -19,6 +19,12 @@ package org.apache.poi.hssf.usermodel;
import org.apache.poi.POIXMLDocument;
import org.apache.poi.hssf.HSSFXML;
+/**
+ * High level representation of an OOXML workbook.
+ * This is the first object most users will construct, whether
+ * they are reading or writing a workbook. It is also the
+ * top level object for creating new sheets, etc.
+ */
public class HSSFXMLWorkbook extends POIXMLDocument {
private HSSFXML hssfXML;
diff --git a/src/scratchpad/ooxml-src/org/apache/poi/hwpf/extractor/HXFWordExtractor.java b/src/scratchpad/ooxml-src/org/apache/poi/hwpf/extractor/HXFWordExtractor.java
new file mode 100644
index 0000000000..a4427e49ec
--- /dev/null
+++ b/src/scratchpad/ooxml-src/org/apache/poi/hwpf/extractor/HXFWordExtractor.java
@@ -0,0 +1,87 @@
+/* ====================================================================
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+==================================================================== */
+package org.apache.poi.hwpf.extractor;
+
+import java.io.File;
+import java.io.IOException;
+
+import org.apache.poi.POIXMLTextExtractor;
+import org.apache.poi.hwpf.HWPFXML;
+import org.apache.poi.hwpf.usermodel.HWPFXMLDocument;
+import org.apache.poi.hxf.HXFDocument;
+import org.apache.xmlbeans.XmlException;
+import org.openxml4j.exceptions.OpenXML4JException;
+import org.openxml4j.opc.Package;
+import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTBody;
+import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTP;
+import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTR;
+import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTText;
+
+/**
+ * Helper class to extract text from an OOXML Word file.
+ * The text is gathered from the paragraphs held directly by the
+ * document body, with a newline appended after each paragraph.
+ */
+public class HXFWordExtractor extends POIXMLTextExtractor {
+    private HWPFXMLDocument document;
+
+    /**
+     * Creates an extractor for the given package, wrapping it in a
+     * {@link HWPFXML} and then a {@link HWPFXMLDocument}.
+     *
+     * @param container the opened package to read the document from
+     */
+    public HXFWordExtractor(Package container) throws XmlException, OpenXML4JException, IOException {
+        this(new HWPFXMLDocument(
+                new HWPFXML(container)
+        ));
+    }
+
+    /**
+     * Creates an extractor for an already constructed document.
+     *
+     * @param document the document whose text will be extracted
+     */
+    public HXFWordExtractor(HWPFXMLDocument document) {
+        super(document);
+        this.document = document;
+    }
+
+    /**
+     * Command line entry point. Prints the text of the OOXML Word
+     * file named by the first argument to standard output.
+     * When no argument is supplied, a usage message is written to
+     * standard error and the JVM exits with status 1.
+     */
+    public static void main(String[] args) throws Exception {
+        if(args.length < 1) {
+            System.err.println("Use:");
+            // This extractor handles Word files, so advertise .docx (not .xlsx)
+            System.err.println(" HXFWordExtractor <filename.docx>");
+            System.exit(1);
+        }
+        POIXMLTextExtractor extractor =
+            new HXFWordExtractor(HXFDocument.openPackage(
+                    new File(args[0])
+            ));
+        System.out.println(extractor.getText());
+    }
+
+    /**
+     * Returns the text of the document body: the string value of every
+     * text element of every run of every paragraph, in document order,
+     * with a newline after each paragraph.
+     *
+     * @return the concatenated paragraph text
+     */
+    public String getText() {
+        CTBody body = document._getHWPFXML().getDocumentBody();
+        StringBuffer text = new StringBuffer();
+
+        // Loop over paragraphs
+        CTP[] ps = body.getPArray();
+        for (int i = 0; i < ps.length; i++) {
+            // Loop over the runs of this paragraph
+            CTR[] rs = ps[i].getRArray();
+            for (int j = 0; j < rs.length; j++) {
+                // Loop over the text elements of this run
+                CTText[] texts = rs[j].getTArray();
+                for (int k = 0; k < texts.length; k++) {
+                    text.append(
+                            texts[k].getStringValue()
+                    );
+                }
+            }
+            // New line after each paragraph.
+            text.append("\n");
+        }
+
+        return text.toString();
+    }
+}
diff --git a/src/scratchpad/ooxml-src/org/apache/poi/hwpf/usermodel/HWPFXMLDocument.java b/src/scratchpad/ooxml-src/org/apache/poi/hwpf/usermodel/HWPFXMLDocument.java
new file mode 100644
index 0000000000..64597e83dc
--- /dev/null
+++ b/src/scratchpad/ooxml-src/org/apache/poi/hwpf/usermodel/HWPFXMLDocument.java
@@ -0,0 +1,36 @@
+/* ====================================================================
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+==================================================================== */
+package org.apache.poi.hwpf.usermodel;
+
+import org.apache.poi.POIXMLDocument;
+import org.apache.poi.hwpf.HWPFXML;
+
+/**
+ * High level representation of an OOXML text document.
+ * Wraps the {@link HWPFXML} it is constructed with and hands
+ * it back on request.
+ */
+public class HWPFXMLDocument extends POIXMLDocument {
+    /** The low level document supplied to the constructor */
+    private HWPFXML hwpfXML;
+
+    /**
+     * Creates a document on top of the given low level object,
+     * which is also passed up to the parent class.
+     *
+     * @param xml the low level document to wrap
+     */
+    public HWPFXMLDocument(HWPFXML xml) {
+        super(xml);
+        this.hwpfXML = xml;
+    }
+
+    /**
+     * Returns the low level document this object was created with.
+     */
+    public HWPFXML _getHWPFXML() {
+        return this.hwpfXML;
+    }
+}