about summary refs log tree commit diff stats
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r-- src/scratchpad/ooxml-src/org/apache/poi/hssf/extractor/HXFExcelExtractor.java 18
-rw-r--r-- src/scratchpad/ooxml-src/org/apache/poi/hssf/usermodel/HSSFXMLWorkbook.java 6
-rw-r--r-- src/scratchpad/ooxml-src/org/apache/poi/hwpf/extractor/HXFWordExtractor.java 87
-rw-r--r-- src/scratchpad/ooxml-src/org/apache/poi/hwpf/usermodel/HWPFXMLDocument.java 36
-rw-r--r-- src/scratchpad/ooxml-testcases/org/apache/poi/hwpf/TestHWPFXML.java 1
-rw-r--r-- src/scratchpad/ooxml-testcases/org/apache/poi/hwpf/extractor/TestHXFWordExtractor.java 117
6 files changed, 264 insertions, 1 deletions
diff --git a/src/scratchpad/ooxml-src/org/apache/poi/hssf/extractor/HXFExcelExtractor.java b/src/scratchpad/ooxml-src/org/apache/poi/hssf/extractor/HXFExcelExtractor.java
index d91f049364..59f83d6d04 100644
--- a/src/scratchpad/ooxml-src/org/apache/poi/hssf/extractor/HXFExcelExtractor.java
+++ b/src/scratchpad/ooxml-src/org/apache/poi/hssf/extractor/HXFExcelExtractor.java
@@ -16,12 +16,14 @@
==================================================================== */
package org.apache.poi.hssf.extractor;
+import java.io.File;
import java.io.IOException;
import org.apache.poi.POIXMLTextExtractor;
import org.apache.poi.hssf.HSSFXML;
import org.apache.poi.hssf.usermodel.HSSFXMLCell;
import org.apache.poi.hssf.usermodel.HSSFXMLWorkbook;
+import org.apache.poi.hxf.HXFDocument;
import org.apache.xmlbeans.XmlException;
import org.openxml4j.exceptions.OpenXML4JException;
import org.openxml4j.opc.Package;
@@ -30,6 +32,9 @@ import org.openxmlformats.schemas.spreadsheetml.x2006.main.CTRow;
import org.openxmlformats.schemas.spreadsheetml.x2006.main.CTSheet;
import org.openxmlformats.schemas.spreadsheetml.x2006.main.CTWorksheet;
+/**
+ * Helper class to extract text from an OOXML Excel file
+ */
public class HXFExcelExtractor extends POIXMLTextExtractor {
private HSSFXMLWorkbook workbook;
private boolean includeSheetNames = true;
@@ -44,6 +49,19 @@ public class HXFExcelExtractor extends POIXMLTextExtractor {
super(workbook);
this.workbook = workbook;
}
+
+ public static void main(String[] args) throws Exception {
+ if(args.length < 1) {
+ System.err.println("Use:");
+ System.err.println(" HXFExcelExtractor <filename.xlsx>");
+ System.exit(1);
+ }
+ POIXMLTextExtractor extractor =
+ new HXFExcelExtractor(HXFDocument.openPackage(
+ new File(args[0])
+ ));
+ System.out.println(extractor.getText());
+ }
/**
* Should sheet names be included? Default is true
diff --git a/src/scratchpad/ooxml-src/org/apache/poi/hssf/usermodel/HSSFXMLWorkbook.java b/src/scratchpad/ooxml-src/org/apache/poi/hssf/usermodel/HSSFXMLWorkbook.java
index 0e85e96791..16b93f61f4 100644
--- a/src/scratchpad/ooxml-src/org/apache/poi/hssf/usermodel/HSSFXMLWorkbook.java
+++ b/src/scratchpad/ooxml-src/org/apache/poi/hssf/usermodel/HSSFXMLWorkbook.java
@@ -19,6 +19,12 @@ package org.apache.poi.hssf.usermodel;
import org.apache.poi.POIXMLDocument;
import org.apache.poi.hssf.HSSFXML;
+/**
+ * High-level representation of an OOXML workbook.
+ * This is the first object most users will construct, whether
+ * they are reading or writing a workbook. It is also the
+ * top-level object for creating new sheets, etc.
+ */
public class HSSFXMLWorkbook extends POIXMLDocument {
private HSSFXML hssfXML;
diff --git a/src/scratchpad/ooxml-src/org/apache/poi/hwpf/extractor/HXFWordExtractor.java b/src/scratchpad/ooxml-src/org/apache/poi/hwpf/extractor/HXFWordExtractor.java
new file mode 100644
index 0000000000..a4427e49ec
--- /dev/null
+++ b/src/scratchpad/ooxml-src/org/apache/poi/hwpf/extractor/HXFWordExtractor.java
@@ -0,0 +1,87 @@
+/* ====================================================================
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+==================================================================== */
+package org.apache.poi.hwpf.extractor;
+
+import java.io.File;
+import java.io.IOException;
+
+import org.apache.poi.POIXMLTextExtractor;
+import org.apache.poi.hwpf.HWPFXML;
+import org.apache.poi.hwpf.usermodel.HWPFXMLDocument;
+import org.apache.poi.hxf.HXFDocument;
+import org.apache.xmlbeans.XmlException;
+import org.openxml4j.exceptions.OpenXML4JException;
+import org.openxml4j.opc.Package;
+import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTBody;
+import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTP;
+import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTR;
+import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTText;
+
+/**
+ * Helper class to extract text from an OOXML Word file
+ */
+public class HXFWordExtractor extends POIXMLTextExtractor {
+ private HWPFXMLDocument document;
+
+ public HXFWordExtractor(Package container) throws XmlException, OpenXML4JException, IOException {
+ this(new HWPFXMLDocument(
+ new HWPFXML(container)
+ ));
+ }
+ public HXFWordExtractor(HWPFXMLDocument document) {
+ super(document);
+ this.document = document;
+ }
+
+ public static void main(String[] args) throws Exception {
+ if(args.length < 1) {
+ System.err.println("Use:");
+ System.err.println(" HXFWordExtractor <filename.docx>");
+ System.exit(1);
+ }
+ POIXMLTextExtractor extractor =
+ new HXFWordExtractor(HXFDocument.openPackage(
+ new File(args[0])
+ ));
+ System.out.println(extractor.getText());
+ }
+
+ public String getText() {
+ CTBody body = document._getHWPFXML().getDocumentBody();
+ StringBuffer text = new StringBuffer();
+
+ // Loop over paragraphs
+ CTP[] ps = body.getPArray();
+ for (int i = 0; i < ps.length; i++) {
+ // Loop over the runs in this paragraph
+ CTR[] rs = ps[i].getRArray();
+ for (int j = 0; j < rs.length; j++) {
+ // Loop over the text elements in this run
+ CTText[] texts = rs[j].getTArray();
+ for (int k = 0; k < texts.length; k++) {
+ text.append(
+ texts[k].getStringValue()
+ );
+ }
+ }
+ // New line after each paragraph.
+ text.append("\n");
+ }
+
+ return text.toString();
+ }
+}
diff --git a/src/scratchpad/ooxml-src/org/apache/poi/hwpf/usermodel/HWPFXMLDocument.java b/src/scratchpad/ooxml-src/org/apache/poi/hwpf/usermodel/HWPFXMLDocument.java
new file mode 100644
index 0000000000..64597e83dc
--- /dev/null
+++ b/src/scratchpad/ooxml-src/org/apache/poi/hwpf/usermodel/HWPFXMLDocument.java
@@ -0,0 +1,36 @@
+/* ====================================================================
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+==================================================================== */
+package org.apache.poi.hwpf.usermodel;
+
+import org.apache.poi.POIXMLDocument;
+import org.apache.poi.hwpf.HWPFXML;
+
+/**
+ * High-level representation of an OOXML text document.
+ */
+public class HWPFXMLDocument extends POIXMLDocument {
+ private HWPFXML hwpfXML;
+
+ public HWPFXMLDocument(HWPFXML xml) {
+ super(xml);
+ this.hwpfXML = xml;
+ }
+
+ public HWPFXML _getHWPFXML() {
+ return hwpfXML;
+ }
+}
diff --git a/src/scratchpad/ooxml-testcases/org/apache/poi/hwpf/TestHWPFXML.java b/src/scratchpad/ooxml-testcases/org/apache/poi/hwpf/TestHWPFXML.java
index 53f2b025ea..78209b9e1e 100644
--- a/src/scratchpad/ooxml-testcases/org/apache/poi/hwpf/TestHWPFXML.java
+++ b/src/scratchpad/ooxml-testcases/org/apache/poi/hwpf/TestHWPFXML.java
@@ -18,7 +18,6 @@ package org.apache.poi.hwpf;
import java.io.File;
-import org.apache.poi.hssf.HSSFXML;
import org.apache.poi.hxf.HXFDocument;
import org.openxml4j.opc.Package;
import org.openxml4j.opc.PackagePart;
diff --git a/src/scratchpad/ooxml-testcases/org/apache/poi/hwpf/extractor/TestHXFWordExtractor.java b/src/scratchpad/ooxml-testcases/org/apache/poi/hwpf/extractor/TestHXFWordExtractor.java
new file mode 100644
index 0000000000..62695b3a8c
--- /dev/null
+++ b/src/scratchpad/ooxml-testcases/org/apache/poi/hwpf/extractor/TestHXFWordExtractor.java
@@ -0,0 +1,117 @@
+/* ====================================================================
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+==================================================================== */
+package org.apache.poi.hwpf.extractor;
+
+import java.io.File;
+
+import org.apache.poi.hwpf.HWPFXML;
+import org.apache.poi.hwpf.usermodel.HWPFXMLDocument;
+import org.apache.poi.hxf.HXFDocument;
+
+import junit.framework.TestCase;
+
+/**
+ * Tests for HXFWordExtractor
+ */
+public class TestHXFWordExtractor extends TestCase {
+ /**
+ * A very simple file
+ */
+ private HWPFXML xmlA;
+ /**
+ * A fairly complex file
+ */
+ private HWPFXML xmlB;
+
+ protected void setUp() throws Exception {
+ super.setUp();
+
+ File fileA = new File(
+ System.getProperty("HWPF.testdata.path") +
+ File.separator + "sample.docx"
+ );
+ File fileB = new File(
+ System.getProperty("HWPF.testdata.path") +
+ File.separator + "IllustrativeCases.docx"
+ );
+
+ xmlA = new HWPFXML(HXFDocument.openPackage(fileA));
+ xmlB = new HWPFXML(HXFDocument.openPackage(fileB));
+ }
+
+ /**
+ * Get text out of the simple file
+ */
+ public void testGetSimpleText() throws Exception {
+ new HXFWordExtractor(xmlA.getPackage());
+ new HXFWordExtractor(new HWPFXMLDocument(xmlA));
+
+ HXFWordExtractor extractor =
+ new HXFWordExtractor(xmlA.getPackage());
+ extractor.getText();
+
+ String text = extractor.getText();
+ assertTrue(text.length() > 0);
+
+ // Check contents
+ assertTrue(text.startsWith(
+ "Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Nunc at risus vel erat tempus posuere. Aenean non ante. Suspendisse vehicula dolor sit amet odio."
+ ));
+ assertTrue(text.endsWith(
+ "Phasellus ultricies mi nec leo. Sed tempus. In sit amet lorem at velit faucibus vestibulum.\n"
+ ));
+
+ // Check number of paragraphs
+ int ps = 0;
+ char[] t = text.toCharArray();
+ for (int i = 0; i < t.length; i++) {
+ if(t[i] == '\n') { ps++; }
+ }
+ assertEquals(3, ps);
+ }
+
+ /**
+ * Tests getting the text out of a complex file
+ */
+ public void testGetComplexText() throws Exception {
+ HXFWordExtractor extractor =
+ new HXFWordExtractor(xmlB.getPackage());
+ extractor.getText();
+
+ String text = extractor.getText();
+ assertTrue(text.length() > 0);
+
+ char euro = '\u20ac';
+ System.err.println("'"+text.substring(text.length() - 20) + "'");
+
+ // Check contents
+ assertTrue(text.startsWith(
+ " \n(V) ILLUSTRATIVE CASES\n\n"
+ ));
+ assertTrue(text.endsWith(
+ "As well as gaining "+euro+"90 from child benefit increases, he will also receive the early childhood supplement of "+euro+"250 per quarter for Vincent for the full four quarters of the year.\n\n\n\n \n\n\n"
+ ));
+
+ // Check number of paragraphs
+ int ps = 0;
+ char[] t = text.toCharArray();
+ for (int i = 0; i < t.length; i++) {
+ if(t[i] == '\n') { ps++; }
+ }
+ assertEquals(79, ps);
+ }
+}