source.dussan.org Git - poi.git/commitdiff
Support for extracting text from ooxml word documents
author: Nick Burch <nick@apache.org>
Sun, 30 Dec 2007 16:53:42 +0000 (16:53 +0000)
committer: Nick Burch <nick@apache.org>
Sun, 30 Dec 2007 16:53:42 +0000 (16:53 +0000)
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@607560 13f79535-47bb-0310-9956-ffa450edef68

src/scratchpad/ooxml-src/org/apache/poi/hssf/extractor/HXFExcelExtractor.java
src/scratchpad/ooxml-src/org/apache/poi/hssf/usermodel/HSSFXMLWorkbook.java
src/scratchpad/ooxml-src/org/apache/poi/hwpf/extractor/HXFWordExtractor.java [new file with mode: 0644]
src/scratchpad/ooxml-src/org/apache/poi/hwpf/usermodel/HWPFXMLDocument.java [new file with mode: 0644]
src/scratchpad/ooxml-testcases/org/apache/poi/hwpf/TestHWPFXML.java
src/scratchpad/ooxml-testcases/org/apache/poi/hwpf/extractor/TestHXFWordExtractor.java [new file with mode: 0644]

index d91f049364cdfb1862106fc5cf19ab87b6f7b6be..59f83d6d041da2eb383bf6f765a33a40e13d1a72 100644 (file)
 ==================================================================== */
 package org.apache.poi.hssf.extractor;
 
+import java.io.File;
 import java.io.IOException;
 
 import org.apache.poi.POIXMLTextExtractor;
 import org.apache.poi.hssf.HSSFXML;
 import org.apache.poi.hssf.usermodel.HSSFXMLCell;
 import org.apache.poi.hssf.usermodel.HSSFXMLWorkbook;
+import org.apache.poi.hxf.HXFDocument;
 import org.apache.xmlbeans.XmlException;
 import org.openxml4j.exceptions.OpenXML4JException;
 import org.openxml4j.opc.Package;
@@ -30,6 +32,9 @@ import org.openxmlformats.schemas.spreadsheetml.x2006.main.CTRow;
 import org.openxmlformats.schemas.spreadsheetml.x2006.main.CTSheet;
 import org.openxmlformats.schemas.spreadsheetml.x2006.main.CTWorksheet;
 
+/**
+ * Helper class to extract text from an OOXML Excel file
+ */
 public class HXFExcelExtractor extends POIXMLTextExtractor {
        private HSSFXMLWorkbook workbook;
        private boolean includeSheetNames = true;
@@ -44,6 +49,19 @@ public class HXFExcelExtractor extends POIXMLTextExtractor {
                super(workbook);
                this.workbook = workbook;
        }
+       
+       public static void main(String[] args) throws Exception {
+               if(args.length < 1) {
+                       System.err.println("Use:");
+                       System.err.println("  HXFExcelExtractor <filename.xlsx>");
+                       System.exit(1);
+               }
+               POIXMLTextExtractor extractor = 
+                       new HXFExcelExtractor(HXFDocument.openPackage(
+                                       new File(args[0])
+                       ));
+               System.out.println(extractor.getText());
+       }
 
        /**
         * Should sheet names be included? Default is true
index 0e85e96791cfabee8b6ac5963409e8456fe2d11b..16b93f61f409556400f28d455cb6fd6f93ce73d9 100644 (file)
@@ -19,6 +19,12 @@ package org.apache.poi.hssf.usermodel;
 import org.apache.poi.POIXMLDocument;
 import org.apache.poi.hssf.HSSFXML;
 
+/**
+ * High level representation of an OOXML workbook.
+ * This is the first object most users will construct whether
+ *  they are reading or writing a workbook. It is also the
+ *  top level object for creating new sheets/etc.
+ */
 public class HSSFXMLWorkbook extends POIXMLDocument {
        private HSSFXML hssfXML;
        
diff --git a/src/scratchpad/ooxml-src/org/apache/poi/hwpf/extractor/HXFWordExtractor.java b/src/scratchpad/ooxml-src/org/apache/poi/hwpf/extractor/HXFWordExtractor.java
new file mode 100644 (file)
index 0000000..a4427e4
--- /dev/null
@@ -0,0 +1,87 @@
+/* ====================================================================
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+==================================================================== */
+package org.apache.poi.hwpf.extractor;
+
+import java.io.File;
+import java.io.IOException;
+
+import org.apache.poi.POIXMLTextExtractor;
+import org.apache.poi.hwpf.HWPFXML;
+import org.apache.poi.hwpf.usermodel.HWPFXMLDocument;
+import org.apache.poi.hxf.HXFDocument;
+import org.apache.xmlbeans.XmlException;
+import org.openxml4j.exceptions.OpenXML4JException;
+import org.openxml4j.opc.Package;
+import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTBody;
+import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTP;
+import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTR;
+import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTText;
+
+/**
+ * Helper class to extract text from an OOXML Word file
+ */
+public class HXFWordExtractor extends POIXMLTextExtractor {
+       private HWPFXMLDocument document;
+       
+       public HXFWordExtractor(Package container) throws XmlException, OpenXML4JException, IOException {
+               this(new HWPFXMLDocument(
+                               new HWPFXML(container)
+               ));
+       }
+       public HXFWordExtractor(HWPFXMLDocument document) {
+               super(document);
+               this.document = document;
+       }
+       
+       public static void main(String[] args) throws Exception {
+               if(args.length < 1) {
+                       System.err.println("Use:");
+                       System.err.println("  HXFWordExtractor <filename.xlsx>");
+                       System.exit(1);
+               }
+               POIXMLTextExtractor extractor = 
+                       new HXFWordExtractor(HXFDocument.openPackage(
+                                       new File(args[0])
+                       ));
+               System.out.println(extractor.getText());
+       }
+
+       public String getText() {
+               CTBody body = document._getHWPFXML().getDocumentBody();
+               StringBuffer text = new StringBuffer();
+               
+               // Loop over paragraphs
+               CTP[] ps = body.getPArray();
+               for (int i = 0; i < ps.length; i++) {
+                       // Loop over ranges
+                       CTR[] rs = ps[i].getRArray();
+                       for (int j = 0; j < rs.length; j++) {
+                               // Loop over text runs
+                               CTText[] texts = rs[j].getTArray();
+                               for (int k = 0; k < texts.length; k++) {
+                                       text.append(
+                                                       texts[k].getStringValue()
+                                       );
+                               }
+                       }
+                       // New line after each paragraph.
+                       text.append("\n");
+               }
+               
+               return text.toString();
+       }
+}
diff --git a/src/scratchpad/ooxml-src/org/apache/poi/hwpf/usermodel/HWPFXMLDocument.java b/src/scratchpad/ooxml-src/org/apache/poi/hwpf/usermodel/HWPFXMLDocument.java
new file mode 100644 (file)
index 0000000..64597e8
--- /dev/null
@@ -0,0 +1,36 @@
+/* ====================================================================
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+==================================================================== */
+package org.apache.poi.hwpf.usermodel;
+
+import org.apache.poi.POIXMLDocument;
+import org.apache.poi.hwpf.HWPFXML;
+
+/**
+ * High level representation of a ooxml text document.
+ */
+public class HWPFXMLDocument extends POIXMLDocument {
+       private HWPFXML hwpfXML;
+       
+       public HWPFXMLDocument(HWPFXML xml) {
+               super(xml);
+               this.hwpfXML = xml;
+       }
+       
+       public HWPFXML _getHWPFXML() {
+               return hwpfXML;
+       }
+}
index 53f2b025ea95a06ca6f4e0c58eef8ad304c1fa9e..78209b9e1e12b2298eeb2db62d646c5cd7cc288b 100644 (file)
@@ -18,7 +18,6 @@ package org.apache.poi.hwpf;
 
 import java.io.File;
 
-import org.apache.poi.hssf.HSSFXML;
 import org.apache.poi.hxf.HXFDocument;
 import org.openxml4j.opc.Package;
 import org.openxml4j.opc.PackagePart;
diff --git a/src/scratchpad/ooxml-testcases/org/apache/poi/hwpf/extractor/TestHXFWordExtractor.java b/src/scratchpad/ooxml-testcases/org/apache/poi/hwpf/extractor/TestHXFWordExtractor.java
new file mode 100644 (file)
index 0000000..62695b3
--- /dev/null
@@ -0,0 +1,117 @@
+/* ====================================================================
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+==================================================================== */
+package org.apache.poi.hwpf.extractor;
+
+import java.io.File;
+
+import org.apache.poi.hwpf.HWPFXML;
+import org.apache.poi.hwpf.usermodel.HWPFXMLDocument;
+import org.apache.poi.hxf.HXFDocument;
+
+import junit.framework.TestCase;
+
+/**
+ * Tests for HXFWordExtractor
+ */
+public class TestHXFWordExtractor extends TestCase {
+       /**
+        * A very simple file
+        */
+       private HWPFXML xmlA;
+       /**
+        * A fairly complex file
+        */
+       private HWPFXML xmlB;
+
+       protected void setUp() throws Exception {
+               super.setUp();
+               
+               File fileA = new File(
+                               System.getProperty("HWPF.testdata.path") +
+                               File.separator + "sample.docx"
+               );
+               File fileB = new File(
+                               System.getProperty("HWPF.testdata.path") +
+                               File.separator + "IllustrativeCases.docx"
+               );
+               
+               xmlA = new HWPFXML(HXFDocument.openPackage(fileA));
+               xmlB = new HWPFXML(HXFDocument.openPackage(fileB));
+       }
+
+       /**
+        * Get text out of the simple file
+        */
+       public void testGetSimpleText() throws Exception {
+               new HXFWordExtractor(xmlA.getPackage());
+               new HXFWordExtractor(new HWPFXMLDocument(xmlA));
+               
+               HXFWordExtractor extractor = 
+                       new HXFWordExtractor(xmlA.getPackage());
+               extractor.getText();
+               
+               String text = extractor.getText();
+               assertTrue(text.length() > 0);
+               
+               // Check contents
+               assertTrue(text.startsWith(
+                               "Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Nunc at risus vel erat tempus posuere. Aenean non ante. Suspendisse vehicula dolor sit amet odio."
+               ));
+               assertTrue(text.endsWith(
+                               "Phasellus ultricies mi nec leo. Sed tempus. In sit amet lorem at velit faucibus vestibulum.\n"
+               ));
+               
+               // Check number of paragraphs
+               int ps = 0;
+               char[] t = text.toCharArray();
+               for (int i = 0; i < t.length; i++) {
+                       if(t[i] == '\n') { ps++; }
+               }
+               assertEquals(3, ps);
+       }
+       
+       /**
+        * Tests getting the text out of a complex file
+        */
+       public void testGetComplexText() throws Exception {
+               HXFWordExtractor extractor = 
+                       new HXFWordExtractor(xmlB.getPackage());
+               extractor.getText();
+               
+               String text = extractor.getText();
+               assertTrue(text.length() > 0);
+               
+               char euro = '\u20ac';
+               System.err.println("'"+text.substring(text.length() - 20) + "'");
+               
+               // Check contents
+               assertTrue(text.startsWith(
+                               "  \n(V) ILLUSTRATIVE CASES\n\n"
+               ));
+               assertTrue(text.endsWith(
+                               "As well as gaining "+euro+"90 from child benefit increases, he will also receive the early childhood supplement of "+euro+"250 per quarter for Vincent for the full four quarters of the year.\n\n\n\n \n\n\n"
+               ));
+               
+               // Check number of paragraphs
+               int ps = 0;
+               char[] t = text.toCharArray();
+               for (int i = 0; i < t.length; i++) {
+                       if(t[i] == '\n') { ps++; }
+               }
+               assertEquals(79, ps);
+       }
+}