From: Nick Burch Date: Sun, 30 Dec 2007 16:53:42 +0000 (+0000) Subject: Support for extracting text from ooxml word documents X-Git-Tag: REL_3_0_3_BETA1~235 X-Git-Url: https://source.dussan.org/?a=commitdiff_plain;h=699dabdaf8d253e6d32938eac621794452664eae;p=poi.git Support for extracting text from ooxml word documents git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@607560 13f79535-47bb-0310-9956-ffa450edef68 --- diff --git a/src/scratchpad/ooxml-src/org/apache/poi/hssf/extractor/HXFExcelExtractor.java b/src/scratchpad/ooxml-src/org/apache/poi/hssf/extractor/HXFExcelExtractor.java index d91f049364..59f83d6d04 100644 --- a/src/scratchpad/ooxml-src/org/apache/poi/hssf/extractor/HXFExcelExtractor.java +++ b/src/scratchpad/ooxml-src/org/apache/poi/hssf/extractor/HXFExcelExtractor.java @@ -16,12 +16,14 @@ ==================================================================== */ package org.apache.poi.hssf.extractor; +import java.io.File; import java.io.IOException; import org.apache.poi.POIXMLTextExtractor; import org.apache.poi.hssf.HSSFXML; import org.apache.poi.hssf.usermodel.HSSFXMLCell; import org.apache.poi.hssf.usermodel.HSSFXMLWorkbook; +import org.apache.poi.hxf.HXFDocument; import org.apache.xmlbeans.XmlException; import org.openxml4j.exceptions.OpenXML4JException; import org.openxml4j.opc.Package; @@ -30,6 +32,9 @@ import org.openxmlformats.schemas.spreadsheetml.x2006.main.CTRow; import org.openxmlformats.schemas.spreadsheetml.x2006.main.CTSheet; import org.openxmlformats.schemas.spreadsheetml.x2006.main.CTWorksheet; +/** + * Helper class to extract text from an OOXML Excel file + */ public class HXFExcelExtractor extends POIXMLTextExtractor { private HSSFXMLWorkbook workbook; private boolean includeSheetNames = true; @@ -44,6 +49,19 @@ public class HXFExcelExtractor extends POIXMLTextExtractor { super(workbook); this.workbook = workbook; } + + public static void main(String[] args) throws Exception { + if(args.length < 1) { + System.err.println("Use:"); 
+ System.err.println(" HXFExcelExtractor <filename.xlsx>"); + System.exit(1); + } + POIXMLTextExtractor extractor = + new HXFExcelExtractor(HXFDocument.openPackage( + new File(args[0]) + )); + System.out.println(extractor.getText()); + } /** * Should sheet names be included? Default is true diff --git a/src/scratchpad/ooxml-src/org/apache/poi/hssf/usermodel/HSSFXMLWorkbook.java b/src/scratchpad/ooxml-src/org/apache/poi/hssf/usermodel/HSSFXMLWorkbook.java index 0e85e96791..16b93f61f4 100644 --- a/src/scratchpad/ooxml-src/org/apache/poi/hssf/usermodel/HSSFXMLWorkbook.java +++ b/src/scratchpad/ooxml-src/org/apache/poi/hssf/usermodel/HSSFXMLWorkbook.java @@ -19,6 +19,12 @@ package org.apache.poi.hssf.usermodel; import org.apache.poi.POIXMLDocument; import org.apache.poi.hssf.HSSFXML; +/** + * High level representation of a ooxml workbook. + * This is the first object most users will construct whether + * they are reading or writing a workbook. It is also the + * top level object for creating new sheets/etc. + */ public class HSSFXMLWorkbook extends POIXMLDocument { private HSSFXML hssfXML; diff --git a/src/scratchpad/ooxml-src/org/apache/poi/hwpf/extractor/HXFWordExtractor.java b/src/scratchpad/ooxml-src/org/apache/poi/hwpf/extractor/HXFWordExtractor.java new file mode 100644 index 0000000000..a4427e49ec --- /dev/null +++ b/src/scratchpad/ooxml-src/org/apache/poi/hwpf/extractor/HXFWordExtractor.java @@ -0,0 +1,87 @@ +/* ==================================================================== + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. 
You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +==================================================================== */ +package org.apache.poi.hwpf.extractor; + +import java.io.File; +import java.io.IOException; + +import org.apache.poi.POIXMLTextExtractor; +import org.apache.poi.hwpf.HWPFXML; +import org.apache.poi.hwpf.usermodel.HWPFXMLDocument; +import org.apache.poi.hxf.HXFDocument; +import org.apache.xmlbeans.XmlException; +import org.openxml4j.exceptions.OpenXML4JException; +import org.openxml4j.opc.Package; +import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTBody; +import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTP; +import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTR; +import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTText; + +/** + * Helper class to extract text from an OOXML Word file + */ +public class HXFWordExtractor extends POIXMLTextExtractor { + private HWPFXMLDocument document; + + public HXFWordExtractor(Package container) throws XmlException, OpenXML4JException, IOException { + this(new HWPFXMLDocument( + new HWPFXML(container) + )); + } + public HXFWordExtractor(HWPFXMLDocument document) { + super(document); + this.document = document; + } + + public static void main(String[] args) throws Exception { + if(args.length < 1) { + System.err.println("Use:"); + System.err.println(" HXFWordExtractor <filename.docx>"); + System.exit(1); + } + POIXMLTextExtractor extractor = + new HXFWordExtractor(HXFDocument.openPackage( + new File(args[0]) + )); + System.out.println(extractor.getText()); + } + + public String getText() { + CTBody body = 
document._getHWPFXML().getDocumentBody(); + StringBuffer text = new StringBuffer(); + + // Loop over paragraphs + CTP[] ps = body.getPArray(); + for (int i = 0; i < ps.length; i++) { + // Loop over ranges + CTR[] rs = ps[i].getRArray(); + for (int j = 0; j < rs.length; j++) { + // Loop over text runs + CTText[] texts = rs[j].getTArray(); + for (int k = 0; k < texts.length; k++) { + text.append( + texts[k].getStringValue() + ); + } + } + // New line after each paragraph. + text.append("\n"); + } + + return text.toString(); + } +} diff --git a/src/scratchpad/ooxml-src/org/apache/poi/hwpf/usermodel/HWPFXMLDocument.java b/src/scratchpad/ooxml-src/org/apache/poi/hwpf/usermodel/HWPFXMLDocument.java new file mode 100644 index 0000000000..64597e83dc --- /dev/null +++ b/src/scratchpad/ooxml-src/org/apache/poi/hwpf/usermodel/HWPFXMLDocument.java @@ -0,0 +1,36 @@ +/* ==================================================================== + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +==================================================================== */ +package org.apache.poi.hwpf.usermodel; + +import org.apache.poi.POIXMLDocument; +import org.apache.poi.hwpf.HWPFXML; + +/** + * High level representation of a ooxml text document. 
+ */ +public class HWPFXMLDocument extends POIXMLDocument { + private HWPFXML hwpfXML; + + public HWPFXMLDocument(HWPFXML xml) { + super(xml); + this.hwpfXML = xml; + } + + public HWPFXML _getHWPFXML() { + return hwpfXML; + } +} diff --git a/src/scratchpad/ooxml-testcases/org/apache/poi/hwpf/TestHWPFXML.java b/src/scratchpad/ooxml-testcases/org/apache/poi/hwpf/TestHWPFXML.java index 53f2b025ea..78209b9e1e 100644 --- a/src/scratchpad/ooxml-testcases/org/apache/poi/hwpf/TestHWPFXML.java +++ b/src/scratchpad/ooxml-testcases/org/apache/poi/hwpf/TestHWPFXML.java @@ -18,7 +18,6 @@ package org.apache.poi.hwpf; import java.io.File; -import org.apache.poi.hssf.HSSFXML; import org.apache.poi.hxf.HXFDocument; import org.openxml4j.opc.Package; import org.openxml4j.opc.PackagePart; diff --git a/src/scratchpad/ooxml-testcases/org/apache/poi/hwpf/extractor/TestHXFWordExtractor.java b/src/scratchpad/ooxml-testcases/org/apache/poi/hwpf/extractor/TestHXFWordExtractor.java new file mode 100644 index 0000000000..62695b3a8c --- /dev/null +++ b/src/scratchpad/ooxml-testcases/org/apache/poi/hwpf/extractor/TestHXFWordExtractor.java @@ -0,0 +1,117 @@ +/* ==================================================================== + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+==================================================================== */ +package org.apache.poi.hwpf.extractor; + +import java.io.File; + +import org.apache.poi.hwpf.HWPFXML; +import org.apache.poi.hwpf.usermodel.HWPFXMLDocument; +import org.apache.poi.hxf.HXFDocument; + +import junit.framework.TestCase; + +/** + * Tests for HXFWordExtractor + */ +public class TestHXFWordExtractor extends TestCase { + /** + * A very simple file + */ + private HWPFXML xmlA; + /** + * A fairly complex file + */ + private HWPFXML xmlB; + + protected void setUp() throws Exception { + super.setUp(); + + File fileA = new File( + System.getProperty("HWPF.testdata.path") + + File.separator + "sample.docx" + ); + File fileB = new File( + System.getProperty("HWPF.testdata.path") + + File.separator + "IllustrativeCases.docx" + ); + + xmlA = new HWPFXML(HXFDocument.openPackage(fileA)); + xmlB = new HWPFXML(HXFDocument.openPackage(fileB)); + } + + /** + * Get text out of the simple file + */ + public void testGetSimpleText() throws Exception { + new HXFWordExtractor(xmlA.getPackage()); + new HXFWordExtractor(new HWPFXMLDocument(xmlA)); + + HXFWordExtractor extractor = + new HXFWordExtractor(xmlA.getPackage()); + extractor.getText(); + + String text = extractor.getText(); + assertTrue(text.length() > 0); + + // Check contents + assertTrue(text.startsWith( + "Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Nunc at risus vel erat tempus posuere. Aenean non ante. Suspendisse vehicula dolor sit amet odio." + )); + assertTrue(text.endsWith( + "Phasellus ultricies mi nec leo. Sed tempus. 
In sit amet lorem at velit faucibus vestibulum.\n" + )); + + // Check number of paragraphs + int ps = 0; + char[] t = text.toCharArray(); + for (int i = 0; i < t.length; i++) { + if(t[i] == '\n') { ps++; } + } + assertEquals(3, ps); + } + + /** + * Tests getting the text out of a complex file + */ + public void testGetComplexText() throws Exception { + HXFWordExtractor extractor = + new HXFWordExtractor(xmlB.getPackage()); + extractor.getText(); + + String text = extractor.getText(); + assertTrue(text.length() > 0); + + char euro = '\u20ac'; + System.err.println("'"+text.substring(text.length() - 20) + "'"); + + // Check contents + assertTrue(text.startsWith( + " \n(V) ILLUSTRATIVE CASES\n\n" + )); + assertTrue(text.endsWith( + "As well as gaining "+euro+"90 from child benefit increases, he will also receive the early childhood supplement of "+euro+"250 per quarter for Vincent for the full four quarters of the year.\n\n\n\n \n\n\n" + )); + + // Check number of paragraphs + int ps = 0; + char[] t = text.toCharArray(); + for (int i = 0; i < t.length; i++) { + if(t[i] == '\n') { ps++; } + } + assertEquals(79, ps); + } +}