diff options
author | Nick Burch <nick@apache.org> | 2007-12-30 16:53:42 +0000 |
---|---|---|
committer | Nick Burch <nick@apache.org> | 2007-12-30 16:53:42 +0000 |
commit | 699dabdaf8d253e6d32938eac621794452664eae (patch) | |
tree | 8b9fe536be28ce19653901d04af01c94cbd2b52d /src/scratchpad/ooxml-src/org/apache/poi | |
parent | 9b96000af0c686768e57dacc77358f6e7be1e0c2 (diff) | |
download | poi-699dabdaf8d253e6d32938eac621794452664eae.tar.gz poi-699dabdaf8d253e6d32938eac621794452664eae.zip |
Support for extracting text from ooxml word documents
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@607560 13f79535-47bb-0310-9956-ffa450edef68
Diffstat (limited to 'src/scratchpad/ooxml-src/org/apache/poi')
4 files changed, 147 insertions, 0 deletions
diff --git a/src/scratchpad/ooxml-src/org/apache/poi/hssf/extractor/HXFExcelExtractor.java b/src/scratchpad/ooxml-src/org/apache/poi/hssf/extractor/HXFExcelExtractor.java index d91f049364..59f83d6d04 100644 --- a/src/scratchpad/ooxml-src/org/apache/poi/hssf/extractor/HXFExcelExtractor.java +++ b/src/scratchpad/ooxml-src/org/apache/poi/hssf/extractor/HXFExcelExtractor.java @@ -16,12 +16,14 @@ ==================================================================== */ package org.apache.poi.hssf.extractor; +import java.io.File; import java.io.IOException; import org.apache.poi.POIXMLTextExtractor; import org.apache.poi.hssf.HSSFXML; import org.apache.poi.hssf.usermodel.HSSFXMLCell; import org.apache.poi.hssf.usermodel.HSSFXMLWorkbook; +import org.apache.poi.hxf.HXFDocument; import org.apache.xmlbeans.XmlException; import org.openxml4j.exceptions.OpenXML4JException; import org.openxml4j.opc.Package; @@ -30,6 +32,9 @@ import org.openxmlformats.schemas.spreadsheetml.x2006.main.CTRow; import org.openxmlformats.schemas.spreadsheetml.x2006.main.CTSheet; import org.openxmlformats.schemas.spreadsheetml.x2006.main.CTWorksheet; +/** + * Helper class to extract text from an OOXML Excel file + */ public class HXFExcelExtractor extends POIXMLTextExtractor { private HSSFXMLWorkbook workbook; private boolean includeSheetNames = true; @@ -44,6 +49,19 @@ public class HXFExcelExtractor extends POIXMLTextExtractor { super(workbook); this.workbook = workbook; } + + public static void main(String[] args) throws Exception { + if(args.length < 1) { + System.err.println("Use:"); + System.err.println(" HXFExcelExtractor <filename.xlsx>"); + System.exit(1); + } + POIXMLTextExtractor extractor = + new HXFExcelExtractor(HXFDocument.openPackage( + new File(args[0]) + )); + System.out.println(extractor.getText()); + } /** * Should sheet names be included? Default is true diff --git a/src/scratchpad/ooxml-src/org/apache/poi/hssf/usermodel/HSSFXMLWorkbook.java b/src/scratchpad/ooxml-src/org/apache/poi/hssf/usermodel/HSSFXMLWorkbook.java index 0e85e96791..16b93f61f4 100644 --- a/src/scratchpad/ooxml-src/org/apache/poi/hssf/usermodel/HSSFXMLWorkbook.java +++ b/src/scratchpad/ooxml-src/org/apache/poi/hssf/usermodel/HSSFXMLWorkbook.java @@ -19,6 +19,12 @@ package org.apache.poi.hssf.usermodel; import org.apache.poi.POIXMLDocument; import org.apache.poi.hssf.HSSFXML; +/** + * High level representation of a ooxml workbook. + * This is the first object most users will construct whether + * they are reading or writing a workbook. It is also the + * top level object for creating new sheets/etc. + */ public class HSSFXMLWorkbook extends POIXMLDocument { private HSSFXML hssfXML; diff --git a/src/scratchpad/ooxml-src/org/apache/poi/hwpf/extractor/HXFWordExtractor.java b/src/scratchpad/ooxml-src/org/apache/poi/hwpf/extractor/HXFWordExtractor.java new file mode 100644 index 0000000000..a4427e49ec --- /dev/null +++ b/src/scratchpad/ooxml-src/org/apache/poi/hwpf/extractor/HXFWordExtractor.java @@ -0,0 +1,87 @@ +/* ==================================================================== + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +==================================================================== */ +package org.apache.poi.hwpf.extractor; + +import java.io.File; +import java.io.IOException; + +import org.apache.poi.POIXMLTextExtractor; +import org.apache.poi.hwpf.HWPFXML; +import org.apache.poi.hwpf.usermodel.HWPFXMLDocument; +import org.apache.poi.hxf.HXFDocument; +import org.apache.xmlbeans.XmlException; +import org.openxml4j.exceptions.OpenXML4JException; +import org.openxml4j.opc.Package; +import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTBody; +import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTP; +import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTR; +import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTText; + +/** + * Helper class to extract text from an OOXML Word file + */ +public class HXFWordExtractor extends POIXMLTextExtractor { + private HWPFXMLDocument document; + + public HXFWordExtractor(Package container) throws XmlException, OpenXML4JException, IOException { + this(new HWPFXMLDocument( + new HWPFXML(container) + )); + } + public HXFWordExtractor(HWPFXMLDocument document) { + super(document); + this.document = document; + } + + public static void main(String[] args) throws Exception { + if(args.length < 1) { + System.err.println("Use:"); + System.err.println(" HXFWordExtractor <filename.xlsx>"); + System.exit(1); + } + POIXMLTextExtractor extractor = + new HXFWordExtractor(HXFDocument.openPackage( + new File(args[0]) + )); + System.out.println(extractor.getText()); + } + + public String getText() { + CTBody body = document._getHWPFXML().getDocumentBody(); + StringBuffer text = new StringBuffer(); + + // Loop over paragraphs + CTP[] ps = body.getPArray(); + for (int i = 0; i < ps.length; i++) { + // Loop over ranges + CTR[] rs = ps[i].getRArray(); + for (int j = 0; j < rs.length; j++) { + // Loop over text runs + CTText[] texts = rs[j].getTArray(); + for (int k = 0; k < texts.length; k++) { + text.append( + texts[k].getStringValue() + ); + } + } + // New line after each paragraph. + text.append("\n"); + } + + return text.toString(); + } +} diff --git a/src/scratchpad/ooxml-src/org/apache/poi/hwpf/usermodel/HWPFXMLDocument.java b/src/scratchpad/ooxml-src/org/apache/poi/hwpf/usermodel/HWPFXMLDocument.java new file mode 100644 index 0000000000..64597e83dc --- /dev/null +++ b/src/scratchpad/ooxml-src/org/apache/poi/hwpf/usermodel/HWPFXMLDocument.java @@ -0,0 +1,36 @@ +/* ==================================================================== + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +==================================================================== */ +package org.apache.poi.hwpf.usermodel; + +import org.apache.poi.POIXMLDocument; +import org.apache.poi.hwpf.HWPFXML; + +/** + * High level representation of a ooxml text document. + */ +public class HWPFXMLDocument extends POIXMLDocument { + private HWPFXML hwpfXML; + + public HWPFXMLDocument(HWPFXML xml) { + super(xml); + this.hwpfXML = xml; + } + + public HWPFXML _getHWPFXML() { + return hwpfXML; + } +} |