/* ====================================================================
   Licensed to the Apache Software Foundation (ASF) under one or more
   contributor license agreements.  See the NOTICE file distributed with
   this work for additional information regarding copyright ownership.
   The ASF licenses this file to You under the Apache License, Version 2.0
   (the "License"); you may not use this file except in compliance with
   the License.  You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
==================================================================== */

// Provenance (recovered from the git patch this source was extracted from):
//   Commit:     586fc030ce5d36b9bdff1becfa442867dc710e37
//   Author:     Nick Burch
//   Date:       Tue, 8 Apr 2008 11:43:37 +0000
//   Subject:    Start on a factory for text extractors
//   git-svn-id: https://svn.apache.org/repos/asf/poi/branches/ooxml@645861 13f79535-47bb-0310-9956-ffa450edef68
//   Path:       src/scratchpad/src/org/apache/poi/extractor/ExtractorFactory.java

package org.apache.poi.extractor;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.PushbackInputStream;
import java.util.Iterator;

import org.openxml4j.exceptions.InvalidFormatException;
import org.openxml4j.exceptions.OpenXML4JException;
import org.openxml4j.opc.Package;
import org.openxml4j.opc.PackagePart;
import org.openxml4j.opc.PackageRelationshipCollection;

import org.apache.poi.POITextExtractor;
import org.apache.poi.POIXMLDocument;
import org.apache.poi.POIXMLTextExtractor;
import org.apache.poi.hssf.extractor.ExcelExtractor;
import org.apache.poi.poifs.filesystem.Entry;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.xslf.XSLFSlideShow;
import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor;
import org.apache.poi.xssf.extractor.XSSFExcelExtractor;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.apache.poi.xwpf.XWPFDocument;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.apache.xmlbeans.XmlException;

/**
 * Figures out the correct POITextExtractor for your supplied
 *  document, and returns it.
 *
 * <p>A document can be supplied as a {@link File}, an {@link InputStream},
 *  an already opened OOXML {@link Package} or an already opened OLE2
 *  {@link POIFSFileSystem}. Files and streams are first classified by their
 *  header (OLE2 vs OOXML) and then handed to the matching overload.</p>
 */
public class ExtractorFactory {
    /**
     * Relationship type used to look up the core document part of an
     *  OOXML package.
     */
    public static final String CORE_DOCUMENT_REL =
        "http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument";

    /**
     * Size of the pushback buffer used when a stream has to be wrapped
     *  before its header is inspected.
     * NOTE(review): presumably this is the number of header bytes the
     *  hasPOIFSHeader / hasOOXMLHeader checks read - confirm against them.
     */
    private static final int HEADER_PUSHBACK_SIZE = 8;

    /**
     * Creates a text extractor for the given file.
     *
     * <p>The file is opened, its header is checked, and the stream opened
     *  here is always closed again before this method returns or throws.
     *  OOXML files are re-opened by path through
     *  {@link Package#open(String)} once the probing stream is closed.</p>
     *
     * @param f the file to create an extractor for
     * @return an extractor suitable for the file's contents
     * @throws IOException if the file cannot be opened or read
     * @throws InvalidFormatException if the OOXML package cannot be opened
     * @throws OpenXML4JException if the OOXML package cannot be processed
     * @throws XmlException if the OOXML contents cannot be parsed
     * @throws IllegalArgumentException if the file is neither an OLE2
     *  file nor an OOXML file, or holds no supported document
     */
    public POITextExtractor createExtractor(File f) throws IOException, InvalidFormatException, OpenXML4JException, XmlException {
        // A plain FileInputStream does not support mark/reset, so it is
        //  wrapped in exactly the way createExtractor(InputStream) wraps
        //  such streams; otherwise the header checks could not hand the
        //  bytes they inspected back to the stream.
        InputStream inp = new PushbackInputStream(
                new FileInputStream(f), HEADER_PUSHBACK_SIZE);

        boolean isOOXML;
        try {
            if(POIFSFileSystem.hasPOIFSHeader(inp)) {
                // NOTE(review): closing the stream in the finally block
                //  assumes POIFSFileSystem(InputStream) has read everything
                //  it needs by the time its constructor returns - confirm.
                return createExtractor(new POIFSFileSystem(inp));
            }
            isOOXML = POIXMLDocument.hasOOXMLHeader(inp);
        } finally {
            // Never leak the file handle, whichever way we leave
            closeQuietly(inp);
        }

        if(isOOXML) {
            // The probing stream is already closed; open the package by path
            return createExtractor(Package.open(f.toString()));
        }
        throw new IllegalArgumentException("Your File was neither an OLE2 file, nor an OOXML file");
    }

    /**
     * Creates a text extractor for the document held in the given stream.
     *
     * <p>If the stream does not support mark/reset, it is wrapped in a
     *  {@link PushbackInputStream} so that the header can be inspected.
     *  This method does not itself close the stream.</p>
     *
     * @param inp the stream to read the document from
     * @return an extractor suitable for the stream's contents
     * @throws IOException if the stream cannot be read
     * @throws InvalidFormatException if the OOXML package cannot be opened
     * @throws OpenXML4JException if the OOXML package cannot be processed
     * @throws XmlException if the OOXML contents cannot be parsed
     * @throws IllegalArgumentException if the stream is neither an OLE2
     *  stream nor an OOXML stream, or holds no supported document
     */
    public POITextExtractor createExtractor(InputStream inp) throws IOException, InvalidFormatException, OpenXML4JException, XmlException {
        // Figure out the kind of stream
        // If clearly doesn't do mark/reset, wrap up
        if(! inp.markSupported()) {
            inp = new PushbackInputStream(inp, HEADER_PUSHBACK_SIZE);
        }

        if(POIFSFileSystem.hasPOIFSHeader(inp)) {
            return createExtractor(new POIFSFileSystem(inp));
        }
        if(POIXMLDocument.hasOOXMLHeader(inp)) {
            return createExtractor(Package.open(inp));
        }
        throw new IllegalArgumentException("Your InputStream was neither an OLE2 stream, nor an OOXML stream");
    }

    /**
     * Creates a text extractor for an already opened OOXML package.
     *
     * <p>The package must have exactly one relationship of type
     *  {@link #CORE_DOCUMENT_REL}; the content type of the part it points
     *  to selects the spreadsheet, word processing or presentation
     *  extractor.</p>
     *
     * @param pkg the OOXML package to create an extractor for
     * @return an extractor matching the package's core document
     * @throws IOException if the package cannot be read
     * @throws OpenXML4JException if the package cannot be processed
     * @throws XmlException if the package contents cannot be parsed
     * @throws IllegalArgumentException if the package does not have
     *  exactly one core document, or its content type is not supported
     */
    public POIXMLTextExtractor createExtractor(Package pkg) throws IOException, OpenXML4JException, XmlException {
        PackageRelationshipCollection core =
            pkg.getRelationshipsByType(CORE_DOCUMENT_REL);
        if(core.size() != 1) {
            throw new IllegalArgumentException("Invalid OOXML Package received - expected 1 core document, found " + core.size());
        }

        // Dispatch on the content type of the single core document part
        PackagePart corePart = pkg.getPart(core.getRelationship(0));
        if(corePart.getContentType().equals(XSSFWorkbook.WORKBOOK.getContentType())) {
            return new XSSFExcelExtractor(pkg);
        }
        if(corePart.getContentType().equals(XWPFDocument.MAIN_CONTENT_TYPE)) {
            return new XWPFWordExtractor(pkg);
        }
        if(corePart.getContentType().equals(XSLFSlideShow.MAIN_CONTENT_TYPE)) {
            return new XSLFPowerPointExtractor(pkg);
        }
        throw new IllegalArgumentException("No supported documents found in the OOXML package");
    }

    /**
     * Creates a text extractor for an already opened OLE2 filesystem.
     *
     * <p>The entries of the root directory are scanned by name; at
     *  present only a "Workbook" entry is recognised, which yields an
     *  {@link ExcelExtractor}.</p>
     *
     * @param fs the OLE2 filesystem to create an extractor for
     * @return an extractor matching the filesystem's contents
     * @throws IOException if the filesystem cannot be read
     * @throws IllegalArgumentException if no supported entry is found
     */
    public POITextExtractor createExtractor(POIFSFileSystem fs) throws IOException {
        // Look for certain entries in the stream, to figure it
        //  out from
        for(Iterator entries = fs.getRoot().getEntries(); entries.hasNext(); ) {
            Entry entry = (Entry)entries.next();
            if(entry.getName().equals("Workbook")) {
                return new ExcelExtractor(fs);
            }
        }
        throw new IllegalArgumentException("No supported documents found in the OLE2 stream");
    }

    /**
     * Closes a stream that was only ever read from, ignoring any failure.
     */
    private static void closeQuietly(InputStream stream) {
        try {
            stream.close();
        } catch(IOException ignored) {
            // Best-effort: nothing was written through this stream, so a
            //  failed close cannot lose data, and it must not mask the
            //  result or exception of the surrounding operation.
        }
    }
}