From: Dominik Stadler Date: Tue, 12 Jul 2016 15:40:18 +0000 (+0000) Subject: Fix name of new OLE2ScratchpadExtractorFactory X-Git-Url: https://source.dussan.org/?a=commitdiff_plain;h=7beffbe3e39db94d5218ebf7076510f6812c1859;p=poi.git Fix name of new OLE2ScratchpadExtractorFactory git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1752304 13f79535-47bb-0310-9956-ffa450edef68 --- diff --git a/src/java/org/apache/poi/extractor/OLE2ExtractorFactory.java b/src/java/org/apache/poi/extractor/OLE2ExtractorFactory.java index d1577898f1..737e9e3510 100644 --- a/src/java/org/apache/poi/extractor/OLE2ExtractorFactory.java +++ b/src/java/org/apache/poi/extractor/OLE2ExtractorFactory.java @@ -154,7 +154,7 @@ public class OLE2ExtractorFactory { private static Class getScratchpadClass() { try { return OLE2ExtractorFactory.class.getClassLoader().loadClass( - "org.apache.poi.extractor.OLE2ScrachpadExtractorFactory" + "org.apache.poi.extractor.OLE2ScratchpadExtractorFactory" ); } catch (ClassNotFoundException e) { LOGGER.log(POILogger.ERROR, "POI Scratchpad jar missing"); diff --git a/src/scratchpad/src/org/apache/poi/extractor/OLE2ScrachpadExtractorFactory.java b/src/scratchpad/src/org/apache/poi/extractor/OLE2ScrachpadExtractorFactory.java deleted file mode 100644 index 6f8428202f..0000000000 --- a/src/scratchpad/src/org/apache/poi/extractor/OLE2ScrachpadExtractorFactory.java +++ /dev/null @@ -1,145 +0,0 @@ -/* ==================================================================== - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. 
You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -==================================================================== */ -package org.apache.poi.extractor; - -import java.io.ByteArrayInputStream; -import java.io.FileNotFoundException; -import java.io.IOException; -import java.io.InputStream; -import java.util.Iterator; -import java.util.List; - -import org.apache.poi.POIOLE2TextExtractor; -import org.apache.poi.POITextExtractor; -import org.apache.poi.hdgf.extractor.VisioTextExtractor; -import org.apache.poi.hpbf.extractor.PublisherTextExtractor; -import org.apache.poi.hslf.extractor.PowerPointExtractor; -import org.apache.poi.hsmf.MAPIMessage; -import org.apache.poi.hsmf.datatypes.AttachmentChunks; -import org.apache.poi.hsmf.extractor.OutlookTextExtactor; -import org.apache.poi.hwpf.OldWordFileFormatException; -import org.apache.poi.hwpf.extractor.Word6Extractor; -import org.apache.poi.hwpf.extractor.WordExtractor; -import org.apache.poi.poifs.filesystem.DirectoryEntry; -import org.apache.poi.poifs.filesystem.DirectoryNode; -import org.apache.poi.poifs.filesystem.Entry; - -/** - * Scratchpad-specific logic for {@link OLE2ExtractorFactory} and - * {@link ExtractorFactory}, which permit the other two to run with - * no Scratchpad jar (though without functionality!) - *

<p>Note - should not be used standalone, always use via the other - * two classes</p>

- */ -@SuppressWarnings("WeakerAccess") -public class OLE2ScrachpadExtractorFactory { - /** - * Look for certain entries in the stream, to figure it - * out what format is desired - * Note - doesn't check for core-supported formats! - * Note - doesn't check for OOXML-supported formats - */ - public static POITextExtractor createExtractor(DirectoryNode poifsDir) throws IOException { - if (poifsDir.hasEntry("WordDocument")) { - // Old or new style word document? - try { - return new WordExtractor(poifsDir); - } catch (OldWordFileFormatException e) { - return new Word6Extractor(poifsDir); - } - } - - if (poifsDir.hasEntry("PowerPoint Document")) { - return new PowerPointExtractor(poifsDir); - } - - if (poifsDir.hasEntry("VisioDocument")) { - return new VisioTextExtractor(poifsDir); - } - - if (poifsDir.hasEntry("Quill")) { - return new PublisherTextExtractor(poifsDir); - } - - final String[] outlookEntryNames = new String[] { - // message bodies, saved as plain text (PtypString) - // The first short (0x1000, 0x0047, 0x0037) refer to the Property ID (see [MS-OXPROPS].pdf) - // the second short (0x001e, 0x001f, 0x0102) refer to the type of data stored in this entry - // https://msdn.microsoft.com/endatatypes.Ex-us/library/cc433490(v=exchg.80).aspx - // @see org.apache.poi.hsmf.Types.MAPIType - "__substg1.0_1000001E", //PidTagBody ASCII - "__substg1.0_1000001F", //PidTagBody Unicode - "__substg1.0_0047001E", //PidTagMessageSubmissionId ASCII - "__substg1.0_0047001F", //PidTagMessageSubmissionId Unicode - "__substg1.0_0037001E", //PidTagSubject ASCII - "__substg1.0_0037001F", //PidTagSubject Unicode - }; - for (String entryName : outlookEntryNames) { - if (poifsDir.hasEntry(entryName)) { - return new OutlookTextExtactor(poifsDir); - } - } - - throw new IllegalArgumentException("No supported documents found in the OLE2 stream"); - } - - /** - * Returns an array of text extractors, one for each of - * the embedded documents in the file (if there are any). 
- * If there are no embedded documents, you'll get back an - * empty array. Otherwise, you'll get one open - * {@link POITextExtractor} for each embedded file. - */ - public static void identifyEmbeddedResources(POIOLE2TextExtractor ext, List dirs, List nonPOIFS) throws IOException { - // Find all the embedded directories - DirectoryEntry root = ext.getRoot(); - if(root == null) { - throw new IllegalStateException("The extractor didn't know which POIFS it came from!"); - } - - if(ext instanceof WordExtractor) { - // These are in ObjectPool -> _... under the root - try { - DirectoryEntry op = (DirectoryEntry) - root.getEntry("ObjectPool"); - Iterator it = op.getEntries(); - while(it.hasNext()) { - Entry entry = it.next(); - if(entry.getName().startsWith("_")) { - dirs.add(entry); - } - } - } catch(FileNotFoundException e) { - // ignored here - } - //} else if(ext instanceof PowerPointExtractor) { - // Tricky, not stored directly in poifs - // TODO - } else if(ext instanceof OutlookTextExtactor) { - // Stored in the Attachment blocks - MAPIMessage msg = ((OutlookTextExtactor)ext).getMAPIMessage(); - for(AttachmentChunks attachment : msg.getAttachmentFiles()) { - if(attachment.attachData != null) { - byte[] data = attachment.attachData.getValue(); - nonPOIFS.add( new ByteArrayInputStream(data) ); - } else if(attachment.attachmentDirectory != null) { - dirs.add(attachment.attachmentDirectory.getDirectory()); - } - } - } - } -} diff --git a/src/scratchpad/src/org/apache/poi/extractor/OLE2ScratchpadExtractorFactory.java b/src/scratchpad/src/org/apache/poi/extractor/OLE2ScratchpadExtractorFactory.java new file mode 100644 index 0000000000..07ae6412e7 --- /dev/null +++ b/src/scratchpad/src/org/apache/poi/extractor/OLE2ScratchpadExtractorFactory.java @@ -0,0 +1,145 @@ +/* ==================================================================== + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. 
See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +==================================================================== */ +package org.apache.poi.extractor; + +import java.io.ByteArrayInputStream; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.io.InputStream; +import java.util.Iterator; +import java.util.List; + +import org.apache.poi.POIOLE2TextExtractor; +import org.apache.poi.POITextExtractor; +import org.apache.poi.hdgf.extractor.VisioTextExtractor; +import org.apache.poi.hpbf.extractor.PublisherTextExtractor; +import org.apache.poi.hslf.extractor.PowerPointExtractor; +import org.apache.poi.hsmf.MAPIMessage; +import org.apache.poi.hsmf.datatypes.AttachmentChunks; +import org.apache.poi.hsmf.extractor.OutlookTextExtactor; +import org.apache.poi.hwpf.OldWordFileFormatException; +import org.apache.poi.hwpf.extractor.Word6Extractor; +import org.apache.poi.hwpf.extractor.WordExtractor; +import org.apache.poi.poifs.filesystem.DirectoryEntry; +import org.apache.poi.poifs.filesystem.DirectoryNode; +import org.apache.poi.poifs.filesystem.Entry; + +/** + * Scratchpad-specific logic for {@link OLE2ExtractorFactory} and + * {@link ExtractorFactory}, which permit the other two to run with + * no Scratchpad jar (though without functionality!) + *

<p>Note - should not be used standalone, always use via the other + * two classes</p>

+ */ +@SuppressWarnings("WeakerAccess") +public class OLE2ScratchpadExtractorFactory { + /** + * Look for certain entries in the stream, to figure it + * out what format is desired + * Note - doesn't check for core-supported formats! + * Note - doesn't check for OOXML-supported formats + */ + public static POITextExtractor createExtractor(DirectoryNode poifsDir) throws IOException { + if (poifsDir.hasEntry("WordDocument")) { + // Old or new style word document? + try { + return new WordExtractor(poifsDir); + } catch (OldWordFileFormatException e) { + return new Word6Extractor(poifsDir); + } + } + + if (poifsDir.hasEntry("PowerPoint Document")) { + return new PowerPointExtractor(poifsDir); + } + + if (poifsDir.hasEntry("VisioDocument")) { + return new VisioTextExtractor(poifsDir); + } + + if (poifsDir.hasEntry("Quill")) { + return new PublisherTextExtractor(poifsDir); + } + + final String[] outlookEntryNames = new String[] { + // message bodies, saved as plain text (PtypString) + // The first short (0x1000, 0x0047, 0x0037) refer to the Property ID (see [MS-OXPROPS].pdf) + // the second short (0x001e, 0x001f, 0x0102) refer to the type of data stored in this entry + // https://msdn.microsoft.com/endatatypes.Ex-us/library/cc433490(v=exchg.80).aspx + // @see org.apache.poi.hsmf.Types.MAPIType + "__substg1.0_1000001E", //PidTagBody ASCII + "__substg1.0_1000001F", //PidTagBody Unicode + "__substg1.0_0047001E", //PidTagMessageSubmissionId ASCII + "__substg1.0_0047001F", //PidTagMessageSubmissionId Unicode + "__substg1.0_0037001E", //PidTagSubject ASCII + "__substg1.0_0037001F", //PidTagSubject Unicode + }; + for (String entryName : outlookEntryNames) { + if (poifsDir.hasEntry(entryName)) { + return new OutlookTextExtactor(poifsDir); + } + } + + throw new IllegalArgumentException("No supported documents found in the OLE2 stream"); + } + + /** + * Returns an array of text extractors, one for each of + * the embedded documents in the file (if there are any). 
+ * If there are no embedded documents, you'll get back an + * empty array. Otherwise, you'll get one open + * {@link POITextExtractor} for each embedded file. + */ + public static void identifyEmbeddedResources(POIOLE2TextExtractor ext, List dirs, List nonPOIFS) throws IOException { + // Find all the embedded directories + DirectoryEntry root = ext.getRoot(); + if(root == null) { + throw new IllegalStateException("The extractor didn't know which POIFS it came from!"); + } + + if(ext instanceof WordExtractor) { + // These are in ObjectPool -> _... under the root + try { + DirectoryEntry op = (DirectoryEntry) + root.getEntry("ObjectPool"); + Iterator it = op.getEntries(); + while(it.hasNext()) { + Entry entry = it.next(); + if(entry.getName().startsWith("_")) { + dirs.add(entry); + } + } + } catch(FileNotFoundException e) { + // ignored here + } + //} else if(ext instanceof PowerPointExtractor) { + // Tricky, not stored directly in poifs + // TODO + } else if(ext instanceof OutlookTextExtactor) { + // Stored in the Attachment blocks + MAPIMessage msg = ((OutlookTextExtactor)ext).getMAPIMessage(); + for(AttachmentChunks attachment : msg.getAttachmentFiles()) { + if(attachment.attachData != null) { + byte[] data = attachment.attachData.getValue(); + nonPOIFS.add( new ByteArrayInputStream(data) ); + } else if(attachment.attachmentDirectory != null) { + dirs.add(attachment.attachmentDirectory.getDirectory()); + } + } + } + } +}