- rework extractors - see bugzilla entry for more information git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1880839 13f79535-47bb-0310-9956-ffa450edef68tags/before_ooxml_3rd_edition
@@ -29,11 +29,11 @@ import java.util.HashSet; | |||
import java.util.Set; | |||
import org.apache.poi.EncryptedDocumentException; | |||
import org.apache.poi.extractor.ExtractorFactory; | |||
import org.apache.poi.extractor.POIOLE2TextExtractor; | |||
import org.apache.poi.extractor.POITextExtractor; | |||
import org.apache.poi.hpsf.extractor.HPSFPropertiesExtractor; | |||
import org.apache.poi.hssf.extractor.EventBasedExcelExtractor; | |||
import org.apache.poi.ooxml.extractor.ExtractorFactory; | |||
import org.apache.poi.openxml4j.exceptions.OpenXML4JException; | |||
import org.apache.poi.ss.extractor.ExcelExtractor; | |||
import org.apache.poi.util.IOUtils; |
@@ -23,7 +23,7 @@ import java.io.File; | |||
import java.io.FileInputStream; | |||
import java.io.InputStream; | |||
import org.apache.poi.ooxml.extractor.ExtractorFactory; | |||
import org.apache.poi.extractor.ExtractorFactory; | |||
import org.apache.poi.sl.extractor.SlideShowExtractor; | |||
import org.apache.poi.xslf.usermodel.XMLSlideShow; | |||
import org.apache.poi.xslf.usermodel.XSLFSlideShow; | |||
@@ -37,11 +37,11 @@ public class XSLFFileHandler extends SlideShowHandler { | |||
assertNotNull(slideInner.getPresentation()); | |||
assertNotNull(slideInner.getSlideMasterReferences()); | |||
assertNotNull(slideInner.getSlideReferences()); | |||
new POIXMLDocumentHandler().handlePOIXMLDocument(slide); | |||
handleSlideShow(slide); | |||
slideInner.close(); | |||
slide.close(); | |||
} | |||
@@ -49,11 +49,12 @@ public class XSLFFileHandler extends SlideShowHandler { | |||
@Override | |||
public void handleExtracting(File file) throws Exception { | |||
super.handleExtracting(file); | |||
// additionally try the other getText() methods | |||
try (SlideShowExtractor extractor = ExtractorFactory.createExtractor(file)) { | |||
//noinspection rawtypes | |||
try (SlideShowExtractor extractor = (SlideShowExtractor) ExtractorFactory.createExtractor(file)) { | |||
assertNotNull(extractor); | |||
extractor.setSlidesByDefault(true); | |||
extractor.setNotesByDefault(true); |
@@ -0,0 +1,304 @@ | |||
/* ==================================================================== | |||
Licensed to the Apache Software Foundation (ASF) under one or more | |||
contributor license agreements. See the NOTICE file distributed with | |||
this work for additional information regarding copyright ownership. | |||
The ASF licenses this file to You under the Apache License, Version 2.0 | |||
(the "License"); you may not use this file except in compliance with | |||
the License. You may obtain a copy of the License at | |||
http://www.apache.org/licenses/LICENSE-2.0 | |||
Unless required by applicable law or agreed to in writing, software | |||
distributed under the License is distributed on an "AS IS" BASIS, | |||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
See the License for the specific language governing permissions and | |||
limitations under the License. | |||
==================================================================== */ | |||
package org.apache.poi.extractor; | |||
import static org.apache.poi.hssf.record.crypto.Biff8EncryptionKey.getCurrentUserPassword; | |||
import static org.apache.poi.poifs.crypt.EncryptionInfo.ENCRYPTION_INFO_ENTRY; | |||
import java.io.File; | |||
import java.io.IOException; | |||
import java.io.InputStream; | |||
import java.util.ArrayList; | |||
import java.util.List; | |||
import java.util.ServiceLoader; | |||
import java.util.stream.StreamSupport; | |||
import org.apache.poi.EmptyFileException; | |||
import org.apache.poi.hssf.extractor.ExcelExtractor; | |||
import org.apache.poi.poifs.crypt.Decryptor; | |||
import org.apache.poi.poifs.filesystem.DirectoryEntry; | |||
import org.apache.poi.poifs.filesystem.DirectoryNode; | |||
import org.apache.poi.poifs.filesystem.Entry; | |||
import org.apache.poi.poifs.filesystem.FileMagic; | |||
import org.apache.poi.poifs.filesystem.POIFSFileSystem; | |||
import org.apache.poi.util.IOUtils; | |||
import org.apache.poi.util.POILogFactory; | |||
import org.apache.poi.util.POILogger; | |||
/** | |||
* Figures out the correct POIOLE2TextExtractor for your supplied | |||
* document, and returns it. | |||
* | |||
* <p>Note 1 - will fail for many file formats if the POI Scratchpad jar is | |||
* not present on the runtime classpath</p> | |||
* <p>Note 2 - for text extractor creation across all formats, use | |||
* {@link org.apache.poi.ooxml.extractor.ExtractorFactory} contained within | |||
* the OOXML jar.</p> | |||
* <p>Note 3 - rather than using this, for most cases you would be better | |||
* off switching to <a href="http://tika.apache.org">Apache Tika</a> instead!</p> | |||
*/ | |||
@SuppressWarnings({"WeakerAccess", "JavadocReference"}) | |||
public final class ExtractorFactory { | |||
private static final POILogger LOGGER = POILogFactory.getLogger(ExtractorFactory.class); | |||
/** Should this thread prefer event based over usermodel based extractors? */ | |||
private static final ThreadLocal<Boolean> threadPreferEventExtractors = ThreadLocal.withInitial(() -> Boolean.FALSE); | |||
/** Should all threads prefer event based over usermodel based extractors? */ | |||
private static Boolean allPreferEventExtractors; | |||
private static class Singleton { | |||
private static final ExtractorFactory INSTANCE = new ExtractorFactory(); | |||
} | |||
private interface ProviderMethod { | |||
POITextExtractor create(ExtractorProvider prov) throws IOException; | |||
} | |||
private final List<ExtractorProvider> provider = new ArrayList<>(); | |||
private ExtractorFactory() { | |||
ServiceLoader.load(ExtractorProvider.class).forEach(provider::add); | |||
} | |||
/** | |||
* Should this thread prefer event based over usermodel based extractors? | |||
* (usermodel extractors tend to be more accurate, but use more memory) | |||
* Default is false. | |||
* | |||
* @return true if event extractors should be preferred in the current thread, fals otherwise. | |||
*/ | |||
public static boolean getThreadPrefersEventExtractors() { | |||
return threadPreferEventExtractors.get(); | |||
} | |||
/** | |||
* Should all threads prefer event based over usermodel based extractors? | |||
* (usermodel extractors tend to be more accurate, but use more memory) | |||
* Default is to use the thread level setting, which defaults to false. | |||
* | |||
* @return true if event extractors should be preferred in all threads, fals otherwise. | |||
*/ | |||
public static Boolean getAllThreadsPreferEventExtractors() { | |||
return allPreferEventExtractors; | |||
} | |||
/** | |||
* Should this thread prefer event based over usermodel based extractors? | |||
* Will only be used if the All Threads setting is null. | |||
* | |||
* @param preferEventExtractors If this threads should prefer event based extractors. | |||
*/ | |||
public static void setThreadPrefersEventExtractors(boolean preferEventExtractors) { | |||
threadPreferEventExtractors.set(preferEventExtractors); | |||
} | |||
/** | |||
* Should all threads prefer event based over usermodel based extractors? | |||
* If set, will take preference over the Thread level setting. | |||
* | |||
* @param preferEventExtractors If all threads should prefer event based extractors. | |||
*/ | |||
public static void setAllThreadsPreferEventExtractors(Boolean preferEventExtractors) { | |||
allPreferEventExtractors = preferEventExtractors; | |||
} | |||
/** | |||
* Should this thread use event based extractors is available? | |||
* Checks the all-threads one first, then thread specific. | |||
* | |||
* @return If the current thread should use event based extractors. | |||
*/ | |||
public static boolean getPreferEventExtractor() { | |||
return (allPreferEventExtractors != null) ? allPreferEventExtractors : threadPreferEventExtractors.get(); | |||
} | |||
public static POITextExtractor createExtractor(POIFSFileSystem fs) throws IOException { | |||
return createExtractor(fs, getCurrentUserPassword()); | |||
} | |||
public static POITextExtractor createExtractor(POIFSFileSystem fs, String password) throws IOException { | |||
return createExtractor(fs.getRoot(), password); | |||
} | |||
public static POITextExtractor createExtractor(InputStream input) throws IOException { | |||
return createExtractor(input, getCurrentUserPassword()); | |||
} | |||
public static POITextExtractor createExtractor(InputStream input, String password) throws IOException { | |||
final InputStream is = FileMagic.prepareToCheckMagic(input); | |||
byte[] emptyFileCheck = new byte[1]; | |||
is.mark(emptyFileCheck.length); | |||
if (is.read(emptyFileCheck) < emptyFileCheck.length) { | |||
throw new EmptyFileException(); | |||
} | |||
is.reset(); | |||
final FileMagic fm = FileMagic.valueOf(is); | |||
if (FileMagic.OOXML == fm) { | |||
return wp(fm, w -> w.create(is, password)); | |||
} | |||
if (FileMagic.OLE2 != fm) { | |||
throw new IOException("Can't create extractor - unsupported file type: "+fm); | |||
} | |||
POIFSFileSystem poifs = new POIFSFileSystem(is); | |||
boolean isOOXML = poifs.getRoot().hasEntry(ENCRYPTION_INFO_ENTRY); | |||
return wp(isOOXML ? FileMagic.OOXML : fm, w -> w.create(poifs.getRoot(), password)); | |||
} | |||
public static POITextExtractor createExtractor(File file) throws IOException { | |||
return createExtractor(file, getCurrentUserPassword()); | |||
} | |||
public static POITextExtractor createExtractor(File file, String password) throws IOException { | |||
if (file.length() == 0) { | |||
throw new EmptyFileException(); | |||
} | |||
final FileMagic fm = FileMagic.valueOf(file); | |||
if (FileMagic.OOXML == fm) { | |||
return wp(fm, w -> w.create(file, password)); | |||
} | |||
if (FileMagic.OLE2 != fm) { | |||
throw new IOException("Can't create extractor - unsupported file type: "+fm); | |||
} | |||
POIFSFileSystem poifs = new POIFSFileSystem(file, true); | |||
try { | |||
boolean isOOXML = poifs.getRoot().hasEntry(ENCRYPTION_INFO_ENTRY); | |||
return wp(isOOXML ? FileMagic.OOXML : fm, w -> w.create(poifs.getRoot(), password)); | |||
} catch (IOException | RuntimeException e) { | |||
IOUtils.closeQuietly(poifs); | |||
throw e; | |||
} | |||
} | |||
/** | |||
* Create the Extractor, if possible. Generally needs the Scratchpad jar. | |||
* Note that this won't check for embedded OOXML resources either, use | |||
* {@link org.apache.poi.ooxml.extractor.ExtractorFactory} for that. | |||
* | |||
* @param root The {@link DirectoryNode} pointing to a document. | |||
* | |||
* @return The resulting {@link POITextExtractor}, an exception is thrown if | |||
* no TextExtractor can be created for some reason. | |||
* | |||
* @throws IOException If converting the {@link DirectoryNode} into a HSSFWorkbook fails | |||
* @throws OldFileFormatException If the {@link DirectoryNode} points to a format of | |||
* an unsupported version of Excel. | |||
* @throws IllegalArgumentException If creating the Extractor fails | |||
*/ | |||
public static POITextExtractor createExtractor(DirectoryNode root) throws IOException { | |||
return createExtractor(root, getCurrentUserPassword()); | |||
} | |||
public static POITextExtractor createExtractor(final DirectoryNode root, String password) throws IOException { | |||
// Encrypted OOXML files go inside OLE2 containers, is this one? | |||
if (root.hasEntry(Decryptor.DEFAULT_POIFS_ENTRY) || root.hasEntry("Package")) { | |||
return wp(FileMagic.OOXML, w -> w.create(root, password)); | |||
} else { | |||
return wp(FileMagic.OLE2, w -> w.create(root, password)); | |||
} | |||
} | |||
/** | |||
* Returns an array of text extractors, one for each of | |||
* the embedded documents in the file (if there are any). | |||
* If there are no embedded documents, you'll get back an | |||
* empty array. Otherwise, you'll get one open | |||
* {@link POITextExtractor} for each embedded file. | |||
* | |||
* @param ext The extractor to look at for embedded documents | |||
* | |||
* @return An array of resulting extractors. Empty if no embedded documents are found. | |||
* | |||
* @throws IOException If converting the {@link DirectoryNode} into a HSSFWorkbook fails | |||
* @throws OldFileFormatException If the {@link DirectoryNode} points to a format of | |||
* an unsupported version of Excel. | |||
* @throws IllegalArgumentException If creating the Extractor fails | |||
*/ | |||
public static POITextExtractor[] getEmbeddedDocsTextExtractors(POIOLE2TextExtractor ext) throws IOException { | |||
if (ext == null) { | |||
throw new IllegalStateException("extractor must be given"); | |||
} | |||
// All the embedded directories we spotted | |||
List<Entry> dirs = new ArrayList<>(); | |||
// For anything else not directly held in as a POIFS directory | |||
List<InputStream> nonPOIFS = new ArrayList<>(); | |||
// Find all the embedded directories | |||
DirectoryEntry root = ext.getRoot(); | |||
if(root == null) { | |||
throw new IllegalStateException("The extractor didn't know which POIFS it came from!"); | |||
} | |||
if(ext instanceof ExcelExtractor) { | |||
// These are in MBD... under the root | |||
StreamSupport.stream(root.spliterator(), false) | |||
.filter(entry -> entry.getName().startsWith("MBD")) | |||
.forEach(dirs::add); | |||
} else { | |||
for (ExtractorProvider prov : Singleton.INSTANCE.provider) { | |||
if (prov.accepts(FileMagic.OLE2)) { | |||
prov.identifyEmbeddedResources(ext, dirs, nonPOIFS); | |||
break; | |||
} | |||
} | |||
} | |||
// Create the extractors | |||
if(dirs.size() == 0 && nonPOIFS.size() == 0){ | |||
return new POITextExtractor[0]; | |||
} | |||
ArrayList<POITextExtractor> textExtractors = new ArrayList<>(); | |||
for (Entry dir : dirs) { | |||
textExtractors.add(createExtractor((DirectoryNode) dir)); | |||
} | |||
for (InputStream stream : nonPOIFS) { | |||
try { | |||
textExtractors.add(createExtractor(stream)); | |||
} catch (IOException e) { | |||
// Ignore, just means it didn't contain a format we support as yet | |||
LOGGER.log(POILogger.INFO, "Format not supported yet", e.getLocalizedMessage()); | |||
} | |||
} | |||
return textExtractors.toArray(new POITextExtractor[0]); | |||
} | |||
private static POITextExtractor wp(FileMagic fm, ProviderMethod fun) throws IOException { | |||
for (ExtractorProvider prov : Singleton.INSTANCE.provider) { | |||
if (prov.accepts(fm)) { | |||
POITextExtractor ext = fun.create(prov); | |||
if (ext != null) { | |||
return ext; | |||
} | |||
} | |||
} | |||
throw new IOException("Your InputStream was neither an OLE2 stream, nor an OOXML stream " + | |||
"or you haven't provide the poi-ooxml*.jar and/or poi-scratchpad*.jar in the classpath/modulepath - FileMagic: "+fm); | |||
} | |||
} |
@@ -0,0 +1,76 @@ | |||
/* ==================================================================== | |||
Licensed to the Apache Software Foundation (ASF) under one or more | |||
contributor license agreements. See the NOTICE file distributed with | |||
this work for additional information regarding copyright ownership. | |||
The ASF licenses this file to You under the Apache License, Version 2.0 | |||
(the "License"); you may not use this file except in compliance with | |||
the License. You may obtain a copy of the License at | |||
http://www.apache.org/licenses/LICENSE-2.0 | |||
Unless required by applicable law or agreed to in writing, software | |||
distributed under the License is distributed on an "AS IS" BASIS, | |||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
See the License for the specific language governing permissions and | |||
limitations under the License. | |||
==================================================================== */ | |||
package org.apache.poi.extractor; | |||
import java.io.File; | |||
import java.io.IOException; | |||
import java.io.InputStream; | |||
import java.util.List; | |||
import org.apache.poi.poifs.filesystem.DirectoryNode; | |||
import org.apache.poi.poifs.filesystem.Entry; | |||
import org.apache.poi.poifs.filesystem.FileMagic; | |||
public interface ExtractorProvider { | |||
boolean accepts(FileMagic fm); | |||
/** | |||
* Create Extractor via file | |||
* @param file the file | |||
* @param password the password or {@code null} if not encrypted | |||
* @return the extractor | |||
* @throws IOException if file can't be read or parsed | |||
*/ | |||
POITextExtractor create(File file, String password) throws IOException; | |||
/** | |||
* Create Extractor via InputStream | |||
* @param inputStream the stream | |||
* @param password the password or {@code null} if not encrypted | |||
* @return the extractor | |||
* @throws IOException if stream can't be read or parsed | |||
*/ | |||
POITextExtractor create(InputStream inputStream, String password) throws IOException; | |||
/** | |||
* Create Extractor from POIFS node | |||
* @param poifsDir the node | |||
* @param password the password or {@code null} if not encrypted | |||
* @return the extractor | |||
* @throws IOException if node can't be parsed | |||
*/ | |||
POITextExtractor create(DirectoryNode poifsDir, String password) throws IOException; | |||
/** | |||
* Returns an array of text extractors, one for each of | |||
* the embedded documents in the file (if there are any). | |||
* If there are no embedded documents, you'll get back an | |||
* empty array. Otherwise, you'll get one open | |||
* {@link POITextExtractor} for each embedded file. | |||
* | |||
* @param ext the extractor holding the directory to start parsing | |||
* @param dirs a list to be filled with directory references holding embedded | |||
* @param nonPOIFS a list to be filled with streams which aren't based on POIFS entries | |||
* | |||
* @throws IOException when the format specific extraction fails because of invalid entires | |||
*/ | |||
default void identifyEmbeddedResources(POIOLE2TextExtractor ext, List<Entry> dirs, List<InputStream> nonPOIFS) throws IOException { | |||
throw new IllegalArgumentException("Error checking for Scratchpad embedded resources"); | |||
} | |||
} |
@@ -0,0 +1,76 @@ | |||
/* ==================================================================== | |||
Licensed to the Apache Software Foundation (ASF) under one or more | |||
contributor license agreements. See the NOTICE file distributed with | |||
this work for additional information regarding copyright ownership. | |||
The ASF licenses this file to You under the Apache License, Version 2.0 | |||
(the "License"); you may not use this file except in compliance with | |||
the License. You may obtain a copy of the License at | |||
http://www.apache.org/licenses/LICENSE-2.0 | |||
Unless required by applicable law or agreed to in writing, software | |||
distributed under the License is distributed on an "AS IS" BASIS, | |||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
See the License for the specific language governing permissions and | |||
limitations under the License. | |||
==================================================================== */ | |||
package org.apache.poi.extractor; | |||
import static org.apache.poi.hssf.model.InternalWorkbook.WORKBOOK_DIR_ENTRY_NAMES; | |||
import java.io.File; | |||
import java.io.IOException; | |||
import java.io.InputStream; | |||
import org.apache.poi.hssf.extractor.EventBasedExcelExtractor; | |||
import org.apache.poi.hssf.extractor.ExcelExtractor; | |||
import org.apache.poi.hssf.extractor.OldExcelExtractor; | |||
import org.apache.poi.hssf.model.InternalWorkbook; | |||
import org.apache.poi.hssf.record.crypto.Biff8EncryptionKey; | |||
import org.apache.poi.poifs.filesystem.DirectoryNode; | |||
import org.apache.poi.poifs.filesystem.FileMagic; | |||
import org.apache.poi.poifs.filesystem.POIFSFileSystem; | |||
/** | |||
* ExtractorFactory for HSSF and Old Excel format | |||
*/ | |||
public class MainExtractorFactory implements ExtractorProvider { | |||
@Override | |||
public boolean accepts(FileMagic fm) { | |||
return FileMagic.OLE2 == fm; | |||
} | |||
@Override | |||
public POITextExtractor create(File file, String password) throws IOException { | |||
return create(new POIFSFileSystem(file, true).getRoot(), password); | |||
} | |||
@Override | |||
public POITextExtractor create(InputStream inputStream, String password) throws IOException { | |||
return create(new POIFSFileSystem(inputStream).getRoot(), password); | |||
} | |||
@Override | |||
public POITextExtractor create(DirectoryNode poifsDir, String password) throws IOException { | |||
final String oldPW = Biff8EncryptionKey.getCurrentUserPassword(); | |||
try { | |||
Biff8EncryptionKey.setCurrentUserPassword(password); | |||
// Look for certain entries in the stream, to figure it out from | |||
for (String workbookName : WORKBOOK_DIR_ENTRY_NAMES) { | |||
if (poifsDir.hasEntry(workbookName)) { | |||
return ExtractorFactory.getPreferEventExtractor() ? new EventBasedExcelExtractor(poifsDir) : new ExcelExtractor(poifsDir); | |||
} | |||
} | |||
if (poifsDir.hasEntry(InternalWorkbook.OLD_WORKBOOK_DIR_ENTRY_NAME)) { | |||
return new OldExcelExtractor(poifsDir); | |||
} | |||
} finally { | |||
Biff8EncryptionKey.setCurrentUserPassword(oldPW); | |||
} | |||
return null; | |||
} | |||
} |
@@ -1,279 +0,0 @@ | |||
/* ==================================================================== | |||
Licensed to the Apache Software Foundation (ASF) under one or more | |||
contributor license agreements. See the NOTICE file distributed with | |||
this work for additional information regarding copyright ownership. | |||
The ASF licenses this file to You under the Apache License, Version 2.0 | |||
(the "License"); you may not use this file except in compliance with | |||
the License. You may obtain a copy of the License at | |||
http://www.apache.org/licenses/LICENSE-2.0 | |||
Unless required by applicable law or agreed to in writing, software | |||
distributed under the License is distributed on an "AS IS" BASIS, | |||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
See the License for the specific language governing permissions and | |||
limitations under the License. | |||
==================================================================== */ | |||
package org.apache.poi.extractor; | |||
import static org.apache.poi.hssf.model.InternalWorkbook.OLD_WORKBOOK_DIR_ENTRY_NAME; | |||
import static org.apache.poi.hssf.model.InternalWorkbook.WORKBOOK_DIR_ENTRY_NAMES; | |||
import java.io.IOException; | |||
import java.io.InputStream; | |||
import java.lang.reflect.Method; | |||
import java.util.ArrayList; | |||
import java.util.Iterator; | |||
import java.util.List; | |||
import org.apache.poi.hssf.OldExcelFormatException; | |||
import org.apache.poi.hssf.extractor.EventBasedExcelExtractor; | |||
import org.apache.poi.hssf.extractor.ExcelExtractor; | |||
import org.apache.poi.poifs.filesystem.DirectoryEntry; | |||
import org.apache.poi.poifs.filesystem.DirectoryNode; | |||
import org.apache.poi.poifs.filesystem.Entry; | |||
import org.apache.poi.poifs.filesystem.POIFSFileSystem; | |||
import org.apache.poi.util.POILogFactory; | |||
import org.apache.poi.util.POILogger; | |||
/** | |||
* Figures out the correct POIOLE2TextExtractor for your supplied | |||
* document, and returns it. | |||
* | |||
* <p>Note 1 - will fail for many file formats if the POI Scratchpad jar is | |||
* not present on the runtime classpath</p> | |||
* <p>Note 2 - for text extractor creation across all formats, use | |||
* {@link org.apache.poi.ooxml.extractor.ExtractorFactory} contained within | |||
* the OOXML jar.</p> | |||
* <p>Note 3 - rather than using this, for most cases you would be better | |||
* off switching to <a href="http://tika.apache.org">Apache Tika</a> instead!</p> | |||
*/ | |||
@SuppressWarnings({"WeakerAccess", "JavadocReference"}) | |||
public final class OLE2ExtractorFactory { | |||
private static final POILogger LOGGER = POILogFactory.getLogger(OLE2ExtractorFactory.class); | |||
/** Should this thread prefer event based over usermodel based extractors? */ | |||
private static final ThreadLocal<Boolean> threadPreferEventExtractors = ThreadLocal.withInitial(() -> Boolean.FALSE); | |||
/** Should all threads prefer event based over usermodel based extractors? */ | |||
private static Boolean allPreferEventExtractors; | |||
private OLE2ExtractorFactory() { | |||
} | |||
/** | |||
* Should this thread prefer event based over usermodel based extractors? | |||
* (usermodel extractors tend to be more accurate, but use more memory) | |||
* Default is false. | |||
* | |||
* @return true if event extractors should be preferred in the current thread, fals otherwise. | |||
*/ | |||
public static boolean getThreadPrefersEventExtractors() { | |||
return threadPreferEventExtractors.get(); | |||
} | |||
/** | |||
* Should all threads prefer event based over usermodel based extractors? | |||
* (usermodel extractors tend to be more accurate, but use more memory) | |||
* Default is to use the thread level setting, which defaults to false. | |||
* | |||
* @return true if event extractors should be preferred in all threads, fals otherwise. | |||
*/ | |||
public static Boolean getAllThreadsPreferEventExtractors() { | |||
return allPreferEventExtractors; | |||
} | |||
/** | |||
* Should this thread prefer event based over usermodel based extractors? | |||
* Will only be used if the All Threads setting is null. | |||
* | |||
* @param preferEventExtractors If this threads should prefer event based extractors. | |||
*/ | |||
public static void setThreadPrefersEventExtractors(boolean preferEventExtractors) { | |||
threadPreferEventExtractors.set(preferEventExtractors); | |||
} | |||
/** | |||
* Should all threads prefer event based over usermodel based extractors? | |||
* If set, will take preference over the Thread level setting. | |||
* | |||
* @param preferEventExtractors If all threads should prefer event based extractors. | |||
*/ | |||
public static void setAllThreadsPreferEventExtractors(Boolean preferEventExtractors) { | |||
allPreferEventExtractors = preferEventExtractors; | |||
} | |||
/** | |||
* Should this thread use event based extractors is available? | |||
* Checks the all-threads one first, then thread specific. | |||
* | |||
* @return If the current thread should use event based extractors. | |||
*/ | |||
public static boolean getPreferEventExtractor() { | |||
if(allPreferEventExtractors != null) { | |||
return allPreferEventExtractors; | |||
} | |||
return threadPreferEventExtractors.get(); | |||
} | |||
@SuppressWarnings("unchecked") | |||
public static <T extends POITextExtractor> T createExtractor(POIFSFileSystem fs) throws IOException { | |||
return (T)createExtractor(fs.getRoot()); | |||
} | |||
@SuppressWarnings("unchecked") | |||
public static <T extends POITextExtractor> T createExtractor(InputStream input) throws IOException { | |||
Class<?> cls = getOOXMLClass(); | |||
if (cls != null) { | |||
// Use Reflection to get us the full OOXML-enabled version | |||
try { | |||
Method m = cls.getDeclaredMethod("createExtractor", InputStream.class); | |||
return (T)m.invoke(null, input); | |||
} catch (IllegalArgumentException iae) { | |||
throw iae; | |||
} catch (Exception e) { | |||
throw new IllegalArgumentException("Error creating Extractor for InputStream", e); | |||
} | |||
} else { | |||
// Best hope it's OLE2.... | |||
return createExtractor(new POIFSFileSystem(input)); | |||
} | |||
} | |||
private static Class<?> getOOXMLClass() { | |||
try { | |||
return OLE2ExtractorFactory.class.getClassLoader().loadClass( | |||
"org.apache.poi.extractor.ExtractorFactory" | |||
); | |||
} catch (ClassNotFoundException e) { | |||
LOGGER.log(POILogger.WARN, "POI OOXML jar missing"); | |||
return null; | |||
} | |||
} | |||
private static Class<?> getScratchpadClass() { | |||
try { | |||
return OLE2ExtractorFactory.class.getClassLoader().loadClass( | |||
"org.apache.poi.extractor.ole2.OLE2ScratchpadExtractorFactory" | |||
); | |||
} catch (ClassNotFoundException e) { | |||
LOGGER.log(POILogger.ERROR, "POI Scratchpad jar missing"); | |||
throw new IllegalStateException("POI Scratchpad jar missing, required for ExtractorFactory"); | |||
} | |||
} | |||
/** | |||
* Create the Extractor, if possible. Generally needs the Scratchpad jar. | |||
* Note that this won't check for embedded OOXML resources either, use | |||
* {@link org.apache.poi.ooxml.extractor.ExtractorFactory} for that. | |||
* | |||
* @param poifsDir The {@link DirectoryNode} pointing to a document. | |||
* | |||
* @return The resulting {@link POITextExtractor}, an exception is thrown if | |||
* no TextExtractor can be created for some reason. | |||
* | |||
* @throws IOException If converting the {@link DirectoryNode} into a HSSFWorkbook fails | |||
* @throws OldFileFormatException If the {@link DirectoryNode} points to a format of | |||
* an unsupported version of Excel. | |||
* @throws IllegalArgumentException If creating the Extractor fails | |||
*/ | |||
public static POITextExtractor createExtractor(DirectoryNode poifsDir) throws IOException { | |||
// Look for certain entries in the stream, to figure it | |||
// out from | |||
for (String workbookName : WORKBOOK_DIR_ENTRY_NAMES) { | |||
if (poifsDir.hasEntry(workbookName)) { | |||
if (getPreferEventExtractor()) { | |||
return new EventBasedExcelExtractor(poifsDir); | |||
} | |||
return new ExcelExtractor(poifsDir); | |||
} | |||
} | |||
if (poifsDir.hasEntry(OLD_WORKBOOK_DIR_ENTRY_NAME)) { | |||
throw new OldExcelFormatException("Old Excel Spreadsheet format (1-95) " | |||
+ "found. Please call OldExcelExtractor directly for basic text extraction"); | |||
} | |||
// Ask Scratchpad, or fail trying | |||
Class<?> cls = getScratchpadClass(); | |||
try { | |||
Method m = cls.getDeclaredMethod("createExtractor", DirectoryNode.class); | |||
POITextExtractor ext = (POITextExtractor)m.invoke(null, poifsDir); | |||
if (ext != null) return ext; | |||
} catch (IllegalArgumentException iae) { | |||
throw iae; | |||
} catch (Exception e) { | |||
throw new IllegalArgumentException("Error creating Scratchpad Extractor", e); | |||
} | |||
throw new IllegalArgumentException("No supported documents found in the OLE2 stream"); | |||
} | |||
/** | |||
* Returns an array of text extractors, one for each of | |||
* the embedded documents in the file (if there are any). | |||
* If there are no embedded documents, you'll get back an | |||
* empty array. Otherwise, you'll get one open | |||
* {@link POITextExtractor} for each embedded file. | |||
* | |||
* @param ext The extractor to look at for embedded documents | |||
* | |||
* @return An array of resulting extractors. Empty if no embedded documents are found. | |||
* | |||
* @throws IOException If converting the {@link DirectoryNode} into a HSSFWorkbook fails | |||
* @throws OldFileFormatException If the {@link DirectoryNode} points to a format of | |||
* an unsupported version of Excel. | |||
* @throws IllegalArgumentException If creating the Extractor fails | |||
*/ | |||
public static POITextExtractor[] getEmbededDocsTextExtractors(POIOLE2TextExtractor ext) throws IOException { | |||
// All the embedded directories we spotted | |||
List<Entry> dirs = new ArrayList<>(); | |||
// For anything else not directly held in as a POIFS directory | |||
List<InputStream> nonPOIFS = new ArrayList<>(); | |||
// Find all the embedded directories | |||
DirectoryEntry root = ext.getRoot(); | |||
if(root == null) { | |||
throw new IllegalStateException("The extractor didn't know which POIFS it came from!"); | |||
} | |||
if(ext instanceof ExcelExtractor) { | |||
// These are in MBD... under the root | |||
Iterator<Entry> it = root.getEntries(); | |||
while(it.hasNext()) { | |||
Entry entry = it.next(); | |||
if(entry.getName().startsWith("MBD")) { | |||
dirs.add(entry); | |||
} | |||
} | |||
} else { | |||
// Ask Scratchpad, or fail trying | |||
Class<?> cls = getScratchpadClass(); | |||
try { | |||
Method m = cls.getDeclaredMethod( | |||
"identifyEmbeddedResources", POIOLE2TextExtractor.class, List.class, List.class); | |||
m.invoke(null, ext, dirs, nonPOIFS); | |||
} catch (Exception e) { | |||
throw new IllegalArgumentException("Error checking for Scratchpad embedded resources", e); | |||
} | |||
} | |||
// Create the extractors | |||
if(dirs.size() == 0 && nonPOIFS.size() == 0){ | |||
return new POITextExtractor[0]; | |||
} | |||
ArrayList<POITextExtractor> e = new ArrayList<>(); | |||
for (Entry dir : dirs) { | |||
e.add(createExtractor((DirectoryNode) dir | |||
)); | |||
} | |||
for (InputStream stream : nonPOIFS) { | |||
try { | |||
e.add(createExtractor(stream)); | |||
} catch (Exception xe) { | |||
// Ignore, invalid format | |||
LOGGER.log(POILogger.WARN, xe); | |||
} | |||
} | |||
return e.toArray(new POITextExtractor[0]); | |||
} | |||
} |
@@ -30,55 +30,28 @@ import org.apache.poi.poifs.filesystem.DirectoryEntry; | |||
* org.apache.poi.[format].extractor . | |||
* | |||
* @see org.apache.poi.hssf.extractor.ExcelExtractor | |||
* @see org.apache.poi.hslf.extractor.PowerPointExtractor | |||
* @see org.apache.poi.hdgf.extractor.VisioTextExtractor | |||
* @see org.apache.poi.hwpf.extractor.WordExtractor | |||
*/ | |||
public abstract class POIOLE2TextExtractor extends POITextExtractor { | |||
/** The POIDocument that's open */ | |||
protected POIDocument document; | |||
/** | |||
* Creates a new text extractor for the given document | |||
* | |||
* @param document The POIDocument to use in this extractor. | |||
*/ | |||
public POIOLE2TextExtractor(POIDocument document) { | |||
this.document = document; | |||
// Ensure any underlying resources, such as open files, | |||
// will get cleaned up if the user calls #close() | |||
setFilesystem(document); | |||
} | |||
/** | |||
* Creates a new text extractor, using the same | |||
* document as another text extractor. Normally | |||
* only used by properties extractors. | |||
* | |||
* @param otherExtractor the extractor which document to be used | |||
*/ | |||
protected POIOLE2TextExtractor(POIOLE2TextExtractor otherExtractor) { | |||
this.document = otherExtractor.document; | |||
} | |||
public interface POIOLE2TextExtractor extends POITextExtractor { | |||
/** | |||
* Returns the document information metadata for the document | |||
* | |||
* @return The Document Summary Information or null | |||
* if it could not be read for this document. | |||
*/ | |||
public DocumentSummaryInformation getDocSummaryInformation() { | |||
return document.getDocumentSummaryInformation(); | |||
default DocumentSummaryInformation getDocSummaryInformation() { | |||
return getDocument().getDocumentSummaryInformation(); | |||
} | |||
/** | |||
* Returns the summary information metadata for the document. | |||
* | |||
* @return The Summary information for the document or null | |||
* if it could not be read for this document. | |||
*/ | |||
public SummaryInformation getSummaryInformation() { | |||
return document.getSummaryInformation(); | |||
default SummaryInformation getSummaryInformation() { | |||
return getDocument().getSummaryInformation(); | |||
} | |||
/** | |||
@@ -88,7 +61,7 @@ public abstract class POIOLE2TextExtractor extends POITextExtractor { | |||
* @return an instance of POIExtractor that can extract meta-data. | |||
*/ | |||
@Override | |||
public POITextExtractor getMetadataTextExtractor() { | |||
default POITextExtractor getMetadataTextExtractor() { | |||
return new HPSFPropertiesExtractor(this); | |||
} | |||
@@ -97,8 +70,8 @@ public abstract class POIOLE2TextExtractor extends POITextExtractor { | |||
* | |||
* @return the DirectoryEntry that is associated with the POIDocument of this extractor. | |||
*/ | |||
public DirectoryEntry getRoot() { | |||
return document.getDirectory(); | |||
default DirectoryEntry getRoot() { | |||
return getDocument().getDirectory(); | |||
} | |||
/** | |||
@@ -107,7 +80,5 @@ public abstract class POIOLE2TextExtractor extends POITextExtractor { | |||
* @return the underlying POIDocument | |||
*/ | |||
@Override | |||
public POIDocument getDocument() { | |||
return document; | |||
} | |||
POIDocument getDocument(); | |||
} |
@@ -21,19 +21,16 @@ import java.io.IOException; | |||
/** | |||
* Common Parent for Text Extractors | |||
* of POI Documents. | |||
* of POI Documents. | |||
* You will typically find the implementation of | |||
* a given format's text extractor under | |||
* org.apache.poi.[format].extractor . | |||
* | |||
* | |||
* @see org.apache.poi.hssf.extractor.ExcelExtractor | |||
* @see org.apache.poi.hslf.extractor.PowerPointExtractor | |||
* @see org.apache.poi.hdgf.extractor.VisioTextExtractor | |||
* @see org.apache.poi.hwpf.extractor.WordExtractor | |||
*/ | |||
public abstract class POITextExtractor implements Closeable { | |||
private Closeable fsToClose; | |||
public interface POITextExtractor extends Closeable { | |||
/** | |||
* Retrieves all the text from the document. | |||
* How cells, paragraphs etc are separated in the text | |||
@@ -41,42 +38,50 @@ public abstract class POITextExtractor implements Closeable { | |||
* a specific project for details. | |||
* @return All the text from the document | |||
*/ | |||
public abstract String getText(); | |||
String getText(); | |||
/** | |||
* Returns another text extractor, which is able to | |||
* output the textual content of the document | |||
* metadata / properties, such as author and title. | |||
* | |||
* | |||
* @return the metadata and text extractor | |||
*/ | |||
public abstract POITextExtractor getMetadataTextExtractor(); | |||
POITextExtractor getMetadataTextExtractor(); | |||
/** | |||
* Used to ensure file handle cleanup. | |||
* | |||
* @param fs filesystem to close | |||
* @param doCloseFilesystem {@code true} (default), if underlying resources/filesystem should be | |||
* closed on {@link #close()} | |||
*/ | |||
public void setFilesystem(Closeable fs) { | |||
fsToClose = fs; | |||
} | |||
void setCloseFilesystem(boolean doCloseFilesystem); | |||
/** | |||
* @return {@code true}, if resources/filesystem should be closed on {@link #close()} | |||
*/ | |||
boolean isCloseFilesystem(); | |||
/** | |||
* @return The underlying resources/filesystem | |||
*/ | |||
Closeable getFilesystem(); | |||
/** | |||
* Allows to free resources of the Extractor as soon as | |||
* it is not needed any more. This may include closing | |||
* open file handles and freeing memory. | |||
* | |||
* | |||
* The Extractor cannot be used after close has been called. | |||
*/ | |||
@Override | |||
public void close() throws IOException { | |||
if(fsToClose != null) { | |||
fsToClose.close(); | |||
default void close() throws IOException { | |||
Closeable fs = getFilesystem(); | |||
if (isCloseFilesystem() && fs != null) { | |||
fs.close(); | |||
} | |||
} | |||
/** | |||
* @return the processed document | |||
*/ | |||
public abstract Object getDocument(); | |||
Object getDocument(); | |||
} |
@@ -17,9 +17,6 @@ | |||
package org.apache.poi.hpsf.extractor; | |||
import java.io.File; | |||
import java.io.IOException; | |||
import org.apache.poi.POIDocument; | |||
import org.apache.poi.extractor.POIOLE2TextExtractor; | |||
import org.apache.poi.extractor.POITextExtractor; | |||
@@ -37,15 +34,20 @@ import org.apache.poi.poifs.filesystem.POIFSFileSystem; | |||
* build in and custom, returning them in | |||
* textual form. | |||
*/ | |||
public class HPSFPropertiesExtractor extends POIOLE2TextExtractor { | |||
public class HPSFPropertiesExtractor implements POIOLE2TextExtractor { | |||
private final POIDocument document; | |||
private boolean doCloseFilesystem = true; | |||
public HPSFPropertiesExtractor(POIOLE2TextExtractor mainExtractor) { | |||
super(mainExtractor); | |||
document = mainExtractor.getDocument(); | |||
} | |||
public HPSFPropertiesExtractor(POIDocument doc) { | |||
super(doc); | |||
public HPSFPropertiesExtractor(POIDocument document) { | |||
this.document = document; | |||
} | |||
public HPSFPropertiesExtractor(POIFSFileSystem fs) { | |||
super(new HPSFPropertiesOnlyDocument(fs)); | |||
document = new HPSFPropertiesOnlyDocument(fs); | |||
} | |||
public String getDocumentSummaryInformationText() { | |||
@@ -122,11 +124,11 @@ public class HPSFPropertiesExtractor extends POIOLE2TextExtractor { | |||
} | |||
private static String getPropertyValueText(Object val) { | |||
return (val == null) | |||
return (val == null) | |||
? "(not set)" | |||
: PropertySet.getPropertyStringValue(val); | |||
} | |||
@Override | |||
public boolean equals(Object o) { | |||
return super.equals(o); | |||
@@ -137,12 +139,23 @@ public class HPSFPropertiesExtractor extends POIOLE2TextExtractor { | |||
return super.hashCode(); | |||
} | |||
public static void main(String[] args) throws IOException { | |||
for (String file : args) { | |||
try (HPSFPropertiesExtractor ext = new HPSFPropertiesExtractor( | |||
new POIFSFileSystem(new File(file)))) { | |||
System.out.println(ext.getText()); | |||
} | |||
} | |||
@Override | |||
public POIDocument getDocument() { | |||
return document; | |||
} | |||
@Override | |||
public void setCloseFilesystem(boolean doCloseFilesystem) { | |||
this.doCloseFilesystem = doCloseFilesystem; | |||
} | |||
@Override | |||
public boolean isCloseFilesystem() { | |||
return doCloseFilesystem; | |||
} | |||
@Override | |||
public POIDocument getFilesystem() { | |||
return document; | |||
} | |||
} |
@@ -17,6 +17,7 @@ | |||
package org.apache.poi.hssf.extractor; | |||
import java.io.Closeable; | |||
import java.io.IOException; | |||
import java.util.ArrayList; | |||
import java.util.List; | |||
@@ -37,9 +38,9 @@ import org.apache.poi.hssf.record.LabelRecord; | |||
import org.apache.poi.hssf.record.LabelSSTRecord; | |||
import org.apache.poi.hssf.record.NoteRecord; | |||
import org.apache.poi.hssf.record.NumberRecord; | |||
import org.apache.poi.hssf.record.Record; | |||
import org.apache.poi.hssf.record.SSTRecord; | |||
import org.apache.poi.hssf.record.StringRecord; | |||
import org.apache.poi.poifs.filesystem.DirectoryEntry; | |||
import org.apache.poi.poifs.filesystem.DirectoryNode; | |||
import org.apache.poi.poifs.filesystem.POIFSFileSystem; | |||
@@ -56,29 +57,31 @@ import org.apache.poi.poifs.filesystem.POIFSFileSystem; | |||
* To turn an excel file into a CSV or similar, then see | |||
* the XLS2CSVmra example | |||
* </p> | |||
* | |||
* | |||
* @see <a href="http://svn.apache.org/repos/asf/poi/trunk/src/examples/src/org/apache/poi/hssf/eventusermodel/examples/XLS2CSVmra.java">XLS2CSVmra</a> | |||
*/ | |||
public class EventBasedExcelExtractor extends POIOLE2TextExtractor implements org.apache.poi.ss.extractor.ExcelExtractor { | |||
private DirectoryNode _dir; | |||
public class EventBasedExcelExtractor implements POIOLE2TextExtractor, org.apache.poi.ss.extractor.ExcelExtractor { | |||
private final POIFSFileSystem poifs; | |||
private final DirectoryNode _dir; | |||
private boolean doCloseFilesystem = true; | |||
boolean _includeSheetNames = true; | |||
boolean _formulasNotResults; | |||
public EventBasedExcelExtractor( DirectoryNode dir ) | |||
{ | |||
super( (POIDocument)null ); | |||
public EventBasedExcelExtractor(DirectoryNode dir) { | |||
poifs = null; | |||
_dir = dir; | |||
} | |||
public EventBasedExcelExtractor(POIFSFileSystem fs) { | |||
this(fs.getRoot()); | |||
super.setFilesystem(fs); | |||
poifs = fs; | |||
_dir = fs.getRoot(); | |||
} | |||
/** | |||
* Would return the document information metadata for the document, | |||
* if we supported it | |||
*/ | |||
@Override | |||
public DocumentSummaryInformation getDocSummaryInformation() { | |||
throw new IllegalStateException("Metadata extraction not supported in streaming mode, please use ExcelExtractor"); | |||
} | |||
@@ -86,6 +89,7 @@ public class EventBasedExcelExtractor extends POIOLE2TextExtractor implements or | |||
* Would return the summary information metadata for the document, | |||
* if we supported it | |||
*/ | |||
@Override | |||
public SummaryInformation getSummaryInformation() { | |||
throw new IllegalStateException("Metadata extraction not supported in streaming mode, please use ExcelExtractor"); | |||
} | |||
@@ -262,4 +266,29 @@ public class EventBasedExcelExtractor extends POIOLE2TextExtractor implements or | |||
} | |||
} | |||
} | |||
@Override | |||
public void setCloseFilesystem(boolean doCloseFilesystem) { | |||
this.doCloseFilesystem = doCloseFilesystem; | |||
} | |||
@Override | |||
public boolean isCloseFilesystem() { | |||
return doCloseFilesystem; | |||
} | |||
@Override | |||
public Closeable getFilesystem() { | |||
return poifs; | |||
} | |||
@Override | |||
public POIDocument getDocument() { | |||
return null; | |||
} | |||
@Override | |||
public DirectoryEntry getRoot() { | |||
return _dir; | |||
} | |||
} |
@@ -50,12 +50,13 @@ import org.apache.poi.ss.usermodel.Row.MissingCellPolicy; | |||
* To turn an excel file into a CSV or similar, then see | |||
* the XLS2CSVmra example | |||
* </p> | |||
* | |||
* | |||
* @see <a href="http://svn.apache.org/repos/asf/poi/trunk/src/examples/src/org/apache/poi/hssf/eventusermodel/examples/XLS2CSVmra.java">XLS2CSVmra</a> | |||
*/ | |||
public class ExcelExtractor extends POIOLE2TextExtractor implements org.apache.poi.ss.extractor.ExcelExtractor { | |||
public class ExcelExtractor implements POIOLE2TextExtractor, org.apache.poi.ss.extractor.ExcelExtractor { | |||
private final HSSFWorkbook _wb; | |||
private final HSSFDataFormatter _formatter; | |||
private boolean doCloseFilesystem = true; | |||
private boolean _includeSheetNames = true; | |||
private boolean _shouldEvaluateFormulas = true; | |||
private boolean _includeCellComments; | |||
@@ -63,13 +64,14 @@ public class ExcelExtractor extends POIOLE2TextExtractor implements org.apache.p | |||
private boolean _includeHeadersFooters = true; | |||
public ExcelExtractor(HSSFWorkbook wb) { | |||
super(wb); | |||
_wb = wb; | |||
_formatter = new HSSFDataFormatter(); | |||
} | |||
public ExcelExtractor(POIFSFileSystem fs) throws IOException { | |||
this(fs.getRoot()); | |||
} | |||
public ExcelExtractor(DirectoryNode dir) throws IOException { | |||
this(new HSSFWorkbook(dir, true)); | |||
} | |||
@@ -201,9 +203,9 @@ public class ExcelExtractor extends POIOLE2TextExtractor implements org.apache.p | |||
/** | |||
* Command line extractor. | |||
* | |||
* | |||
* @param args the command line parameters | |||
* | |||
* | |||
* @throws IOException if the file can't be read or contains errors | |||
*/ | |||
public static void main(String[] args) throws IOException { | |||
@@ -225,7 +227,7 @@ public class ExcelExtractor extends POIOLE2TextExtractor implements org.apache.p | |||
try (InputStream is = cmdArgs.getInputFile() == null ? System.in : new FileInputStream(cmdArgs.getInputFile()); | |||
HSSFWorkbook wb = new HSSFWorkbook(is); | |||
ExcelExtractor extractor = new ExcelExtractor(wb); | |||
ExcelExtractor extractor = new ExcelExtractor(wb) | |||
) { | |||
extractor.setIncludeSheetNames(cmdArgs.shouldShowSheetNames()); | |||
extractor.setFormulasNotResults(!cmdArgs.shouldEvaluateFormulas()); | |||
@@ -255,7 +257,7 @@ public class ExcelExtractor extends POIOLE2TextExtractor implements org.apache.p | |||
* Should blank cells be output? Default is to only | |||
* output cells that are present in the file and are | |||
* non-blank. | |||
* | |||
* | |||
* @param includeBlankCells {@code true} if blank cells should be included | |||
*/ | |||
public void setIncludeBlankCells(boolean includeBlankCells) { | |||
@@ -411,4 +413,24 @@ public class ExcelExtractor extends POIOLE2TextExtractor implements org.apache.p | |||
return text.toString(); | |||
} | |||
@Override | |||
public HSSFWorkbook getDocument() { | |||
return _wb; | |||
} | |||
@Override | |||
public void setCloseFilesystem(boolean doCloseFilesystem) { | |||
this.doCloseFilesystem = doCloseFilesystem; | |||
} | |||
@Override | |||
public boolean isCloseFilesystem() { | |||
return doCloseFilesystem; | |||
} | |||
@Override | |||
public HSSFWorkbook getFilesystem() { | |||
return _wb; | |||
} | |||
} |
@@ -29,6 +29,7 @@ import java.io.IOException; | |||
import java.io.InputStream; | |||
import org.apache.poi.EncryptedDocumentException; | |||
import org.apache.poi.extractor.POITextExtractor; | |||
import org.apache.poi.hssf.OldExcelFormatException; | |||
import org.apache.poi.hssf.record.BOFRecord; | |||
import org.apache.poi.hssf.record.CodepageRecord; | |||
@@ -58,7 +59,7 @@ import org.apache.poi.util.IOUtils; | |||
* by Apache Tika, but not really intended for display to the user. | |||
* </p> | |||
*/ | |||
public class OldExcelExtractor implements Closeable { | |||
public class OldExcelExtractor implements POITextExtractor { | |||
private final static int FILE_PASS_RECORD_SID = 0x2f; | |||
//arbitrarily selected; may need to increase | |||
@@ -295,24 +296,39 @@ public class OldExcelExtractor implements Closeable { | |||
} | |||
} | |||
close(); | |||
ris = null; | |||
return text.toString(); | |||
} | |||
@Override | |||
public void close() { | |||
// some cases require this close here | |||
if(toClose != null) { | |||
IOUtils.closeQuietly(toClose); | |||
toClose = null; | |||
} | |||
} | |||
protected void handleNumericCell(StringBuilder text, double value) { | |||
// TODO Need to fetch / use format strings | |||
text.append(value); | |||
text.append('\n'); | |||
} | |||
@Override | |||
public POITextExtractor getMetadataTextExtractor() { | |||
return null; | |||
} | |||
@Override | |||
public void setCloseFilesystem(boolean doCloseFilesystem) { | |||
} | |||
@Override | |||
public boolean isCloseFilesystem() { | |||
return toClose != null; | |||
} | |||
@Override | |||
public Closeable getFilesystem() { | |||
return toClose; | |||
} | |||
@Override | |||
public Object getDocument() { | |||
return ris; | |||
} | |||
} |
@@ -54,14 +54,14 @@ import org.apache.poi.util.POILogger; | |||
public class SlideShowExtractor< | |||
S extends Shape<S,P>, | |||
P extends TextParagraph<S,P,? extends TextRun> | |||
> extends POITextExtractor { | |||
> implements POITextExtractor { | |||
private static final POILogger LOG = POILogFactory.getLogger(SlideShowExtractor.class); | |||
// placeholder text for slide numbers | |||
private static final String SLIDE_NUMBER_PH = "‹#›"; | |||
private SlideShow<S,P> slideshow; | |||
protected final SlideShow<S,P> slideshow; | |||
private boolean slidesByDefault = true; | |||
private boolean notesByDefault; | |||
@@ -69,9 +69,9 @@ public class SlideShowExtractor< | |||
private boolean masterByDefault; | |||
private Predicate<Object> filter = o -> true; | |||
private boolean doCloseFilesystem = true; | |||
public SlideShowExtractor(final SlideShow<S,P> slideshow) { | |||
setFilesystem(slideshow); | |||
this.slideshow = slideshow; | |||
} | |||
@@ -81,8 +81,8 @@ public class SlideShowExtractor< | |||
* @return the opened document | |||
*/ | |||
@Override | |||
public final Object getDocument() { | |||
return slideshow.getPersistDocument(); | |||
public SlideShow<S,P> getDocument() { | |||
return slideshow; | |||
} | |||
/** | |||
@@ -339,17 +339,17 @@ public class SlideShowExtractor< | |||
return raw; | |||
} | |||
TextParagraph tp = tr.getParagraph(); | |||
TextShape ps = (tp != null) ? tp.getParentShape() : null; | |||
Sheet sh = (ps != null) ? ps.getSheet() : null; | |||
String slideNr = (sh instanceof Slide) ? Integer.toString(((Slide)sh).getSlideNumber() + 1) : ""; | |||
TextParagraph<?,?,?> tp = tr.getParagraph(); | |||
TextShape<?,?> ps = (tp != null) ? tp.getParentShape() : null; | |||
Sheet<?,?> sh = (ps != null) ? ps.getSheet() : null; | |||
String slideNr = (sh instanceof Slide) ? Integer.toString(((Slide<?,?>)sh).getSlideNumber() + 1) : ""; | |||
return raw.replace(SLIDE_NUMBER_PH, slideNr); | |||
} | |||
private static String replaceTextCap(TextRun tr) { | |||
final TextParagraph tp = tr.getParagraph(); | |||
final TextShape sh = (tp != null) ? tp.getParentShape() : null; | |||
final TextParagraph<?,?,?> tp = tr.getParagraph(); | |||
final TextShape<?,?> sh = (tp != null) ? tp.getParentShape() : null; | |||
final Placeholder ph = (sh != null) ? sh.getPlaceholder() : null; | |||
// 0xB acts like cariage return in page titles and like blank in the others | |||
@@ -438,4 +438,19 @@ public class SlideShowExtractor< | |||
(italic == null || tr.isItalic() == italic) && | |||
(bold == null || tr.isBold() == bold); | |||
} | |||
@Override | |||
public void setCloseFilesystem(boolean doCloseFilesystem) { | |||
this.doCloseFilesystem = doCloseFilesystem; | |||
} | |||
@Override | |||
public boolean isCloseFilesystem() { | |||
return doCloseFilesystem; | |||
} | |||
@Override | |||
public SlideShow<S,P> getFilesystem() { | |||
return getDocument(); | |||
} | |||
} |
@@ -24,39 +24,39 @@ public interface ExcelExtractor { | |||
/** | |||
* Should sheet names be included? | |||
* Default is true | |||
* | |||
* | |||
* @param includeSheetNames {@code true} if the sheet names should be included | |||
*/ | |||
public void setIncludeSheetNames(boolean includeSheetNames); | |||
void setIncludeSheetNames(boolean includeSheetNames); | |||
/** | |||
* Should we return the formula itself, and not the result it produces? | |||
* Default is false | |||
* | |||
* | |||
* @param formulasNotResults {@code true} if the formula itself is returned | |||
*/ | |||
public void setFormulasNotResults(boolean formulasNotResults); | |||
void setFormulasNotResults(boolean formulasNotResults); | |||
/** | |||
* Should headers and footers be included in the output? | |||
* Default is true | |||
* | |||
* | |||
* @param includeHeadersFooters {@code true} if headers and footers should be included | |||
*/ | |||
public void setIncludeHeadersFooters(boolean includeHeadersFooters); | |||
void setIncludeHeadersFooters(boolean includeHeadersFooters); | |||
/** | |||
* Should cell comments be included? | |||
* Default is false | |||
* | |||
* | |||
* @param includeCellComments {@code true} if cell comments should be included | |||
*/ | |||
public void setIncludeCellComments(boolean includeCellComments); | |||
void setIncludeCellComments(boolean includeCellComments); | |||
/** | |||
* Retrieves the text contents of the file | |||
* | |||
* | |||
* @return the text contents of the file | |||
*/ | |||
public String getText(); | |||
String getText(); | |||
} |
@@ -29,6 +29,7 @@ module org.apache.poi.ooxml { | |||
requires java.security.jgss; | |||
provides org.apache.poi.ss.usermodel.WorkbookProvider with org.apache.poi.xssf.usermodel.XSSFWorkbookFactory; | |||
provides org.apache.poi.extractor.ExtractorProvider with org.apache.poi.ooxml.extractor.POIXMLExtractorFactory; | |||
exports org.apache.poi.xwpf.extractor; | |||
exports org.apache.poi.xwpf.usermodel; |
@@ -29,6 +29,7 @@ module org.apache.poi.ooxml { | |||
requires java.security.jgss; | |||
provides org.apache.poi.ss.usermodel.WorkbookProvider with org.apache.poi.xssf.usermodel.XSSFWorkbookFactory; | |||
provides org.apache.poi.extractor.ExtractorProvider with org.apache.poi.ooxml.extractor.POIXMLExtractorFactory; | |||
exports org.apache.poi.xwpf.extractor; | |||
exports org.apache.poi.xwpf.usermodel; |
@@ -28,8 +28,12 @@ module org.apache.poi.poi { | |||
requires jdk.unsupported; | |||
uses org.apache.poi.ss.usermodel.WorkbookProvider; | |||
uses org.apache.poi.extractor.ExtractorProvider; | |||
provides org.apache.poi.ss.usermodel.WorkbookProvider with org.apache.poi.hssf.usermodel.HSSFWorkbookFactory; | |||
provides org.apache.poi.extractor.ExtractorProvider with org.apache.poi.extractor.MainExtractorFactory; | |||
exports org.apache.poi; | |||
exports org.apache.poi.common; |
@@ -28,8 +28,10 @@ module org.apache.poi.poi { | |||
requires jdk.unsupported; | |||
uses org.apache.poi.ss.usermodel.WorkbookProvider; | |||
uses org.apache.poi.extractor.ExtractorProvider; | |||
provides org.apache.poi.ss.usermodel.WorkbookProvider with org.apache.poi.hssf.usermodel.HSSFWorkbookFactory; | |||
provides org.apache.poi.extractor.ExtractorProvider with org.apache.poi.extractor.MainExtractorFactory; | |||
exports org.apache.poi; | |||
exports org.apache.poi.common; |
@@ -20,6 +20,8 @@ module org.apache.poi.scratchpad { | |||
requires java.desktop; | |||
requires commons.math3; | |||
provides org.apache.poi.extractor.ExtractorProvider with org.apache.poi.extractor.ole2.OLE2ScratchpadExtractorFactory; | |||
exports org.apache.poi.hmef; | |||
exports org.apache.poi.hmef.dev; | |||
exports org.apache.poi.hmef.extractor; |
@@ -20,6 +20,8 @@ module org.apache.poi.scratchpad { | |||
requires java.desktop; | |||
requires commons.math3; | |||
provides org.apache.poi.extractor.ExtractorProvider with org.apache.poi.extractor.ole2.OLE2ScratchpadExtractorFactory; | |||
exports org.apache.poi.hmef; | |||
exports org.apache.poi.hmef.dev; | |||
exports org.apache.poi.hmef.extractor; |
@@ -18,15 +18,19 @@ package org.apache.poi.ooxml.extractor; | |||
import java.io.File; | |||
import org.apache.poi.extractor.ExtractorFactory; | |||
import org.apache.poi.extractor.POITextExtractor; | |||
/** | |||
* A command line wrapper around {@link ExtractorFactory}, useful | |||
* for when debugging. | |||
*/ | |||
public class CommandLineTextExtractor { | |||
public final class CommandLineTextExtractor { | |||
public static final String DIVIDER = "======================="; | |||
private CommandLineTextExtractor() { | |||
} | |||
public static void main(String[] args) throws Exception { | |||
if (args.length < 1) { | |||
System.err.println("Use:"); |
@@ -1,384 +0,0 @@ | |||
/* ==================================================================== | |||
Licensed to the Apache Software Foundation (ASF) under one or more | |||
contributor license agreements. See the NOTICE file distributed with | |||
this work for additional information regarding copyright ownership. | |||
The ASF licenses this file to You under the Apache License, Version 2.0 | |||
(the "License"); you may not use this file except in compliance with | |||
the License. You may obtain a copy of the License at | |||
http://www.apache.org/licenses/LICENSE-2.0 | |||
Unless required by applicable law or agreed to in writing, software | |||
distributed under the License is distributed on an "AS IS" BASIS, | |||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
See the License for the specific language governing permissions and | |||
limitations under the License. | |||
==================================================================== */ | |||
package org.apache.poi.ooxml.extractor; | |||
import java.io.File; | |||
import java.io.IOException; | |||
import java.io.InputStream; | |||
import java.lang.reflect.Method; | |||
import java.util.ArrayList; | |||
import java.util.Iterator; | |||
import java.util.List; | |||
import org.apache.poi.EncryptedDocumentException; | |||
import org.apache.poi.extractor.OLE2ExtractorFactory; | |||
import org.apache.poi.extractor.POIOLE2TextExtractor; | |||
import org.apache.poi.extractor.POITextExtractor; | |||
import org.apache.poi.hssf.extractor.ExcelExtractor; | |||
import org.apache.poi.hssf.record.crypto.Biff8EncryptionKey; | |||
import org.apache.poi.openxml4j.exceptions.OpenXML4JException; | |||
import org.apache.poi.openxml4j.opc.OPCPackage; | |||
import org.apache.poi.openxml4j.opc.PackageAccess; | |||
import org.apache.poi.openxml4j.opc.PackagePart; | |||
import org.apache.poi.openxml4j.opc.PackageRelationshipCollection; | |||
import org.apache.poi.openxml4j.opc.PackageRelationshipTypes; | |||
import org.apache.poi.poifs.crypt.Decryptor; | |||
import org.apache.poi.poifs.crypt.EncryptionInfo; | |||
import org.apache.poi.poifs.filesystem.DirectoryEntry; | |||
import org.apache.poi.poifs.filesystem.DirectoryNode; | |||
import org.apache.poi.poifs.filesystem.Entry; | |||
import org.apache.poi.poifs.filesystem.FileMagic; | |||
import org.apache.poi.poifs.filesystem.NotOLE2FileException; | |||
import org.apache.poi.poifs.filesystem.OfficeXmlFileException; | |||
import org.apache.poi.poifs.filesystem.POIFSFileSystem; | |||
import org.apache.poi.sl.extractor.SlideShowExtractor; | |||
import org.apache.poi.util.IOUtils; | |||
import org.apache.poi.util.NotImplemented; | |||
import org.apache.poi.util.POILogFactory; | |||
import org.apache.poi.util.POILogger; | |||
import org.apache.poi.xdgf.extractor.XDGFVisioExtractor; | |||
import org.apache.poi.xslf.usermodel.XMLSlideShow; | |||
import org.apache.poi.xslf.usermodel.XSLFRelation; | |||
import org.apache.poi.xssf.extractor.XSSFBEventBasedExcelExtractor; | |||
import org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor; | |||
import org.apache.poi.xssf.extractor.XSSFExcelExtractor; | |||
import org.apache.poi.xssf.usermodel.XSSFRelation; | |||
import org.apache.poi.xwpf.extractor.XWPFWordExtractor; | |||
import org.apache.poi.xwpf.usermodel.XWPFRelation; | |||
import org.apache.xmlbeans.XmlException; | |||
/** | |||
* Figures out the correct POITextExtractor for your supplied | |||
* document, and returns it. | |||
* | |||
* <p>Note 1 - will fail for many file formats if the POI Scratchpad jar is | |||
* not present on the runtime classpath</p> | |||
* <p>Note 2 - rather than using this, for most cases you would be better | |||
* off switching to <a href="http://tika.apache.org">Apache Tika</a> instead!</p> | |||
*/ | |||
@SuppressWarnings("WeakerAccess") | |||
public final class ExtractorFactory { | |||
private static final POILogger logger = POILogFactory.getLogger(ExtractorFactory.class); | |||
public static final String CORE_DOCUMENT_REL = PackageRelationshipTypes.CORE_DOCUMENT; | |||
private static final String VISIO_DOCUMENT_REL = PackageRelationshipTypes.VISIO_CORE_DOCUMENT; | |||
private static final String STRICT_DOCUMENT_REL = PackageRelationshipTypes.STRICT_CORE_DOCUMENT; | |||
private static final XSLFRelation[] SUPPORTED_XSLF_TYPES = new XSLFRelation[]{ | |||
XSLFRelation.MAIN, XSLFRelation.MACRO, XSLFRelation.MACRO_TEMPLATE, | |||
XSLFRelation.PRESENTATIONML, XSLFRelation.PRESENTATIONML_TEMPLATE, | |||
XSLFRelation.PRESENTATION_MACRO | |||
}; | |||
/**
 * Hidden constructor - the class is final and is not meant to be instantiated.
 */
private ExtractorFactory() {
    // intentionally empty
}
/** | |||
* Should this thread prefer event based over usermodel based extractors? | |||
* (usermodel extractors tend to be more accurate, but use more memory) | |||
* Default is false. | |||
*/ | |||
public static boolean getThreadPrefersEventExtractors() { | |||
return OLE2ExtractorFactory.getThreadPrefersEventExtractors(); | |||
} | |||
/** | |||
* Should all threads prefer event based over usermodel based extractors? | |||
* (usermodel extractors tend to be more accurate, but use more memory) | |||
* Default is to use the thread level setting, which defaults to false. | |||
*/ | |||
public static Boolean getAllThreadsPreferEventExtractors() { | |||
return OLE2ExtractorFactory.getAllThreadsPreferEventExtractors(); | |||
} | |||
/** | |||
* Should this thread prefer event based over usermodel based extractors? | |||
* Will only be used if the All Threads setting is null. | |||
*/ | |||
public static void setThreadPrefersEventExtractors(boolean preferEventExtractors) { | |||
OLE2ExtractorFactory.setThreadPrefersEventExtractors(preferEventExtractors); | |||
} | |||
/** | |||
* Should all threads prefer event based over usermodel based extractors? | |||
* If set, will take preference over the Thread level setting. | |||
*/ | |||
public static void setAllThreadsPreferEventExtractors(Boolean preferEventExtractors) { | |||
OLE2ExtractorFactory.setAllThreadsPreferEventExtractors(preferEventExtractors); | |||
} | |||
/** | |||
* Should this thread use event based extractors is available? | |||
* Checks the all-threads one first, then thread specific. | |||
*/ | |||
public static boolean getPreferEventExtractor() { | |||
return OLE2ExtractorFactory.getPreferEventExtractor(); | |||
} | |||
@SuppressWarnings("unchecked") | |||
public static <T extends POITextExtractor> T createExtractor(File f) throws IOException, OpenXML4JException, XmlException { | |||
POIFSFileSystem fs = null; | |||
try { | |||
fs = new POIFSFileSystem(f); | |||
if (fs.getRoot().hasEntry(Decryptor.DEFAULT_POIFS_ENTRY)) { | |||
return (T)createEncryptedOOXMLExtractor(fs); | |||
} | |||
POITextExtractor extractor = createExtractor(fs); | |||
extractor.setFilesystem(fs); | |||
return (T)extractor; | |||
} catch (OfficeXmlFileException e) { | |||
// ensure file-handle release | |||
IOUtils.closeQuietly(fs); | |||
OPCPackage pkg = OPCPackage.open(f.toString(), PackageAccess.READ); | |||
T t = (T)createExtractor(pkg); | |||
t.setFilesystem(pkg); | |||
return t; | |||
} catch (NotOLE2FileException ne) { | |||
// ensure file-handle release | |||
IOUtils.closeQuietly(fs); | |||
throw new IllegalArgumentException("Your File was neither an OLE2 file, nor an OOXML file", ne); | |||
} catch (OpenXML4JException | Error | RuntimeException | IOException | XmlException e) { // NOSONAR | |||
// ensure file-handle release | |||
IOUtils.closeQuietly(fs); | |||
throw e; | |||
} | |||
} | |||
public static POITextExtractor createExtractor(InputStream inp) throws IOException, OpenXML4JException, XmlException { | |||
InputStream is = FileMagic.prepareToCheckMagic(inp); | |||
FileMagic fm = FileMagic.valueOf(is); | |||
switch (fm) { | |||
case OLE2: | |||
POIFSFileSystem fs = new POIFSFileSystem(is); | |||
boolean isEncrypted = fs.getRoot().hasEntry(Decryptor.DEFAULT_POIFS_ENTRY); | |||
return isEncrypted ? createEncryptedOOXMLExtractor(fs) : createExtractor(fs); | |||
case OOXML: | |||
return createExtractor(OPCPackage.open(is)); | |||
default: | |||
throw new IllegalArgumentException("Your InputStream was neither an OLE2 stream, nor an OOXML stream, found type: " + fm); | |||
} | |||
} | |||
/** | |||
* Tries to determine the actual type of file and produces a matching text-extractor for it. | |||
* | |||
* @param pkg An {@link OPCPackage}. | |||
* @return A {@link POIXMLTextExtractor} for the given file. | |||
* @throws IOException If an error occurs while reading the file | |||
* @throws OpenXML4JException If an error parsing the OpenXML file format is found. | |||
* @throws XmlException If an XML parsing error occurs. | |||
* @throws IllegalArgumentException If no matching file type could be found. | |||
*/ | |||
public static POITextExtractor createExtractor(OPCPackage pkg) throws IOException, OpenXML4JException, XmlException { | |||
try { | |||
// Check for the normal Office core document | |||
PackageRelationshipCollection core; | |||
core = pkg.getRelationshipsByType(CORE_DOCUMENT_REL); | |||
// If nothing was found, try some of the other OOXML-based core types | |||
if (core.size() == 0) { | |||
// Could it be an OOXML-Strict one? | |||
core = pkg.getRelationshipsByType(STRICT_DOCUMENT_REL); | |||
} | |||
if (core.size() == 0) { | |||
// Could it be a visio one? | |||
core = pkg.getRelationshipsByType(VISIO_DOCUMENT_REL); | |||
if (core.size() == 1) | |||
return new XDGFVisioExtractor(pkg); | |||
} | |||
// Should just be a single core document, complain if not | |||
if (core.size() != 1) { | |||
throw new IllegalArgumentException("Invalid OOXML Package received - expected 1 core document, found " + core.size()); | |||
} | |||
// Grab the core document part, and try to identify from that | |||
final PackagePart corePart = pkg.getPart(core.getRelationship(0)); | |||
final String contentType = corePart.getContentType(); | |||
// Is it XSSF? | |||
for (XSSFRelation rel : XSSFExcelExtractor.SUPPORTED_TYPES) { | |||
if ( rel.getContentType().equals( contentType ) ) { | |||
if (getPreferEventExtractor()) { | |||
return new XSSFEventBasedExcelExtractor(pkg); | |||
} | |||
return new XSSFExcelExtractor(pkg); | |||
} | |||
} | |||
// Is it XWPF? | |||
for (XWPFRelation rel : XWPFWordExtractor.SUPPORTED_TYPES) { | |||
if ( rel.getContentType().equals( contentType ) ) { | |||
return new XWPFWordExtractor(pkg); | |||
} | |||
} | |||
// Is it XSLF? | |||
for (XSLFRelation rel : SUPPORTED_XSLF_TYPES) { | |||
if ( rel.getContentType().equals( contentType ) ) { | |||
return new SlideShowExtractor<>(new XMLSlideShow(pkg)); | |||
} | |||
} | |||
// special handling for SlideShow-Theme-files, | |||
if (XSLFRelation.THEME_MANAGER.getContentType().equals(contentType)) { | |||
return new SlideShowExtractor<>(new XMLSlideShow(pkg)); | |||
} | |||
// How about xlsb? | |||
for (XSSFRelation rel : XSSFBEventBasedExcelExtractor.SUPPORTED_TYPES) { | |||
if (rel.getContentType().equals(contentType)) { | |||
return new XSSFBEventBasedExcelExtractor(pkg); | |||
} | |||
} | |||
throw new IllegalArgumentException("No supported documents found in the OOXML package (found "+contentType+")"); | |||
} catch (IOException | Error | RuntimeException | XmlException | OpenXML4JException e) { // NOSONAR | |||
// ensure that we close the package again if there is an error opening it, however | |||
// we need to revert the package to not re-write the file via close(), which is very likely not wanted for a TextExtractor! | |||
pkg.revert(); | |||
throw e; | |||
} | |||
} | |||
public static <T extends POITextExtractor> T createExtractor(POIFSFileSystem fs) throws IOException, OpenXML4JException, XmlException { | |||
return createExtractor(fs.getRoot()); | |||
} | |||
@SuppressWarnings("unchecked") | |||
public static <T extends POITextExtractor> T createExtractor(DirectoryNode poifsDir) throws IOException, OpenXML4JException, XmlException | |||
{ | |||
// First, check for OOXML | |||
for (String entryName : poifsDir.getEntryNames()) { | |||
if (entryName.equals("Package")) { | |||
OPCPackage pkg = OPCPackage.open(poifsDir.createDocumentInputStream("Package")); | |||
return (T)createExtractor(pkg); | |||
} | |||
} | |||
// If not, ask the OLE2 code to check, with Scratchpad if possible | |||
return (T)OLE2ExtractorFactory.createExtractor(poifsDir); | |||
} | |||
/** | |||
* Returns an array of text extractors, one for each of | |||
* the embedded documents in the file (if there are any). | |||
* If there are no embedded documents, you'll get back an | |||
* empty array. Otherwise, you'll get one open | |||
* {@link POITextExtractor} for each embedded file. | |||
*/ | |||
public static POITextExtractor[] getEmbeddedDocsTextExtractors(POIOLE2TextExtractor ext) throws IOException, OpenXML4JException, XmlException { | |||
// All the embedded directories we spotted | |||
ArrayList<Entry> dirs = new ArrayList<>(); | |||
// For anything else not directly held in as a POIFS directory | |||
ArrayList<InputStream> nonPOIFS = new ArrayList<>(); | |||
// Find all the embedded directories | |||
DirectoryEntry root = ext.getRoot(); | |||
if (root == null) { | |||
throw new IllegalStateException("The extractor didn't know which POIFS it came from!"); | |||
} | |||
// provide ExcelExtractor also in OOXML module, because scratchpad is not necessary for it | |||
if (ext instanceof ExcelExtractor) { | |||
// These are in MBD... under the root | |||
Iterator<Entry> it = root.getEntries(); | |||
while (it.hasNext()) { | |||
Entry entry = it.next(); | |||
if (entry.getName().startsWith("MBD")) { | |||
dirs.add(entry); | |||
} | |||
} | |||
} else { | |||
try { | |||
Class<?> clazz = Class.forName("org.apache.poi.extractor.ole2.OLE2ScratchpadExtractorFactory"); | |||
Method m = clazz.getDeclaredMethod("identifyEmbeddedResources", POIOLE2TextExtractor.class, List.class, List.class); | |||
m.invoke(null, ext, dirs, nonPOIFS); | |||
} catch (ReflectiveOperationException e) { | |||
logger.log(POILogger.WARN, "POI Scratchpad jar not included ", e.getLocalizedMessage()); | |||
return new POITextExtractor[0]; | |||
} | |||
} | |||
// Create the extractors | |||
if (dirs.size() == 0 && nonPOIFS.size() == 0){ | |||
return new POITextExtractor[0]; | |||
} | |||
ArrayList<POITextExtractor> textExtractors = new ArrayList<>(); | |||
for (Entry dir : dirs) { | |||
textExtractors.add(createExtractor((DirectoryNode) dir)); | |||
} | |||
for (InputStream nonPOIF : nonPOIFS) { | |||
try { | |||
textExtractors.add(createExtractor(nonPOIF)); | |||
} catch (IllegalArgumentException e) { | |||
// Ignore, just means it didn't contain | |||
// a format we support as yet | |||
logger.log(POILogger.INFO, "Format not supported yet", e.getLocalizedMessage()); | |||
} catch (XmlException | OpenXML4JException e) { | |||
throw new IOException(e.getMessage(), e); | |||
} | |||
} | |||
return textExtractors.toArray(new POITextExtractor[0]); | |||
} | |||
/** | |||
* Returns an array of text extractors, one for each of | |||
* the embedded documents in the file (if there are any). | |||
* If there are no embedded documents, you'll get back an | |||
* empty array. Otherwise, you'll get one open | |||
* {@link POITextExtractor} for each embedded file. | |||
*/ | |||
@NotImplemented | |||
@SuppressWarnings({"UnusedParameters", "UnusedReturnValue"}) | |||
public static POITextExtractor[] getEmbeddedDocsTextExtractors(POIXMLTextExtractor ext) { | |||
throw new IllegalStateException("Not yet supported"); | |||
} | |||
private static POITextExtractor createEncryptedOOXMLExtractor(POIFSFileSystem fs) | |||
throws IOException { | |||
String pass = Biff8EncryptionKey.getCurrentUserPassword(); | |||
if (pass == null) { | |||
pass = Decryptor.DEFAULT_PASSWORD; | |||
} | |||
EncryptionInfo ei = new EncryptionInfo(fs); | |||
Decryptor dec = ei.getDecryptor(); | |||
InputStream is = null; | |||
try { | |||
if (!dec.verifyPassword(pass)) { | |||
throw new EncryptedDocumentException("Invalid password specified - use Biff8EncryptionKey.setCurrentUserPassword() before calling extractor"); | |||
} | |||
is = dec.getDataStream(fs); | |||
return createExtractor(OPCPackage.open(is)); | |||
} catch (IOException e) { | |||
throw e; | |||
} catch (Exception e) { | |||
throw new EncryptedDocumentException(e); | |||
} finally { | |||
IOUtils.closeQuietly(is); | |||
// also close the POIFSFileSystem here as we read all the data | |||
// while decrypting | |||
fs.close(); | |||
} | |||
} | |||
} |
@@ -0,0 +1,281 @@ | |||
/* ==================================================================== | |||
Licensed to the Apache Software Foundation (ASF) under one or more | |||
contributor license agreements. See the NOTICE file distributed with | |||
this work for additional information regarding copyright ownership. | |||
The ASF licenses this file to You under the Apache License, Version 2.0 | |||
(the "License"); you may not use this file except in compliance with | |||
the License. You may obtain a copy of the License at | |||
http://www.apache.org/licenses/LICENSE-2.0 | |||
Unless required by applicable law or agreed to in writing, software | |||
distributed under the License is distributed on an "AS IS" BASIS, | |||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
See the License for the specific language governing permissions and | |||
limitations under the License. | |||
==================================================================== */ | |||
package org.apache.poi.ooxml.extractor; | |||
import java.io.File; | |||
import java.io.IOException; | |||
import java.io.InputStream; | |||
import org.apache.poi.extractor.ExtractorFactory; | |||
import org.apache.poi.extractor.ExtractorProvider; | |||
import org.apache.poi.extractor.POITextExtractor; | |||
import org.apache.poi.hssf.record.crypto.Biff8EncryptionKey; | |||
import org.apache.poi.openxml4j.exceptions.InvalidFormatException; | |||
import org.apache.poi.openxml4j.exceptions.OpenXML4JException; | |||
import org.apache.poi.openxml4j.opc.OPCPackage; | |||
import org.apache.poi.openxml4j.opc.PackageAccess; | |||
import org.apache.poi.openxml4j.opc.PackagePart; | |||
import org.apache.poi.openxml4j.opc.PackageRelationshipCollection; | |||
import org.apache.poi.openxml4j.opc.PackageRelationshipTypes; | |||
import org.apache.poi.poifs.crypt.Decryptor; | |||
import org.apache.poi.poifs.crypt.EncryptionInfo; | |||
import org.apache.poi.poifs.filesystem.DirectoryNode; | |||
import org.apache.poi.poifs.filesystem.FileMagic; | |||
import org.apache.poi.poifs.filesystem.POIFSFileSystem; | |||
import org.apache.poi.xdgf.extractor.XDGFVisioExtractor; | |||
import org.apache.poi.xslf.extractor.XSLFExtractor; | |||
import org.apache.poi.xslf.usermodel.XMLSlideShow; | |||
import org.apache.poi.xslf.usermodel.XSLFRelation; | |||
import org.apache.poi.xssf.extractor.XSSFBEventBasedExcelExtractor; | |||
import org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor; | |||
import org.apache.poi.xssf.extractor.XSSFExcelExtractor; | |||
import org.apache.poi.xssf.usermodel.XSSFRelation; | |||
import org.apache.poi.xwpf.extractor.XWPFWordExtractor; | |||
import org.apache.poi.xwpf.usermodel.XWPFRelation; | |||
import org.apache.xmlbeans.XmlException; | |||
/** | |||
* Figures out the correct POITextExtractor for your supplied | |||
* document, and returns it. | |||
* | |||
* <p>Note 1 - will fail for many file formats if the POI Scratchpad jar is | |||
* not present on the runtime classpath</p> | |||
* <p>Note 2 - rather than using this, for most cases you would be better | |||
* off switching to <a href="http://tika.apache.org">Apache Tika</a> instead!</p> | |||
*/ | |||
@SuppressWarnings("WeakerAccess") | |||
public final class POIXMLExtractorFactory implements ExtractorProvider { | |||
private static final String CORE_DOCUMENT_REL = PackageRelationshipTypes.CORE_DOCUMENT; | |||
private static final String VISIO_DOCUMENT_REL = PackageRelationshipTypes.VISIO_CORE_DOCUMENT; | |||
private static final String STRICT_DOCUMENT_REL = PackageRelationshipTypes.STRICT_CORE_DOCUMENT; | |||
private static final XSLFRelation[] SUPPORTED_XSLF_TYPES = new XSLFRelation[]{ | |||
XSLFRelation.MAIN, XSLFRelation.MACRO, XSLFRelation.MACRO_TEMPLATE, | |||
XSLFRelation.PRESENTATIONML, XSLFRelation.PRESENTATIONML_TEMPLATE, | |||
XSLFRelation.PRESENTATION_MACRO | |||
}; | |||
@Override | |||
public boolean accepts(FileMagic fm) { | |||
return fm == FileMagic.OOXML; | |||
} | |||
/** | |||
* Should this thread prefer event based over usermodel based extractors? | |||
* (usermodel extractors tend to be more accurate, but use more memory) | |||
* Default is false. | |||
*/ | |||
public static boolean getThreadPrefersEventExtractors() { | |||
return ExtractorFactory.getThreadPrefersEventExtractors(); | |||
} | |||
/** | |||
* Should all threads prefer event based over usermodel based extractors? | |||
* (usermodel extractors tend to be more accurate, but use more memory) | |||
* Default is to use the thread level setting, which defaults to false. | |||
*/ | |||
public static Boolean getAllThreadsPreferEventExtractors() { | |||
return ExtractorFactory.getAllThreadsPreferEventExtractors(); | |||
} | |||
/** | |||
* Should this thread prefer event based over usermodel based extractors? | |||
* Will only be used if the All Threads setting is null. | |||
*/ | |||
public static void setThreadPrefersEventExtractors(boolean preferEventExtractors) { | |||
ExtractorFactory.setThreadPrefersEventExtractors(preferEventExtractors); | |||
} | |||
/** | |||
* Should all threads prefer event based over usermodel based extractors? | |||
* If set, will take preference over the Thread level setting. | |||
*/ | |||
public static void setAllThreadsPreferEventExtractors(Boolean preferEventExtractors) { | |||
ExtractorFactory.setAllThreadsPreferEventExtractors(preferEventExtractors); | |||
} | |||
/** | |||
* Should this thread use event based extractors is available? | |||
* Checks the all-threads one first, then thread specific. | |||
*/ | |||
public static boolean getPreferEventExtractor() { | |||
return ExtractorFactory.getPreferEventExtractor(); | |||
} | |||
@Override | |||
public POITextExtractor create(File f, String password) throws IOException { | |||
if (FileMagic.valueOf(f) != FileMagic.OOXML) { | |||
return ExtractorFactory.createExtractor(f, password); | |||
} | |||
OPCPackage pkg = null; | |||
try { | |||
pkg = OPCPackage.open(f.toString(), PackageAccess.READ); | |||
POIXMLTextExtractor ex = create(pkg); | |||
if (ex == null) { | |||
pkg.revert(); | |||
} | |||
return ex; | |||
} catch (InvalidFormatException ife) { | |||
throw new IOException(ife); | |||
} catch (IOException e) { | |||
pkg.revert(); | |||
throw e; | |||
} | |||
} | |||
public POITextExtractor create(InputStream inp, String password) throws IOException { | |||
InputStream is = FileMagic.prepareToCheckMagic(inp); | |||
if (FileMagic.valueOf(is) != FileMagic.OOXML) { | |||
return ExtractorFactory.createExtractor(is, password); | |||
} | |||
OPCPackage pkg = null; | |||
try { | |||
pkg = OPCPackage.open(is); | |||
POIXMLTextExtractor ex = create(pkg); | |||
if (ex == null) { | |||
pkg.revert(); | |||
} | |||
return ex; | |||
} catch (InvalidFormatException e) { | |||
throw new IOException(e); | |||
} catch (RuntimeException | IOException e) { | |||
if (pkg != null) { | |||
pkg.revert(); | |||
} | |||
throw e; | |||
} | |||
} | |||
/** | |||
* Tries to determine the actual type of file and produces a matching text-extractor for it. | |||
* | |||
* @param pkg An {@link OPCPackage}. | |||
* @return A {@link POIXMLTextExtractor} for the given file. | |||
* @throws IOException If an error occurs while reading the file | |||
* @throws IllegalArgumentException If no matching file type could be found. | |||
*/ | |||
public POIXMLTextExtractor create(OPCPackage pkg) throws IOException { | |||
try { | |||
// Check for the normal Office core document | |||
PackageRelationshipCollection core; | |||
core = pkg.getRelationshipsByType(CORE_DOCUMENT_REL); | |||
// If nothing was found, try some of the other OOXML-based core types | |||
if (core.size() == 0) { | |||
// Could it be an OOXML-Strict one? | |||
core = pkg.getRelationshipsByType(STRICT_DOCUMENT_REL); | |||
} | |||
if (core.size() == 0) { | |||
// Could it be a visio one? | |||
core = pkg.getRelationshipsByType(VISIO_DOCUMENT_REL); | |||
if (core.size() == 1) { | |||
return new XDGFVisioExtractor(pkg); | |||
} | |||
} | |||
// Should just be a single core document, complain if not | |||
if (core.size() != 1) { | |||
throw new IllegalArgumentException("Invalid OOXML Package received - expected 1 core document, found " + core.size()); | |||
} | |||
// Grab the core document part, and try to identify from that | |||
final PackagePart corePart = pkg.getPart(core.getRelationship(0)); | |||
final String contentType = corePart.getContentType(); | |||
// Is it XSSF? | |||
for (XSSFRelation rel : XSSFExcelExtractor.SUPPORTED_TYPES) { | |||
if (rel.getContentType().equals(contentType)) { | |||
if (getPreferEventExtractor()) { | |||
return new XSSFEventBasedExcelExtractor(pkg); | |||
} | |||
return new XSSFExcelExtractor(pkg); | |||
} | |||
} | |||
// Is it XWPF? | |||
for (XWPFRelation rel : XWPFWordExtractor.SUPPORTED_TYPES) { | |||
if (rel.getContentType().equals(contentType)) { | |||
return new XWPFWordExtractor(pkg); | |||
} | |||
} | |||
// Is it XSLF? | |||
for (XSLFRelation rel : SUPPORTED_XSLF_TYPES) { | |||
if (rel.getContentType().equals(contentType)) { | |||
return new XSLFExtractor(new XMLSlideShow(pkg)); | |||
} | |||
} | |||
// special handling for SlideShow-Theme-files, | |||
if (XSLFRelation.THEME_MANAGER.getContentType().equals(contentType)) { | |||
return new XSLFExtractor(new XMLSlideShow(pkg)); | |||
} | |||
// How about xlsb? | |||
for (XSSFRelation rel : XSSFBEventBasedExcelExtractor.SUPPORTED_TYPES) { | |||
if (rel.getContentType().equals(contentType)) { | |||
return new XSSFBEventBasedExcelExtractor(pkg); | |||
} | |||
} | |||
return null; | |||
} catch (IOException e) { | |||
throw e; | |||
} catch (Error | RuntimeException | XmlException | OpenXML4JException e) { // NOSONAR | |||
throw new IOException(e); | |||
} | |||
// we used to close (revert()) the package here, but this is the callers responsibility | |||
// and we can't reuse the package | |||
} | |||
public POITextExtractor create(POIFSFileSystem fs) throws IOException { | |||
return create(fs.getRoot(), Biff8EncryptionKey.getCurrentUserPassword()); | |||
} | |||
@Override | |||
public POITextExtractor create(DirectoryNode poifsDir, String password) throws IOException { | |||
// First, check for plain OOXML package | |||
if (poifsDir.hasEntry("Package")) { | |||
try (InputStream is = poifsDir.createDocumentInputStream("Package")) { | |||
return create(is, password); | |||
} | |||
} | |||
if (poifsDir.hasEntry(Decryptor.DEFAULT_POIFS_ENTRY)) { | |||
EncryptionInfo ei = new EncryptionInfo(poifsDir); | |||
Decryptor dec = ei.getDecryptor(); | |||
try { | |||
if (!dec.verifyPassword(password)) { | |||
throw new IOException("Invalid password specified"); | |||
} | |||
try (InputStream is = dec.getDataStream(poifsDir)) { | |||
return create(is, password); | |||
} | |||
} catch (IOException e) { | |||
throw e; | |||
} catch (Exception e) { | |||
throw new IOException(e); | |||
} | |||
} | |||
throw new IOException("The OLE2 file neither contained a plain OOXML package node (\"Package\") nor an encrypted one (\"EncryptedPackage\")."); | |||
} | |||
} |
@@ -36,9 +36,10 @@ import org.openxmlformats.schemas.officeDocument.x2006.customProperties.CTProper | |||
* content of the OOXML file properties, eg author | |||
* and title. | |||
*/ | |||
public class POIXMLPropertiesTextExtractor extends POIXMLTextExtractor { | |||
public class POIXMLPropertiesTextExtractor implements POIXMLTextExtractor { | |||
private final POIXMLDocument doc; | |||
private final DateFormat dateFormat; | |||
private boolean doCloseFilesystem = true; | |||
/** | |||
* Creates a new POIXMLPropertiesTextExtractor for the given open document. | |||
@@ -46,7 +47,7 @@ public class POIXMLPropertiesTextExtractor extends POIXMLTextExtractor { | |||
* @param doc the given open document | |||
*/ | |||
public POIXMLPropertiesTextExtractor(POIXMLDocument doc) { | |||
super(doc); | |||
this.doc = doc; | |||
DateFormatSymbols dfs = DateFormatSymbols.getInstance(Locale.ROOT); | |||
dateFormat = new SimpleDateFormat("EEE MMM dd HH:mm:ss zzz yyyy", dfs); | |||
dateFormat.setTimeZone(LocaleUtil.TIMEZONE_UTC); | |||
@@ -242,7 +243,7 @@ public class POIXMLPropertiesTextExtractor extends POIXMLTextExtractor { | |||
} | |||
/*else if (property.isSetArray()) { | |||
// TODO Fetch the array values and output | |||
// TODO Fetch the array values and output | |||
} | |||
else if (property.isSetVector()) { | |||
// TODO Fetch the vector values and output | |||
@@ -281,4 +282,24 @@ public class POIXMLPropertiesTextExtractor extends POIXMLTextExtractor { | |||
public POIXMLPropertiesTextExtractor getMetadataTextExtractor() { | |||
throw new IllegalStateException("You already have the Metadata Text Extractor, not recursing!"); | |||
} | |||
@Override | |||
public POIXMLDocument getDocument() { | |||
return doc; | |||
} | |||
@Override | |||
public void setCloseFilesystem(boolean doCloseFilesystem) { | |||
this.doCloseFilesystem = doCloseFilesystem; | |||
} | |||
@Override | |||
public boolean isCloseFilesystem() { | |||
return doCloseFilesystem; | |||
} | |||
@Override | |||
public POIXMLDocument getFilesystem() { | |||
return null; | |||
} | |||
} |
@@ -27,61 +27,48 @@ import org.apache.poi.ooxml.POIXMLProperties.ExtendedProperties; | |||
import org.apache.poi.openxml4j.opc.OPCPackage; | |||
import org.apache.poi.openxml4j.util.ZipSecureFile; | |||
public abstract class POIXMLTextExtractor extends POITextExtractor { | |||
/** The POIXMLDocument that's open */ | |||
private final POIXMLDocument _document; | |||
/** | |||
* Creates a new text extractor for the given document | |||
* | |||
* @param document the document to extract from | |||
*/ | |||
public POIXMLTextExtractor(POIXMLDocument document) { | |||
_document = document; | |||
} | |||
public interface POIXMLTextExtractor extends POITextExtractor { | |||
/** | |||
* Returns the core document properties | |||
* | |||
* | |||
* @return the core document properties | |||
*/ | |||
public CoreProperties getCoreProperties() { | |||
return _document.getProperties().getCoreProperties(); | |||
default CoreProperties getCoreProperties() { | |||
return getDocument().getProperties().getCoreProperties(); | |||
} | |||
/** | |||
* Returns the extended document properties | |||
* | |||
* | |||
* @return the extended document properties | |||
*/ | |||
public ExtendedProperties getExtendedProperties() { | |||
return _document.getProperties().getExtendedProperties(); | |||
default ExtendedProperties getExtendedProperties() { | |||
return getDocument().getProperties().getExtendedProperties(); | |||
} | |||
/** | |||
* Returns the custom document properties | |||
* | |||
* | |||
* @return the custom document properties | |||
*/ | |||
public CustomProperties getCustomProperties() { | |||
return _document.getProperties().getCustomProperties(); | |||
default CustomProperties getCustomProperties() { | |||
return getDocument().getProperties().getCustomProperties(); | |||
} | |||
/** | |||
* Returns opened document | |||
* | |||
* | |||
* @return the opened document | |||
*/ | |||
@Override | |||
public final POIXMLDocument getDocument() { | |||
return _document; | |||
} | |||
POIXMLDocument getDocument(); | |||
/** | |||
* Returns the opened OPCPackage that contains the document | |||
* | |||
* | |||
* @return the opened OPCPackage | |||
*/ | |||
public OPCPackage getPackage() { | |||
return _document.getPackage(); | |||
default OPCPackage getPackage() { | |||
POIXMLDocument doc = getDocument(); | |||
return doc != null ? doc.getPackage() : null; | |||
} | |||
/** | |||
@@ -89,25 +76,24 @@ public abstract class POIXMLTextExtractor extends POITextExtractor { | |||
* document properties metadata, such as title and author. | |||
*/ | |||
@Override | |||
public POIXMLPropertiesTextExtractor getMetadataTextExtractor() { | |||
return new POIXMLPropertiesTextExtractor(_document); | |||
default POIXMLPropertiesTextExtractor getMetadataTextExtractor() { | |||
return new POIXMLPropertiesTextExtractor(getDocument()); | |||
} | |||
@Override | |||
public void close() throws IOException { | |||
default void close() throws IOException { | |||
// e.g. XSSFEventBaseExcelExtractor passes a null-document | |||
if(_document != null) { | |||
if (isCloseFilesystem()) { | |||
@SuppressWarnings("resource") | |||
OPCPackage pkg = _document.getPackage(); | |||
if(pkg != null) { | |||
OPCPackage pkg = getPackage(); | |||
if (pkg != null) { | |||
// revert the package to not re-write the file, which is very likely not wanted for a TextExtractor! | |||
pkg.revert(); | |||
} | |||
} | |||
super.close(); | |||
} | |||
protected void checkMaxTextSize(CharSequence text, String string) { | |||
default void checkMaxTextSize(CharSequence text, String string) { | |||
if(string == null) { | |||
return; | |||
} |
@@ -18,7 +18,6 @@ package org.apache.poi.xdgf.extractor; | |||
import java.io.IOException; | |||
import org.apache.poi.ooxml.POIXMLDocument; | |||
import org.apache.poi.ooxml.extractor.POIXMLTextExtractor; | |||
import org.apache.poi.openxml4j.opc.OPCPackage; | |||
import org.apache.poi.xdgf.usermodel.XDGFPage; | |||
@@ -28,12 +27,12 @@ import org.apache.poi.xdgf.usermodel.shape.ShapeTextVisitor; | |||
/** | |||
* Helper class to extract text from an OOXML Visio File | |||
*/ | |||
public class XDGFVisioExtractor extends POIXMLTextExtractor { | |||
public class XDGFVisioExtractor implements POIXMLTextExtractor { | |||
protected final XmlVisioDocument document; | |||
private boolean doCloseFilesystem = true; | |||
public XDGFVisioExtractor(XmlVisioDocument document) { | |||
super(document); | |||
this.document = document; | |||
} | |||
@@ -43,25 +42,31 @@ public class XDGFVisioExtractor extends POIXMLTextExtractor { | |||
public String getText() { | |||
ShapeTextVisitor visitor = new ShapeTextVisitor(); | |||
for (XDGFPage page: document.getPages()) { | |||
page.getContent().visitShapes(visitor); | |||
} | |||
return visitor.getText(); | |||
} | |||
public static void main(String [] args) throws IOException { | |||
if (args.length < 1) { | |||
System.err.println("Use:"); | |||
System.err.println(" XDGFVisioExtractor <filename.vsdx>"); | |||
System.exit(1); | |||
} | |||
POIXMLTextExtractor extractor = | |||
new XDGFVisioExtractor(POIXMLDocument.openPackage( | |||
args[0] | |||
)); | |||
System.out.println(extractor.getText()); | |||
extractor.close(); | |||
@Override | |||
public XmlVisioDocument getDocument() { | |||
return document; | |||
} | |||
@Override | |||
public void setCloseFilesystem(boolean doCloseFilesystem) { | |||
this.doCloseFilesystem = doCloseFilesystem; | |||
} | |||
@Override | |||
public boolean isCloseFilesystem() { | |||
return doCloseFilesystem; | |||
} | |||
@Override | |||
public XmlVisioDocument getFilesystem() { | |||
return document; | |||
} | |||
} |
@@ -0,0 +1,45 @@ | |||
/* ==================================================================== | |||
Licensed to the Apache Software Foundation (ASF) under one or more | |||
contributor license agreements. See the NOTICE file distributed with | |||
this work for additional information regarding copyright ownership. | |||
The ASF licenses this file to You under the Apache License, Version 2.0 | |||
(the "License"); you may not use this file except in compliance with | |||
the License. You may obtain a copy of the License at | |||
http://www.apache.org/licenses/LICENSE-2.0 | |||
Unless required by applicable law or agreed to in writing, software | |||
distributed under the License is distributed on an "AS IS" BASIS, | |||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
See the License for the specific language governing permissions and | |||
limitations under the License. | |||
==================================================================== */ | |||
package org.apache.poi.xslf.extractor; | |||
import org.apache.poi.ooxml.extractor.POIXMLPropertiesTextExtractor; | |||
import org.apache.poi.ooxml.extractor.POIXMLTextExtractor; | |||
import org.apache.poi.sl.extractor.SlideShowExtractor; | |||
import org.apache.poi.xslf.usermodel.XMLSlideShow; | |||
import org.apache.poi.xslf.usermodel.XSLFShape; | |||
import org.apache.poi.xslf.usermodel.XSLFTextParagraph; | |||
/** | |||
* Helper class to extract text from an OOXML Powerpoint file | |||
*/ | |||
public class XSLFExtractor extends SlideShowExtractor<XSLFShape, XSLFTextParagraph> implements POIXMLTextExtractor { | |||
public XSLFExtractor(XMLSlideShow slideshow) { | |||
super(slideshow); | |||
} | |||
@Override | |||
public XMLSlideShow getDocument() { | |||
return (XMLSlideShow)slideshow; | |||
} | |||
@Override | |||
public POIXMLPropertiesTextExtractor getMetadataTextExtractor() { | |||
return POIXMLTextExtractor.super.getMetadataTextExtractor(); | |||
} | |||
} |
@@ -19,7 +19,6 @@ package org.apache.poi.xssf.extractor; | |||
import java.io.IOException; | |||
import java.io.InputStream; | |||
import org.apache.poi.ooxml.extractor.POIXMLTextExtractor; | |||
import org.apache.poi.openxml4j.exceptions.OpenXML4JException; | |||
import org.apache.poi.openxml4j.opc.OPCPackage; | |||
import org.apache.poi.ss.usermodel.DataFormatter; | |||
@@ -43,8 +42,7 @@ import org.xml.sax.SAXException; | |||
* | |||
* @since 3.16-beta3 | |||
*/ | |||
public class XSSFBEventBasedExcelExtractor extends XSSFEventBasedExcelExtractor | |||
implements org.apache.poi.ss.extractor.ExcelExtractor { | |||
public class XSSFBEventBasedExcelExtractor extends XSSFEventBasedExcelExtractor { | |||
private static final POILogger LOGGER = POILogFactory.getLogger(XSSFBEventBasedExcelExtractor.class); | |||
@@ -62,18 +60,6 @@ public class XSSFBEventBasedExcelExtractor extends XSSFEventBasedExcelExtractor | |||
super(container); | |||
} | |||
public static void main(String[] args) throws Exception { | |||
if (args.length < 1) { | |||
System.err.println("Use:"); | |||
System.err.println(" XSSFBEventBasedExcelExtractor <filename.xlsb>"); | |||
System.exit(1); | |||
} | |||
POIXMLTextExtractor extractor = | |||
new XSSFBEventBasedExcelExtractor(args[0]); | |||
System.out.println(extractor.getText()); | |||
extractor.close(); | |||
} | |||
public void setHandleHyperlinksInCells(boolean handleHyperlinksInCells) { | |||
this.handleHyperlinksInCells = handleHyperlinksInCells; | |||
} |
@@ -25,6 +25,7 @@ import java.util.Map; | |||
import javax.xml.parsers.ParserConfigurationException; | |||
import org.apache.poi.ooxml.POIXMLDocument; | |||
import org.apache.poi.ooxml.POIXMLProperties; | |||
import org.apache.poi.ooxml.POIXMLProperties.CoreProperties; | |||
import org.apache.poi.ooxml.POIXMLProperties.CustomProperties; | |||
@@ -57,13 +58,13 @@ import org.xml.sax.XMLReader; | |||
* Implementation of a text extractor from OOXML Excel | |||
* files that uses SAX event based parsing. | |||
*/ | |||
public class XSSFEventBasedExcelExtractor extends POIXMLTextExtractor | |||
implements org.apache.poi.ss.extractor.ExcelExtractor { | |||
public class XSSFEventBasedExcelExtractor | |||
implements POIXMLTextExtractor, org.apache.poi.ss.extractor.ExcelExtractor { | |||
private static final POILogger LOGGER = POILogFactory.getLogger(XSSFEventBasedExcelExtractor.class); | |||
protected OPCPackage container; | |||
protected POIXMLProperties properties; | |||
protected final OPCPackage container; | |||
protected final POIXMLProperties properties; | |||
protected Locale locale; | |||
protected boolean includeTextBoxes = true; | |||
@@ -73,29 +74,17 @@ public class XSSFEventBasedExcelExtractor extends POIXMLTextExtractor | |||
protected boolean formulasNotResults; | |||
protected boolean concatenatePhoneticRuns = true; | |||
private boolean doCloseFilesystem = true; | |||
public XSSFEventBasedExcelExtractor(String path) throws XmlException, OpenXML4JException, IOException { | |||
this(OPCPackage.open(path)); | |||
} | |||
public XSSFEventBasedExcelExtractor(OPCPackage container) throws XmlException, OpenXML4JException, IOException { | |||
super(null); | |||
this.container = container; | |||
properties = new POIXMLProperties(container); | |||
} | |||
public static void main(String[] args) throws Exception { | |||
if (args.length < 1) { | |||
System.err.println("Use:"); | |||
System.err.println(" XSSFEventBasedExcelExtractor <filename.xlsx>"); | |||
System.exit(1); | |||
} | |||
POIXMLTextExtractor extractor = | |||
new XSSFEventBasedExcelExtractor(args[0]); | |||
System.out.println(extractor.getText()); | |||
extractor.close(); | |||
} | |||
/** | |||
* Should sheet names be included? Default is true | |||
*/ | |||
@@ -319,12 +308,23 @@ public class XSSFEventBasedExcelExtractor extends POIXMLTextExtractor | |||
} | |||
@Override | |||
public void close() throws IOException { | |||
if (container != null) { | |||
container.close(); | |||
container = null; | |||
} | |||
super.close(); | |||
public POIXMLDocument getDocument() { | |||
// Always null: no POIXMLDocument is handed out by this extractor;
// the underlying OPCPackage is available through getFilesystem() instead.
return null; | |||
} | |||
/**
 * Stores the flag that {@link #isCloseFilesystem()} reports.
 * NOTE(review): presumably the close logic (not visible here) consults this flag to decide
 * whether the underlying package is closed together with the extractor - confirm.
 *
 * @param doCloseFilesystem the new value of the flag; the field is initialised to {@code true}
 */
@Override | |||
public void setCloseFilesystem(boolean doCloseFilesystem) { | |||
this.doCloseFilesystem = doCloseFilesystem; | |||
} | |||
/**
 * @return the current value of the close-filesystem flag, i.e. {@code true} unless it was
 *         changed through {@link #setCloseFilesystem(boolean)}
 */
@Override | |||
public boolean isCloseFilesystem() { | |||
return doCloseFilesystem; | |||
} | |||
/**
 * @return the {@link OPCPackage} held in the {@code container} field, which is assigned
 *         from the constructor argument
 */
@Override | |||
public OPCPackage getFilesystem() { | |||
return container; | |||
} | |||
protected class SheetTextExtractor implements SheetContentsHandler { |
@@ -20,8 +20,8 @@ import java.io.IOException; | |||
import java.util.Iterator; | |||
import java.util.Locale; | |||
import org.apache.poi.ooxml.extractor.POIXMLTextExtractor; | |||
import org.apache.poi.hssf.extractor.ExcelExtractor; | |||
import org.apache.poi.ooxml.extractor.POIXMLTextExtractor; | |||
import org.apache.poi.openxml4j.exceptions.OpenXML4JException; | |||
import org.apache.poi.openxml4j.opc.OPCPackage; | |||
import org.apache.poi.ss.usermodel.Cell; | |||
@@ -44,8 +44,8 @@ import org.apache.xmlbeans.XmlException; | |||
/** | |||
* Helper class to extract text from an OOXML Excel file | |||
*/ | |||
public class XSSFExcelExtractor extends POIXMLTextExtractor | |||
implements org.apache.poi.ss.extractor.ExcelExtractor { | |||
public class XSSFExcelExtractor | |||
implements POIXMLTextExtractor, org.apache.poi.ss.extractor.ExcelExtractor { | |||
public static final XSSFRelation[] SUPPORTED_TYPES = new XSSFRelation[] { | |||
XSSFRelation.WORKBOOK, XSSFRelation.MACRO_TEMPLATE_WORKBOOK, | |||
XSSFRelation.MACRO_ADDIN_WORKBOOK, XSSFRelation.TEMPLATE_WORKBOOK, | |||
@@ -53,34 +53,21 @@ public class XSSFExcelExtractor extends POIXMLTextExtractor | |||
}; | |||
private Locale locale; | |||
private XSSFWorkbook workbook; | |||
private final XSSFWorkbook workbook; | |||
private boolean includeSheetNames = true; | |||
private boolean formulasNotResults; | |||
private boolean includeCellComments; | |||
private boolean includeHeadersFooters = true; | |||
private boolean includeTextBoxes = true; | |||
private boolean doCloseFilesystem = true; | |||
public XSSFExcelExtractor(OPCPackage container) throws XmlException, OpenXML4JException, IOException { | |||
this(new XSSFWorkbook(container)); | |||
} | |||
public XSSFExcelExtractor(XSSFWorkbook workbook) { | |||
super(workbook); | |||
this.workbook = workbook; | |||
} | |||
public static void main(String[] args) throws Exception { | |||
if(args.length < 1) { | |||
System.err.println("Use:"); | |||
System.err.println(" XSSFExcelExtractor <filename.xlsx>"); | |||
System.exit(1); | |||
} | |||
try (OPCPackage pkg = OPCPackage.create(args[0]); | |||
POIXMLTextExtractor extractor = new XSSFExcelExtractor(pkg)) { | |||
System.out.println(extractor.getText()); | |||
} | |||
} | |||
/** | |||
* Should sheet names be included? Default is true | |||
*/ | |||
@@ -194,7 +181,7 @@ public class XSSFExcelExtractor extends POIXMLTextExtractor | |||
} | |||
text.append("\n"); | |||
} | |||
// add textboxes | |||
if (includeTextBoxes){ | |||
XSSFDrawing drawing = sheet.getDrawingPatriarch(); | |||
@@ -262,4 +249,24 @@ public class XSSFExcelExtractor extends POIXMLTextExtractor | |||
private String extractHeaderFooter(HeaderFooter hf) { | |||
return ExcelExtractor._extractHeaderFooter(hf); | |||
} | |||
/**
 * @return the {@link XSSFWorkbook} held in the {@code workbook} field, which is assigned
 *         from the constructor argument
 */
@Override | |||
public XSSFWorkbook getDocument() { | |||
return workbook; | |||
} | |||
/**
 * Stores the flag that {@link #isCloseFilesystem()} reports.
 * NOTE(review): presumably the close logic (not visible here) consults this flag to decide
 * whether the workbook is closed together with the extractor - confirm.
 *
 * @param doCloseFilesystem the new value of the flag; the field is initialised to {@code true}
 */
@Override | |||
public void setCloseFilesystem(boolean doCloseFilesystem) { | |||
this.doCloseFilesystem = doCloseFilesystem; | |||
} | |||
/**
 * @return the current value of the close-filesystem flag, i.e. {@code true} unless it was
 *         changed through {@link #setCloseFilesystem(boolean)}
 */
@Override | |||
public boolean isCloseFilesystem() { | |||
return doCloseFilesystem; | |||
} | |||
/**
 * Returns the same {@code workbook} object that {@link #getDocument()} returns;
 * for this extractor the workbook doubles as the "filesystem".
 *
 * @return the {@link XSSFWorkbook} held in the {@code workbook} field
 */
@Override | |||
public XSSFWorkbook getFilesystem() { | |||
return workbook; | |||
} | |||
} |
@@ -19,9 +19,7 @@ package org.apache.poi.xwpf.extractor; | |||
import java.io.IOException; | |||
import java.util.List; | |||
import org.apache.poi.ooxml.POIXMLDocument; | |||
import org.apache.poi.ooxml.extractor.POIXMLTextExtractor; | |||
import org.apache.poi.openxml4j.exceptions.OpenXML4JException; | |||
import org.apache.poi.openxml4j.opc.OPCPackage; | |||
import org.apache.poi.xwpf.model.XWPFCommentsDecorator; | |||
import org.apache.poi.xwpf.model.XWPFHeaderFooterPolicy; | |||
@@ -39,46 +37,31 @@ import org.apache.poi.xwpf.usermodel.XWPFSDTCell; | |||
import org.apache.poi.xwpf.usermodel.XWPFTable; | |||
import org.apache.poi.xwpf.usermodel.XWPFTableCell; | |||
import org.apache.poi.xwpf.usermodel.XWPFTableRow; | |||
import org.apache.xmlbeans.XmlException; | |||
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSectPr; | |||
/** | |||
* Helper class to extract text from an OOXML Word file | |||
*/ | |||
public class XWPFWordExtractor extends POIXMLTextExtractor { | |||
public class XWPFWordExtractor implements POIXMLTextExtractor { | |||
public static final XWPFRelation[] SUPPORTED_TYPES = { | |||
XWPFRelation.DOCUMENT, XWPFRelation.TEMPLATE, | |||
XWPFRelation.MACRO_DOCUMENT, | |||
XWPFRelation.MACRO_TEMPLATE_DOCUMENT | |||
}; | |||
private XWPFDocument document; | |||
private final XWPFDocument document; | |||
private boolean fetchHyperlinks; | |||
private boolean concatenatePhoneticRuns = true; | |||
private boolean doCloseFilesystem = true; | |||
public XWPFWordExtractor(OPCPackage container) throws XmlException, OpenXML4JException, IOException { | |||
public XWPFWordExtractor(OPCPackage container) throws IOException { | |||
this(new XWPFDocument(container)); | |||
} | |||
public XWPFWordExtractor(XWPFDocument document) { | |||
super(document); | |||
this.document = document; | |||
} | |||
public static void main(String[] args) throws Exception { | |||
if (args.length < 1) { | |||
System.err.println("Use:"); | |||
System.err.println(" XWPFWordExtractor <filename.docx>"); | |||
System.exit(1); | |||
} | |||
POIXMLTextExtractor extractor = | |||
new XWPFWordExtractor(POIXMLDocument.openPackage( | |||
args[0] | |||
)); | |||
System.out.println(extractor.getText()); | |||
extractor.close(); | |||
} | |||
/** | |||
* Should we also fetch the hyperlinks, when fetching | |||
* the text content? Default is to only output the | |||
@@ -217,4 +200,24 @@ public class XWPFWordExtractor extends POIXMLTextExtractor { | |||
text.append(hfPolicy.getDefaultHeader().getText()); | |||
} | |||
} | |||
/**
 * @return the {@link XWPFDocument} held in the {@code document} field, which is assigned
 *         from the constructor argument
 */
@Override | |||
public XWPFDocument getDocument() { | |||
return document; | |||
} | |||
/**
 * Stores the flag that {@link #isCloseFilesystem()} reports.
 * NOTE(review): presumably the close logic (not visible here) consults this flag to decide
 * whether the document is closed together with the extractor - confirm.
 *
 * @param doCloseFilesystem the new value of the flag; the field is initialised to {@code true}
 */
@Override | |||
public void setCloseFilesystem(boolean doCloseFilesystem) { | |||
this.doCloseFilesystem = doCloseFilesystem; | |||
} | |||
/**
 * @return the current value of the close-filesystem flag, i.e. {@code true} unless it was
 *         changed through {@link #setCloseFilesystem(boolean)}
 */
@Override | |||
public boolean isCloseFilesystem() { | |||
return doCloseFilesystem; | |||
} | |||
/**
 * Returns the same {@code document} object that {@link #getDocument()} returns;
 * for this extractor the document doubles as the "filesystem".
 *
 * @return the {@link XWPFDocument} held in the {@code document} field
 */
@Override | |||
public XWPFDocument getFilesystem() { | |||
return document; | |||
} | |||
} |
@@ -31,23 +31,25 @@ import java.util.Locale; | |||
import org.apache.poi.POIDataSamples; | |||
import org.apache.poi.UnsupportedFileFormatException; | |||
import org.apache.poi.extractor.ExtractorFactory; | |||
import org.apache.poi.extractor.POIOLE2TextExtractor; | |||
import org.apache.poi.extractor.POITextExtractor; | |||
import org.apache.poi.hssf.HSSFTestDataSamples; | |||
import org.apache.poi.hssf.OldExcelFormatException; | |||
import org.apache.poi.hssf.extractor.EventBasedExcelExtractor; | |||
import org.apache.poi.hssf.extractor.ExcelExtractor; | |||
import org.apache.poi.ooxml.extractor.ExtractorFactory; | |||
import org.apache.poi.ooxml.extractor.POIXMLTextExtractor; | |||
import org.apache.poi.ooxml.extractor.POIXMLExtractorFactory; | |||
import org.apache.poi.openxml4j.exceptions.OpenXML4JException; | |||
import org.apache.poi.openxml4j.opc.OPCPackage; | |||
import org.apache.poi.openxml4j.opc.PackageAccess; | |||
import org.apache.poi.poifs.filesystem.FileMagic; | |||
import org.apache.poi.poifs.filesystem.NotOLE2FileException; | |||
import org.apache.poi.poifs.filesystem.POIFSFileSystem; | |||
import org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor; | |||
import org.apache.poi.xssf.extractor.XSSFExcelExtractor; | |||
import org.apache.xmlbeans.XmlException; | |||
import org.junit.Rule; | |||
import org.junit.Test; | |||
import org.junit.rules.ExpectedException; | |||
/** | |||
* Test that the extractor factory plays nicely | |||
@@ -89,6 +91,8 @@ public class TestExtractorFactory { | |||
private static POIDataSamples pubTests = POIDataSamples.getPublisherInstance(); | |||
private static File pub = getFileAndCheck(pubTests, "Simple.pub"); | |||
private static final POIXMLExtractorFactory xmlFactory = new POIXMLExtractorFactory(); | |||
private static File getFileAndCheck(POIDataSamples samples, String name) { | |||
File file = samples.getFile(name); | |||
@@ -110,7 +114,7 @@ public class TestExtractorFactory { | |||
"Word 6", doc6, "Word6Extractor", 20, | |||
"Word 95", doc95, "Word6Extractor", 120, | |||
"PowerPoint", ppt, "SlideShowExtractor", 120, | |||
"PowerPoint - pptx", pptx, "SlideShowExtractor", 120, | |||
"PowerPoint - pptx", pptx, "XSLFExtractor", 120, | |||
"Visio", vsd, "VisioTextExtractor", 50, | |||
"Visio - vsdx", vsdx, "XDGFVisioExtractor", 20, | |||
"Publisher", pub, "PublisherTextExtractor", 50, | |||
@@ -125,6 +129,8 @@ public class TestExtractorFactory { | |||
R apply(T t) throws IOException, OpenXML4JException, XmlException; | |||
} | |||
@Rule | |||
public ExpectedException thrown = ExpectedException.none(); | |||
@Test | |||
public void testFile() throws Exception { | |||
@@ -135,12 +141,12 @@ public class TestExtractorFactory { | |||
} | |||
} | |||
@Test(expected = IllegalArgumentException.class) | |||
@Test | |||
public void testFileInvalid() throws Exception { | |||
thrown.expectMessage("Can't create extractor - unsupported file type: UNKNOWN"); | |||
thrown.expect(IOException.class); | |||
// Text | |||
try (POITextExtractor ignored = ExtractorFactory.createExtractor(txt)) { | |||
fail("extracting from invalid package"); | |||
} | |||
ExtractorFactory.createExtractor(txt); | |||
} | |||
@Test | |||
@@ -148,8 +154,10 @@ public class TestExtractorFactory { | |||
testStream(ExtractorFactory::createExtractor, true); | |||
} | |||
@Test(expected = IllegalArgumentException.class) | |||
@Test | |||
public void testInputStreamInvalid() throws Exception { | |||
thrown.expectMessage("Can't create extractor - unsupported file type: UNKNOWN"); | |||
thrown.expect(IOException.class); | |||
testInvalid(ExtractorFactory::createExtractor); | |||
} | |||
@@ -158,8 +166,10 @@ public class TestExtractorFactory { | |||
testStream((f) -> ExtractorFactory.createExtractor(new POIFSFileSystem(f)), false); | |||
} | |||
@Test(expected = IOException.class) | |||
@Test | |||
public void testPOIFSInvalid() throws Exception { | |||
thrown.expectMessage("Invalid header signature; read 0x3D20726F68747541, expected 0xE11AB1A1E011CFD0"); | |||
thrown.expect(NotOLE2FileException.class); | |||
testInvalid((f) -> ExtractorFactory.createExtractor(new POIFSFileSystem(f))); | |||
} | |||
@@ -195,9 +205,7 @@ public class TestExtractorFactory { | |||
POITextExtractor ignored = poifs.apply(fis)) { | |||
fail("extracting from invalid package"); | |||
} catch (IllegalArgumentException e) { | |||
assertTrue("Had: " + e, | |||
e.getMessage().contains(FileMagic.UNKNOWN.name())); | |||
assertTrue("Had: " + e, e.getMessage().contains(FileMagic.UNKNOWN.name())); | |||
throw e; | |||
} | |||
} | |||
@@ -211,7 +219,7 @@ public class TestExtractorFactory { | |||
} | |||
try (final OPCPackage pkg = OPCPackage.open(testFile, PackageAccess.READ); | |||
final POITextExtractor ext = ExtractorFactory.createExtractor(pkg)) { | |||
final POITextExtractor ext = xmlFactory.create(pkg)) { | |||
testExtractor(ext, (String) TEST_SET[i], (String) TEST_SET[i + 2], (Integer) TEST_SET[i + 3]); | |||
pkg.revert(); | |||
} | |||
@@ -222,7 +230,7 @@ public class TestExtractorFactory { | |||
public void testPackageInvalid() throws Exception { | |||
// Text | |||
try (final OPCPackage pkg = OPCPackage.open(txt, PackageAccess.READ); | |||
final POITextExtractor ignored = ExtractorFactory.createExtractor(pkg)) { | |||
final POITextExtractor ignored = xmlFactory.create(pkg)) { | |||
fail("extracting from invalid package"); | |||
} | |||
} | |||
@@ -251,61 +259,45 @@ public class TestExtractorFactory { | |||
assertTrue(ExtractorFactory.getThreadPrefersEventExtractors()); | |||
assertNull(ExtractorFactory.getAllThreadsPreferEventExtractors()); | |||
try { | |||
// Check we get the right extractors now | |||
try (POITextExtractor extractor = ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls)))) { | |||
assertTrue(extractor instanceof EventBasedExcelExtractor); | |||
} | |||
try (POITextExtractor extractor = ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls)))) { | |||
assertTrue(extractor.getText().length() > 200); | |||
} | |||
try (POITextExtractor extractor = xmlFactory.create(OPCPackage.open(xlsx.toString(), PackageAccess.READ))) { | |||
assertTrue(extractor instanceof XSSFEventBasedExcelExtractor); | |||
} | |||
try (POITextExtractor extractor = xmlFactory.create(OPCPackage.open(xlsx.toString(), PackageAccess.READ))) { | |||
assertTrue(extractor.getText().length() > 200); | |||
} | |||
} finally { | |||
// Put back to normal | |||
ExtractorFactory.setThreadPrefersEventExtractors(false); | |||
} | |||
// Check we get the right extractors now | |||
POITextExtractor extractor = ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls))); | |||
assertTrue( | |||
extractor | |||
instanceof EventBasedExcelExtractor | |||
); | |||
extractor.close(); | |||
extractor = ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls))); | |||
assertTrue( | |||
extractor.getText().length() > 200 | |||
); | |||
extractor.close(); | |||
extractor = ExtractorFactory.createExtractor(OPCPackage.open(xlsx.toString(), PackageAccess.READ)); | |||
assertTrue(extractor instanceof XSSFEventBasedExcelExtractor); | |||
extractor.close(); | |||
extractor = ExtractorFactory.createExtractor(OPCPackage.open(xlsx.toString(), PackageAccess.READ)); | |||
assertTrue( | |||
extractor.getText().length() > 200 | |||
); | |||
extractor.close(); | |||
// Put back to normal | |||
ExtractorFactory.setThreadPrefersEventExtractors(false); | |||
assertFalse(ExtractorFactory.getPreferEventExtractor()); | |||
assertFalse(ExtractorFactory.getThreadPrefersEventExtractors()); | |||
assertNull(ExtractorFactory.getAllThreadsPreferEventExtractors()); | |||
// And back | |||
extractor = ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls))); | |||
assertTrue( | |||
extractor | |||
instanceof ExcelExtractor | |||
); | |||
extractor.close(); | |||
extractor = ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls))); | |||
assertTrue( | |||
extractor.getText().length() > 200 | |||
); | |||
extractor.close(); | |||
extractor = ExtractorFactory.createExtractor(OPCPackage.open(xlsx.toString(), PackageAccess.READ)); | |||
assertTrue( | |||
extractor | |||
instanceof XSSFExcelExtractor | |||
); | |||
extractor.close(); | |||
extractor = ExtractorFactory.createExtractor(OPCPackage.open(xlsx.toString())); | |||
assertTrue( | |||
extractor.getText().length() > 200 | |||
); | |||
extractor.close(); | |||
try (POITextExtractor extractor = ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls)))) { | |||
assertTrue(extractor instanceof ExcelExtractor); | |||
} | |||
try (POITextExtractor extractor = ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls)))) { | |||
assertTrue(extractor.getText().length() > 200); | |||
} | |||
try (POITextExtractor extractor = xmlFactory.create(OPCPackage.open(xlsx.toString(), PackageAccess.READ))) { | |||
assertTrue(extractor instanceof XSSFExcelExtractor); | |||
} | |||
try (POITextExtractor extractor = xmlFactory.create(OPCPackage.open(xlsx.toString()))) { | |||
assertTrue(extractor.getText().length() > 200); | |||
} | |||
} | |||
/** | |||
@@ -325,7 +317,7 @@ public class TestExtractorFactory { | |||
}; | |||
for (int i=0; i<testObj.length; i+=3) { | |||
try (final POIOLE2TextExtractor ext = ExtractorFactory.createExtractor((File)testObj[i+1])) { | |||
try (final POIOLE2TextExtractor ext = (POIOLE2TextExtractor)ExtractorFactory.createExtractor((File)testObj[i+1])) { | |||
final POITextExtractor[] embeds = ExtractorFactory.getEmbeddedDocsTextExtractors(ext); | |||
int numWord = 0, numXls = 0, numPpt = 0, numMsg = 0, numWordX = 0; | |||
@@ -443,13 +435,13 @@ public class TestExtractorFactory { | |||
"spreadsheet/WithChartSheet.xlsx", | |||
"spreadsheet/chart_sheet.xlsx", | |||
}; | |||
@Test | |||
public void testFileLeak() { | |||
// run a number of files that might fail in order to catch | |||
// run a number of files that might fail in order to catch | |||
// leaked file resources when using file-leak-detector while | |||
// running the test | |||
for(String file : EXPECTED_FAILURES) { | |||
try { | |||
ExtractorFactory.createExtractor(POIDataSamples.getSpreadSheetInstance().getFile(file)); | |||
@@ -458,21 +450,22 @@ public class TestExtractorFactory { | |||
} | |||
} | |||
} | |||
/** | |||
* #59074 - Excel 95 files should give a helpful message, not just | |||
* #59074 - Excel 95 files should give a helpful message, not just | |||
* "No supported documents found in the OLE2 stream" | |||
*/ | |||
@Test(expected = OldExcelFormatException.class) | |||
public void bug59074() throws Exception { | |||
ExtractorFactory.createExtractor( | |||
POIDataSamples.getSpreadSheetInstance().getFile("59074.xls")); | |||
try (POITextExtractor extractor = ExtractorFactory.createExtractor(POIDataSamples.getSpreadSheetInstance().getFile("59074.xls"))) { | |||
String text = extractor.getText(); | |||
assertContains(text, "testdoc"); | |||
} | |||
} | |||
@Test(expected = IllegalStateException.class) | |||
public void testGetEmbeddedFromXMLExtractor() { | |||
public void testGetEmbeddedFromXMLExtractor() throws IOException { | |||
// currently not implemented | |||
ExtractorFactory.getEmbeddedDocsTextExtractors((POIXMLTextExtractor)null); | |||
ExtractorFactory.getEmbeddedDocsTextExtractors(null); | |||
} | |||
// This bug is currently open. This test will fail with "expected error not thrown" when the bug has been fixed. |
@@ -60,9 +60,9 @@ import org.apache.poi.EncryptedDocumentException; | |||
import org.apache.poi.POIDataSamples; | |||
import org.apache.poi.POITestCase; | |||
import org.apache.poi.UnsupportedFileFormatException; | |||
import org.apache.poi.extractor.ExtractorFactory; | |||
import org.apache.poi.extractor.POITextExtractor; | |||
import org.apache.poi.ooxml.POIXMLException; | |||
import org.apache.poi.ooxml.extractor.ExtractorFactory; | |||
import org.apache.poi.ooxml.util.DocumentHelper; | |||
import org.apache.poi.openxml4j.exceptions.InvalidFormatException; | |||
import org.apache.poi.openxml4j.exceptions.InvalidOperationException; | |||
@@ -836,7 +836,7 @@ public final class TestPackage { | |||
@Test | |||
public void testZipEntityExpansionExceedsMemory() throws IOException, OpenXML4JException, XmlException { | |||
expectedEx.expect(POIXMLException.class); | |||
expectedEx.expect(IOException.class); | |||
expectedEx.expectMessage("unable to parse shared strings table"); | |||
expectedEx.expectCause(getCauseMatcher(SAXParseException.class, "The parser has encountered more than")); | |||
openXmlBombFile("poc-xmlbomb.xlsx"); | |||
@@ -844,7 +844,7 @@ public final class TestPackage { | |||
@Test | |||
public void testZipEntityExpansionExceedsMemory2() throws IOException, OpenXML4JException, XmlException { | |||
expectedEx.expect(POIXMLException.class); | |||
expectedEx.expect(IOException.class); | |||
expectedEx.expectMessage("unable to parse shared strings table"); | |||
expectedEx.expectCause(getCauseMatcher(SAXParseException.class, "The parser has encountered more than")); | |||
openXmlBombFile("poc-xmlbomb-empty.xlsx"); |
@@ -35,14 +35,12 @@ import java.util.Collection; | |||
import org.apache.poi.POIDataSamples; | |||
import org.apache.poi.POIDocument; | |||
import org.apache.poi.extractor.ExtractorFactory; | |||
import org.apache.poi.extractor.POITextExtractor; | |||
import org.apache.poi.hssf.record.crypto.Biff8EncryptionKey; | |||
import org.apache.poi.ooxml.extractor.ExtractorFactory; | |||
import org.apache.poi.openxml4j.exceptions.OpenXML4JException; | |||
import org.apache.poi.poifs.crypt.EncryptionInfo; | |||
import org.apache.poi.poifs.crypt.cryptoapi.CryptoAPIEncryptionHeader; | |||
import org.apache.poi.poifs.storage.RawDataUtil; | |||
import org.apache.xmlbeans.XmlException; | |||
import org.junit.Test; | |||
import org.junit.runner.RunWith; | |||
import org.junit.runners.Parameterized; | |||
@@ -91,7 +89,7 @@ public class TestHxxFEncryption { | |||
} | |||
@Test | |||
public void extract() throws IOException, OpenXML4JException, XmlException { | |||
public void extract() throws IOException { | |||
File f = sampleDir.getFile(file); | |||
Biff8EncryptionKey.setCurrentUserPassword(password); | |||
try (POITextExtractor te = ExtractorFactory.createExtractor(f)) { | |||
@@ -103,16 +101,16 @@ public class TestHxxFEncryption { | |||
} | |||
@Test | |||
public void changePassword() throws IOException, OpenXML4JException, XmlException { | |||
public void changePassword() throws IOException { | |||
newPassword("test"); | |||
} | |||
@Test | |||
public void removePassword() throws IOException, OpenXML4JException, XmlException { | |||
public void removePassword() throws IOException { | |||
newPassword(null); | |||
} | |||
private void newPassword(String newPass) throws IOException, OpenXML4JException, XmlException { | |||
private void newPassword(String newPass) throws IOException { | |||
File f = sampleDir.getFile(file); | |||
Biff8EncryptionKey.setCurrentUserPassword(password); | |||
try (POITextExtractor te1 = ExtractorFactory.createExtractor(f)) { | |||
@@ -133,7 +131,7 @@ public class TestHxxFEncryption { | |||
/** changing the encryption mode and key size in poor man's style - see comments below */ | |||
@Test | |||
public void changeEncryption() throws IOException, OpenXML4JException, XmlException { | |||
public void changeEncryption() throws IOException { | |||
File f = sampleDir.getFile(file); | |||
ByteArrayOutputStream bos = new ByteArrayOutputStream(); | |||
Biff8EncryptionKey.setCurrentUserPassword(password); | |||
@@ -157,7 +155,7 @@ public class TestHxxFEncryption { | |||
POIDocument doc = (POIDocument) te3.getDocument()) { | |||
// need to cache data (i.e. read all data) before changing the key size | |||
Class<?> clazz = doc.getClass(); | |||
if ("HSLFSlideShowImpl".equals(clazz.getSimpleName())) { | |||
if ("HSLFSlideShow".equals(clazz.getSimpleName())) { | |||
try { | |||
clazz.getDeclaredMethod("getPictureData").invoke(doc); | |||
} catch (ReflectiveOperationException e) { |
@@ -522,7 +522,7 @@ public class TestXSLFBugs { | |||
private String getSlideText(XMLSlideShow ppt, XSLFSlide slide) throws IOException { | |||
try (SlideShowExtractor<XSLFShape,XSLFTextParagraph> extr = new SlideShowExtractor<>(ppt)) { | |||
// do not auto-close the slideshow | |||
extr.setFilesystem(null); | |||
extr.setCloseFilesystem(false); | |||
extr.setSlidesByDefault(true); | |||
extr.setNotesByDefault(false); | |||
extr.setMasterByDefault(false); |
@@ -29,20 +29,18 @@ import java.io.IOException; | |||
import java.io.InputStream; | |||
import org.apache.poi.POIDataSamples; | |||
import org.apache.poi.ooxml.extractor.ExtractorFactory; | |||
import org.apache.poi.openxml4j.exceptions.OpenXML4JException; | |||
import org.apache.poi.extractor.ExtractorFactory; | |||
import org.apache.poi.sl.extractor.SlideShowExtractor; | |||
import org.apache.poi.xslf.usermodel.XMLSlideShow; | |||
import org.apache.poi.xslf.usermodel.XSLFShape; | |||
import org.apache.poi.xslf.usermodel.XSLFTextParagraph; | |||
import org.apache.xmlbeans.XmlException; | |||
import org.junit.Test; | |||
/** | |||
* Tests for XSLFPowerPointExtractor | |||
*/ | |||
public class TestXSLFPowerPointExtractor { | |||
private static POIDataSamples slTests = POIDataSamples.getSlideShowInstance(); | |||
private static final POIDataSamples slTests = POIDataSamples.getSlideShowInstance(); | |||
/** | |||
* Get text out of the simple file | |||
@@ -262,10 +260,11 @@ public class TestXSLFPowerPointExtractor { | |||
} | |||
@Test | |||
public void test45541() throws IOException, OpenXML4JException, XmlException { | |||
public void test45541() throws IOException { | |||
// extract text from a powerpoint that has a header in the notes-element | |||
final File headerFile = slTests.getFile("45541_Header.pptx"); | |||
try (final SlideShowExtractor extr = ExtractorFactory.createExtractor(headerFile)) { | |||
//noinspection rawtypes | |||
try (final SlideShowExtractor extr = (SlideShowExtractor) ExtractorFactory.createExtractor(headerFile)) { | |||
String text = extr.getText(); | |||
assertNotNull(text); | |||
assertFalse("Had: " + text, text.contains("testdoc")); | |||
@@ -280,7 +279,8 @@ public class TestXSLFPowerPointExtractor { | |||
// extract text from a powerpoint that has a footer in the master-slide | |||
final File footerFile = slTests.getFile("45541_Footer.pptx"); | |||
try (SlideShowExtractor extr = ExtractorFactory.createExtractor(footerFile)) { | |||
//noinspection rawtypes | |||
try (SlideShowExtractor extr = (SlideShowExtractor)ExtractorFactory.createExtractor(footerFile)) { | |||
String text = extr.getText(); | |||
assertNotContained(text, "testdoc"); | |||
@@ -16,7 +16,7 @@ | |||
==================================================================== */ | |||
package org.apache.poi.xssf.extractor; | |||
import org.apache.poi.ooxml.extractor.ExtractorFactory; | |||
import org.apache.poi.extractor.ExtractorFactory; | |||
import org.apache.poi.hssf.HSSFTestDataSamples; | |||
import org.junit.After; | |||
@@ -27,7 +27,7 @@ public class TestXSSFEventBasedExcelExtractorUsingFactory extends TestXSSFEventB | |||
ExtractorFactory.setAllThreadsPreferEventExtractors(true); | |||
return (XSSFEventBasedExcelExtractor) ExtractorFactory.createExtractor(HSSFTestDataSamples.openSampleFileStream(sampleName)); | |||
} | |||
@After | |||
public void tearDown() { | |||
// reset setting to not affect other tests |
@@ -17,8 +17,8 @@ | |||
package org.apache.poi.xssf.extractor; | |||
import org.apache.poi.extractor.ExtractorFactory; | |||
import org.apache.poi.hssf.HSSFTestDataSamples; | |||
import org.apache.poi.ooxml.extractor.ExtractorFactory; | |||
import org.junit.After; | |||
/** |
@@ -0,0 +1,18 @@ | |||
# ==================================================================== | |||
# Licensed to the Apache Software Foundation (ASF) under one or more | |||
# contributor license agreements. See the NOTICE file distributed with | |||
# this work for additional information regarding copyright ownership. | |||
# The ASF licenses this file to You under the Apache License, Version 2.0 | |||
# (the "License"); you may not use this file except in compliance with | |||
# the License. You may obtain a copy of the License at | |||
# | |||
# http://www.apache.org/licenses/LICENSE-2.0 | |||
# | |||
# Unless required by applicable law or agreed to in writing, software | |||
# distributed under the License is distributed on an "AS IS" BASIS, | |||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
# See the License for the specific language governing permissions and | |||
# limitations under the License. | |||
# ==================================================================== | |||
org.apache.poi.extractor.MainExtractorFactory |
@@ -0,0 +1,18 @@ | |||
# ==================================================================== | |||
# Licensed to the Apache Software Foundation (ASF) under one or more | |||
# contributor license agreements. See the NOTICE file distributed with | |||
# this work for additional information regarding copyright ownership. | |||
# The ASF licenses this file to You under the Apache License, Version 2.0 | |||
# (the "License"); you may not use this file except in compliance with | |||
# the License. You may obtain a copy of the License at | |||
# | |||
# http://www.apache.org/licenses/LICENSE-2.0 | |||
# | |||
# Unless required by applicable law or agreed to in writing, software | |||
# distributed under the License is distributed on an "AS IS" BASIS, | |||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
# See the License for the specific language governing permissions and | |||
# limitations under the License. | |||
# ==================================================================== | |||
org.apache.poi.ooxml.extractor.POIXMLExtractorFactory |
@@ -0,0 +1,18 @@ | |||
# ==================================================================== | |||
# Licensed to the Apache Software Foundation (ASF) under one or more | |||
# contributor license agreements. See the NOTICE file distributed with | |||
# this work for additional information regarding copyright ownership. | |||
# The ASF licenses this file to You under the Apache License, Version 2.0 | |||
# (the "License"); you may not use this file except in compliance with | |||
# the License. You may obtain a copy of the License at | |||
# | |||
# http://www.apache.org/licenses/LICENSE-2.0 | |||
# | |||
# Unless required by applicable law or agreed to in writing, software | |||
# distributed under the License is distributed on an "AS IS" BASIS, | |||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
# See the License for the specific language governing permissions and | |||
# limitations under the License. | |||
# ==================================================================== | |||
org.apache.poi.extractor.ole2.OLE2ScratchpadExtractorFactory |
@@ -17,44 +17,66 @@ | |||
package org.apache.poi.extractor.ole2; | |||
import java.io.ByteArrayInputStream; | |||
import java.io.File; | |||
import java.io.FileNotFoundException; | |||
import java.io.IOException; | |||
import java.io.InputStream; | |||
import java.util.Iterator; | |||
import java.util.List; | |||
import java.util.stream.StreamSupport; | |||
import org.apache.poi.extractor.ExtractorFactory; | |||
import org.apache.poi.extractor.ExtractorProvider; | |||
import org.apache.poi.extractor.POIOLE2TextExtractor; | |||
import org.apache.poi.extractor.POITextExtractor; | |||
import org.apache.poi.extractor.OLE2ExtractorFactory; | |||
import org.apache.poi.hdgf.extractor.VisioTextExtractor; | |||
import org.apache.poi.hpbf.extractor.PublisherTextExtractor; | |||
import org.apache.poi.hslf.usermodel.HSLFShape; | |||
import org.apache.poi.hslf.usermodel.HSLFSlideShow; | |||
import org.apache.poi.hslf.usermodel.HSLFTextParagraph; | |||
import org.apache.poi.hsmf.MAPIMessage; | |||
import org.apache.poi.hsmf.datatypes.AttachmentChunks; | |||
import org.apache.poi.hsmf.extractor.OutlookTextExtractor; | |||
import org.apache.poi.hssf.extractor.ExcelExtractor; | |||
import org.apache.poi.hssf.record.crypto.Biff8EncryptionKey; | |||
import org.apache.poi.hwpf.OldWordFileFormatException; | |||
import org.apache.poi.hwpf.extractor.Word6Extractor; | |||
import org.apache.poi.hwpf.extractor.WordExtractor; | |||
import org.apache.poi.poifs.filesystem.DirectoryEntry; | |||
import org.apache.poi.poifs.filesystem.DirectoryNode; | |||
import org.apache.poi.poifs.filesystem.Entry; | |||
import org.apache.poi.poifs.filesystem.FileMagic; | |||
import org.apache.poi.poifs.filesystem.POIFSFileSystem; | |||
import org.apache.poi.sl.extractor.SlideShowExtractor; | |||
import org.apache.poi.sl.usermodel.SlideShowFactory; | |||
import org.apache.poi.util.POILogFactory; | |||
import org.apache.poi.util.POILogger; | |||
/** | |||
* Scratchpad-specific logic for {@link OLE2ExtractorFactory} and | |||
* Scratchpad-specific logic for {@link ExtractorFactory} and | |||
* {@link org.apache.poi.extractor.ExtractorFactory}, which permit the other two to run with | |||
* no Scratchpad jar (though without functionality!) | |||
* <p>Note - should not be used standalone, always use via the other | |||
* two classes</p> | |||
*/ | |||
@SuppressWarnings("WeakerAccess") | |||
public class OLE2ScratchpadExtractorFactory { | |||
public class OLE2ScratchpadExtractorFactory implements ExtractorProvider { | |||
private static final POILogger logger = POILogFactory.getLogger(OLE2ScratchpadExtractorFactory.class); | |||
@Override | |||
public boolean accepts(FileMagic fm) { | |||
return FileMagic.OLE2 == fm; | |||
} | |||
@Override | |||
public POITextExtractor create(File file, String password) throws IOException { | |||
return create(new POIFSFileSystem(file, true).getRoot(), password); | |||
} | |||
@Override | |||
public POITextExtractor create(InputStream inputStream, String password) throws IOException { | |||
return create(new POIFSFileSystem(inputStream).getRoot(), password); | |||
} | |||
/** | |||
* Look for certain entries in the stream, to figure | |||
* out what format is desired | |||
@@ -66,48 +88,54 @@ public class OLE2ScratchpadExtractorFactory { | |||
* | |||
* @throws IOException when the format specific extraction fails because of invalid entries | |||
*/ | |||
public static POITextExtractor createExtractor(DirectoryNode poifsDir) throws IOException { | |||
if (poifsDir.hasEntry("WordDocument")) { | |||
// Old or new style word document? | |||
try { | |||
return new WordExtractor(poifsDir); | |||
} catch (OldWordFileFormatException e) { | |||
return new Word6Extractor(poifsDir); | |||
public POITextExtractor create(DirectoryNode poifsDir, String password) throws IOException { | |||
final String oldPW = Biff8EncryptionKey.getCurrentUserPassword(); | |||
try { | |||
Biff8EncryptionKey.setCurrentUserPassword(password); | |||
if (poifsDir.hasEntry("WordDocument")) { | |||
// Old or new style word document? | |||
try { | |||
return new WordExtractor(poifsDir); | |||
} catch (OldWordFileFormatException e) { | |||
return new Word6Extractor(poifsDir); | |||
} | |||
} | |||
} | |||
if (poifsDir.hasEntry(HSLFSlideShow.POWERPOINT_DOCUMENT)) { | |||
return new SlideShowExtractor(SlideShowFactory.create(poifsDir)); | |||
} | |||
if (poifsDir.hasEntry(HSLFSlideShow.POWERPOINT_DOCUMENT)) { | |||
return new SlideShowExtractor<HSLFShape, HSLFTextParagraph>(SlideShowFactory.create(poifsDir)); | |||
} | |||
if (poifsDir.hasEntry("VisioDocument")) { | |||
return new VisioTextExtractor(poifsDir); | |||
} | |||
if (poifsDir.hasEntry("VisioDocument")) { | |||
return new VisioTextExtractor(poifsDir); | |||
} | |||
if (poifsDir.hasEntry("Quill")) { | |||
return new PublisherTextExtractor(poifsDir); | |||
} | |||
if (poifsDir.hasEntry("Quill")) { | |||
return new PublisherTextExtractor(poifsDir); | |||
} | |||
final String[] outlookEntryNames = new String[] { | |||
// message bodies, saved as plain text (PtypString) | |||
// The first short (0x1000, 0x0047, 0x0037) refer to the Property ID (see [MS-OXPROPS].pdf) | |||
// the second short (0x001e, 0x001f, 0x0102) refer to the type of data stored in this entry | |||
// https://msdn.microsoft.com/en-us/library/cc433490(v=exchg.80).aspx | |||
// @see org.apache.poi.hsmf.Types.MAPIType | |||
"__substg1.0_1000001E", //PidTagBody ASCII | |||
"__substg1.0_1000001F", //PidTagBody Unicode | |||
"__substg1.0_0047001E", //PidTagMessageSubmissionId ASCII | |||
"__substg1.0_0047001F", //PidTagMessageSubmissionId Unicode | |||
"__substg1.0_0037001E", //PidTagSubject ASCII | |||
"__substg1.0_0037001F", //PidTagSubject Unicode | |||
}; | |||
for (String entryName : outlookEntryNames) { | |||
if (poifsDir.hasEntry(entryName)) { | |||
return new OutlookTextExtractor(poifsDir); | |||
final String[] outlookEntryNames = new String[]{ | |||
// message bodies, saved as plain text (PtypString) | |||
// The first short (0x1000, 0x0047, 0x0037) refer to the Property ID (see [MS-OXPROPS].pdf) | |||
// the second short (0x001e, 0x001f, 0x0102) refer to the type of data stored in this entry | |||
// https://msdn.microsoft.com/en-us/library/cc433490(v=exchg.80).aspx | |||
// @see org.apache.poi.hsmf.Types.MAPIType | |||
"__substg1.0_1000001E", //PidTagBody ASCII | |||
"__substg1.0_1000001F", //PidTagBody Unicode | |||
"__substg1.0_0047001E", //PidTagMessageSubmissionId ASCII | |||
"__substg1.0_0047001F", //PidTagMessageSubmissionId Unicode | |||
"__substg1.0_0037001E", //PidTagSubject ASCII | |||
"__substg1.0_0037001F", //PidTagSubject Unicode | |||
}; | |||
for (String entryName : outlookEntryNames) { | |||
if (poifsDir.hasEntry(entryName)) { | |||
return new OutlookTextExtractor(poifsDir); | |||
} | |||
} | |||
} finally { | |||
Biff8EncryptionKey.setCurrentUserPassword(oldPW); | |||
} | |||
throw new IllegalArgumentException("No supported documents found in the OLE2 stream"); | |||
return null; | |||
} | |||
/** | |||
@@ -120,10 +148,9 @@ public class OLE2ScratchpadExtractorFactory { | |||
* @param ext the extractor holding the directory to start parsing | |||
* @param dirs a list to be filled with directory references holding embedded | |||
* @param nonPOIFS a list to be filled with streams which aren't based on POIFS entries | |||
* | |||
* @throws IOException when the format specific extraction fails because of invalid entries | |||
*/ | |||
public static void identifyEmbeddedResources(POIOLE2TextExtractor ext, List<Entry> dirs, List<InputStream> nonPOIFS) throws IOException { | |||
@Override | |||
public void identifyEmbeddedResources(POIOLE2TextExtractor ext, List<Entry> dirs, List<InputStream> nonPOIFS) { | |||
// Find all the embedded directories | |||
DirectoryEntry root = ext.getRoot(); | |||
if (root == null) { | |||
@@ -132,25 +159,16 @@ public class OLE2ScratchpadExtractorFactory { | |||
if (ext instanceof ExcelExtractor) { | |||
// These are in MBD... under the root | |||
Iterator<Entry> it = root.getEntries(); | |||
while (it.hasNext()) { | |||
Entry entry = it.next(); | |||
if (entry.getName().startsWith("MBD")) { | |||
dirs.add(entry); | |||
} | |||
} | |||
StreamSupport.stream(root.spliterator(), false) | |||
.filter(entry -> entry.getName().startsWith("MBD")) | |||
.forEach(dirs::add); | |||
} else if (ext instanceof WordExtractor) { | |||
// These are in ObjectPool -> _... under the root | |||
try { | |||
DirectoryEntry op = (DirectoryEntry) | |||
root.getEntry("ObjectPool"); | |||
Iterator<Entry> it = op.getEntries(); | |||
while(it.hasNext()) { | |||
Entry entry = it.next(); | |||
if(entry.getName().startsWith("_")) { | |||
dirs.add(entry); | |||
} | |||
} | |||
DirectoryEntry op = (DirectoryEntry) root.getEntry("ObjectPool"); | |||
StreamSupport.stream(op.spliterator(), false) | |||
.filter(entry -> entry.getName().startsWith("_")) | |||
.forEach(dirs::add); | |||
} catch(FileNotFoundException e) { | |||
logger.log(POILogger.INFO, "Ignoring FileNotFoundException while extracting Word document", e.getLocalizedMessage()); | |||
// ignored here |
@@ -17,7 +17,6 @@ | |||
package org.apache.poi.hdgf.extractor; | |||
import java.io.FileInputStream; | |||
import java.io.IOException; | |||
import java.io.InputStream; | |||
import java.util.ArrayList; | |||
@@ -38,11 +37,11 @@ import org.apache.poi.poifs.filesystem.POIFSFileSystem; | |||
* Can operate on the command line (outputs to stdout), or | |||
* can return the text for you (example: for use with Lucene). | |||
*/ | |||
public final class VisioTextExtractor extends POIOLE2TextExtractor { | |||
public final class VisioTextExtractor implements POIOLE2TextExtractor { | |||
private HDGFDiagram hdgf; | |||
private boolean doCloseFilesystem = true; | |||
public VisioTextExtractor(HDGFDiagram hdgf) { | |||
super(hdgf); | |||
this.hdgf = hdgf; | |||
} | |||
public VisioTextExtractor(POIFSFileSystem fs) throws IOException { | |||
@@ -91,9 +90,7 @@ public final class VisioTextExtractor extends POIOLE2TextExtractor { | |||
// Capture the text, as long as it isn't | |||
// simply an empty string | |||
String str = cmd.getValue().toString(); | |||
if(str.isEmpty() || "\n".equals(str)) { | |||
// Ignore empty strings | |||
} else { | |||
if (!(str.isEmpty() || "\n".equals(str))) { | |||
text.add( str ); | |||
} | |||
} | |||
@@ -121,21 +118,23 @@ public final class VisioTextExtractor extends POIOLE2TextExtractor { | |||
return text.toString(); | |||
} | |||
public static void main(String[] args) throws Exception { | |||
if(args.length == 0) { | |||
System.err.println("Use:"); | |||
System.err.println(" VisioTextExtractor <file.vsd>"); | |||
System.exit(1); | |||
} | |||
@Override | |||
public HDGFDiagram getDocument() { | |||
return hdgf; | |||
} | |||
try (FileInputStream fis = new FileInputStream(args[0])) { | |||
VisioTextExtractor extractor = | |||
new VisioTextExtractor(fis); | |||
@Override | |||
public void setCloseFilesystem(boolean doCloseFilesystem) { | |||
this.doCloseFilesystem = doCloseFilesystem; | |||
} | |||
// Print not PrintLn as already has \n added to it | |||
System.out.print(extractor.getText()); | |||
@Override | |||
public boolean isCloseFilesystem() { | |||
return doCloseFilesystem; | |||
} | |||
extractor.close(); | |||
} | |||
@Override | |||
public HDGFDiagram getFilesystem() { | |||
return hdgf; | |||
} | |||
} |
@@ -17,35 +17,37 @@ | |||
package org.apache.poi.hpbf.extractor; | |||
import java.io.FileInputStream; | |||
import java.io.IOException; | |||
import java.io.InputStream; | |||
import org.apache.poi.extractor.POIOLE2TextExtractor; | |||
import org.apache.poi.hpbf.HPBFDocument; | |||
import org.apache.poi.hpbf.model.qcbits.QCBit; | |||
import org.apache.poi.hpbf.model.qcbits.QCTextBit; | |||
import org.apache.poi.hpbf.model.qcbits.QCPLCBit.Type12; | |||
import org.apache.poi.hpbf.model.qcbits.QCTextBit; | |||
import org.apache.poi.poifs.filesystem.DirectoryNode; | |||
import org.apache.poi.poifs.filesystem.POIFSFileSystem; | |||
/** | |||
* Extract text from HPBF Publisher files | |||
*/ | |||
public final class PublisherTextExtractor extends POIOLE2TextExtractor { | |||
private HPBFDocument doc; | |||
public final class PublisherTextExtractor implements POIOLE2TextExtractor { | |||
private final HPBFDocument doc; | |||
private boolean hyperlinksByDefault; | |||
private boolean doCloseFilesystem = true; | |||
public PublisherTextExtractor(HPBFDocument doc) { | |||
super(doc); | |||
this.doc = doc; | |||
} | |||
/**
 * Creates an extractor for the Publisher document that is parsed from the given POIFS directory.
 *
 * @param dir the directory the {@link HPBFDocument} is built from
 * @throws IOException if building the {@link HPBFDocument} fails
 */
public PublisherTextExtractor(final DirectoryNode dir) throws IOException {
    this(new HPBFDocument(dir));
}
/**
 * Creates an extractor for the Publisher document that is parsed from the given file system.
 *
 * @param fs the file system the {@link HPBFDocument} is built from
 * @throws IOException if building the {@link HPBFDocument} fails
 */
public PublisherTextExtractor(final POIFSFileSystem fs) throws IOException {
    this(new HPBFDocument(fs));
}
/**
 * Creates an extractor from a raw stream by first wrapping it in a {@link POIFSFileSystem}.
 *
 * @param is the stream holding the Publisher file
 * @throws IOException if the stream cannot be read as a POIFS file system or the document cannot be built
 */
public PublisherTextExtractor(final InputStream is) throws IOException {
    this(new POIFSFileSystem(is));
}
@@ -66,7 +68,7 @@ public final class PublisherTextExtractor extends POIOLE2TextExtractor { | |||
// Get the text from the Quill Contents | |||
QCBit[] bits = doc.getQuillContents().getBits(); | |||
for (QCBit bit1 : bits) { | |||
if (bit1 != null && bit1 instanceof QCTextBit) { | |||
if (bit1 instanceof QCTextBit) { | |||
QCTextBit t = (QCTextBit) bit1; | |||
text.append(t.getText().replace('\r', '\n')); | |||
} | |||
@@ -79,7 +81,7 @@ public final class PublisherTextExtractor extends POIOLE2TextExtractor { | |||
// how to tie that together. | |||
if(hyperlinksByDefault) { | |||
for (QCBit bit : bits) { | |||
if (bit != null && bit instanceof Type12) { | |||
if (bit instanceof Type12) { | |||
Type12 hyperlinks = (Type12) bit; | |||
for (int j = 0; j < hyperlinks.getNumberOfHyperlinks(); j++) { | |||
text.append("<"); | |||
@@ -96,19 +98,23 @@ public final class PublisherTextExtractor extends POIOLE2TextExtractor { | |||
return text.toString(); | |||
} | |||
@Override | |||
public HPBFDocument getDocument() { | |||
return doc; | |||
} | |||
@Override | |||
public void setCloseFilesystem(boolean doCloseFilesystem) { | |||
this.doCloseFilesystem = doCloseFilesystem; | |||
} | |||
public static void main(String[] args) throws Exception { | |||
if(args.length == 0) { | |||
System.err.println("Use:"); | |||
System.err.println(" PublisherTextExtractor <file.pub>"); | |||
} | |||
@Override | |||
public boolean isCloseFilesystem() { | |||
return doCloseFilesystem; | |||
} | |||
for (String arg : args) { | |||
try (FileInputStream fis = new FileInputStream(arg)) { | |||
PublisherTextExtractor te = new PublisherTextExtractor(fis); | |||
System.out.println(te.getText()); | |||
te.close(); | |||
} | |||
} | |||
@Override | |||
public HPBFDocument getFilesystem() { | |||
return doc; | |||
} | |||
} |
@@ -1,279 +0,0 @@ | |||
/* ==================================================================== | |||
Licensed to the Apache Software Foundation (ASF) under one or more | |||
contributor license agreements. See the NOTICE file distributed with | |||
this work for additional information regarding copyright ownership. | |||
The ASF licenses this file to You under the Apache License, Version 2.0 | |||
(the "License"); you may not use this file except in compliance with | |||
the License. You may obtain a copy of the License at | |||
http://www.apache.org/licenses/LICENSE-2.0 | |||
Unless required by applicable law or agreed to in writing, software | |||
distributed under the License is distributed on an "AS IS" BASIS, | |||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
See the License for the specific language governing permissions and | |||
limitations under the License. | |||
==================================================================== */ | |||
package org.apache.poi.hslf.extractor; | |||
import java.io.File; | |||
import java.io.IOException; | |||
import java.io.InputStream; | |||
import java.util.List; | |||
import org.apache.poi.EncryptedDocumentException; | |||
import org.apache.poi.extractor.POIOLE2TextExtractor; | |||
import org.apache.poi.hslf.usermodel.HSLFObjectShape; | |||
import org.apache.poi.hslf.usermodel.HSLFShape; | |||
import org.apache.poi.hslf.usermodel.HSLFSlideShow; | |||
import org.apache.poi.hslf.usermodel.HSLFSlideShowImpl; | |||
import org.apache.poi.hslf.usermodel.HSLFTextParagraph; | |||
import org.apache.poi.hssf.record.crypto.Biff8EncryptionKey; | |||
import org.apache.poi.poifs.filesystem.DirectoryNode; | |||
import org.apache.poi.poifs.filesystem.POIFSFileSystem; | |||
import org.apache.poi.sl.extractor.SlideShowExtractor; | |||
import org.apache.poi.sl.usermodel.SlideShow; | |||
import org.apache.poi.sl.usermodel.SlideShowFactory; | |||
import org.apache.poi.util.Removal; | |||
/** | |||
* This class can be used to extract text from a PowerPoint file. Can optionally | |||
* also get the notes from one. | |||
* | |||
* @deprecated in POI 4.0.0, use {@link SlideShowExtractor} instead | |||
*/ | |||
@SuppressWarnings("WeakerAccess") | |||
@Deprecated | |||
@Removal(version="5.0.0") | |||
public final class PowerPointExtractor extends POIOLE2TextExtractor { | |||
private final SlideShowExtractor<HSLFShape,HSLFTextParagraph> delegate; | |||
private boolean slidesByDefault = true; | |||
private boolean notesByDefault; | |||
private boolean commentsByDefault; | |||
private boolean masterByDefault; | |||
/** | |||
* Basic extractor. Returns all the text, and optionally all the notes | |||
*/ | |||
public static void main(String[] args) throws IOException { | |||
if (args.length < 1) { | |||
System.err.println("Usage:"); | |||
System.err.println("\tPowerPointExtractor [-notes] <file>"); | |||
System.exit(1); | |||
} | |||
boolean notes = false; | |||
boolean comments = false; | |||
boolean master = true; | |||
String file; | |||
if (args.length > 1) { | |||
notes = true; | |||
file = args[1]; | |||
if (args.length > 2) { | |||
comments = true; | |||
} | |||
} else { | |||
file = args[0]; | |||
} | |||
try (PowerPointExtractor ppe = new PowerPointExtractor(file)) { | |||
System.out.println(ppe.getText(true, notes, comments, master)); | |||
} | |||
} | |||
public PowerPointExtractor(final HSLFSlideShow slideShow) { | |||
super(slideShow.getSlideShowImpl()); | |||
setFilesystem(slideShow); | |||
delegate = new SlideShowExtractor<>(slideShow); | |||
} | |||
/** | |||
* Creates a PowerPointExtractor, from a file | |||
* | |||
* @param fileName The name of the file to extract from | |||
*/ | |||
public PowerPointExtractor(String fileName) throws IOException { | |||
this(createHSLF(new File(fileName), Biff8EncryptionKey.getCurrentUserPassword(), true)); | |||
} | |||
/** | |||
* Creates a PowerPointExtractor, from an Input Stream | |||
* | |||
* @param iStream The input stream containing the PowerPoint document | |||
*/ | |||
public PowerPointExtractor(InputStream iStream) throws IOException { | |||
this(createHSLF(iStream, Biff8EncryptionKey.getCurrentUserPassword())); | |||
} | |||
/** | |||
* Creates a PowerPointExtractor, from an open POIFSFileSystem | |||
* | |||
* @param fs the POIFSFileSystem containing the PowerPoint document | |||
*/ | |||
public PowerPointExtractor(POIFSFileSystem fs) throws IOException { | |||
this(createHSLF(fs, Biff8EncryptionKey.getCurrentUserPassword())); | |||
} | |||
/** | |||
* Creates a PowerPointExtractor, from a specific place | |||
* inside an open {@link POIFSFileSystem} | |||
* | |||
* @param dir the POIFS Directory containing the PowerPoint document | |||
*/ | |||
public PowerPointExtractor(DirectoryNode dir) throws IOException { | |||
this(new HSLFSlideShow(dir)); | |||
} | |||
/** | |||
* Creates a PowerPointExtractor, from a HSLFSlideShow | |||
* | |||
* @param ss the HSLFSlideShow to extract text from | |||
*/ | |||
public PowerPointExtractor(HSLFSlideShowImpl ss) { | |||
this(new HSLFSlideShow(ss)); | |||
} | |||
/** | |||
* Should a call to getText() return slide text? Default is yes | |||
*/ | |||
public void setSlidesByDefault(final boolean slidesByDefault) { | |||
this.slidesByDefault = slidesByDefault; | |||
delegate.setSlidesByDefault(slidesByDefault); | |||
} | |||
/** | |||
* Should a call to getText() return notes text? Default is no | |||
*/ | |||
public void setNotesByDefault(final boolean notesByDefault) { | |||
this.notesByDefault = notesByDefault; | |||
delegate.setNotesByDefault(notesByDefault); | |||
} | |||
/** | |||
* Should a call to getText() return comments text? Default is no | |||
*/ | |||
public void setCommentsByDefault(final boolean commentsByDefault) { | |||
this.commentsByDefault = commentsByDefault; | |||
delegate.setCommentsByDefault(commentsByDefault); | |||
} | |||
/** | |||
* Should a call to getText() return text from master? Default is no | |||
*/ | |||
public void setMasterByDefault(final boolean masterByDefault) { | |||
this.masterByDefault = masterByDefault; | |||
delegate.setMasterByDefault(masterByDefault); | |||
} | |||
/** | |||
* Fetches all the slide text from the slideshow, but not the notes, unless | |||
* you've called setSlidesByDefault() and setNotesByDefault() to change this | |||
*/ | |||
@Override | |||
public String getText() { | |||
return delegate.getText(); | |||
} | |||
/** | |||
* Fetches text from the slideshow, be it slide text or note text. Because | |||
* the final block of text in a TextRun normally have their last \n | |||
* stripped, we add it back | |||
* | |||
* @param getSlideText fetch slide text | |||
* @param getNoteText fetch note text | |||
*/ | |||
public String getText(boolean getSlideText, boolean getNoteText) { | |||
return getText(getSlideText,getNoteText,commentsByDefault,masterByDefault); | |||
} | |||
public String getText(boolean getSlideText, boolean getNoteText, boolean getCommentText, boolean getMasterText) { | |||
delegate.setSlidesByDefault(getSlideText); | |||
delegate.setNotesByDefault(getNoteText); | |||
delegate.setCommentsByDefault(getCommentText); | |||
delegate.setMasterByDefault(getMasterText); | |||
try { | |||
return delegate.getText(); | |||
} finally { | |||
delegate.setSlidesByDefault(slidesByDefault); | |||
delegate.setNotesByDefault(notesByDefault); | |||
delegate.setCommentsByDefault(commentsByDefault); | |||
delegate.setMasterByDefault(masterByDefault); | |||
} | |||
} | |||
/** | |||
* Fetches all the notes text from the slideshow, but not the slide text | |||
*/ | |||
public String getNotes() { | |||
return getText(false, true, false, false); | |||
} | |||
@SuppressWarnings("unchecked") | |||
public List<HSLFObjectShape> getOLEShapes() { | |||
return (List<HSLFObjectShape>)delegate.getOLEShapes(); | |||
} | |||
/** | |||
* Helper method to avoid problems with compiling code in Eclipse | |||
* | |||
* Eclipse javac has some bugs with complex casts, this method tries | |||
* to work around this. | |||
* | |||
* @param fs The {@link POIFSFileSystem} to read the document from | |||
* @param password The password that should be used or null if no password is necessary. | |||
* | |||
* @return The created SlideShow | |||
* | |||
* @throws IOException if an error occurs while reading the data | |||
*/ | |||
private static HSLFSlideShow createHSLF(POIFSFileSystem fs, String password) throws IOException, EncryptedDocumentException { | |||
// Note: don't change the code here, it is required for Eclipse to compile the code | |||
SlideShow slideShowOrig = SlideShowFactory.create(fs, password); | |||
return (HSLFSlideShow)slideShowOrig; | |||
} | |||
/** | |||
* Helper method to avoid problems with compiling code in Eclipse | |||
* | |||
* Eclipse javac has some bugs with complex casts, this method tries | |||
* to work around this. | |||
* | |||
* @param inp The {@link InputStream} to read data from. | |||
* @param password The password that should be used or null if no password is necessary. | |||
* | |||
* @return The created SlideShow | |||
* | |||
* @throws IOException if an error occurs while reading the data | |||
* @throws EncryptedDocumentException If the wrong password is given for a protected file | |||
*/ | |||
private static HSLFSlideShow createHSLF(InputStream inp, String password) throws IOException, EncryptedDocumentException { | |||
// Note: don't change the code here, it is required for Eclipse to compile the code | |||
SlideShow slideShowOrig = SlideShowFactory.create(inp, password); | |||
return (HSLFSlideShow)slideShowOrig; | |||
} | |||
/** | |||
* Helper method to avoid problems with compiling code in Eclipse | |||
* | |||
* Eclipse javac has some bugs with complex casts, this method tries | |||
* to work around this. | |||
* | |||
* @param file The file to read data from. | |||
* @param password The password that should be used or null if no password is necessary. | |||
* @param readOnly If the SlideShow should be opened in read-only mode to avoid writing back | |||
* changes when the document is closed. | |||
* | |||
* @return The created SlideShow | |||
* | |||
* @throws IOException if an error occurs while reading the data | |||
* @throws EncryptedDocumentException If the wrong password is given for a protected file | |||
*/ | |||
private static HSLFSlideShow createHSLF(File file, String password, boolean readOnly) throws IOException, EncryptedDocumentException { | |||
// Note: don't change the code here, it is required for Eclipse to compile the code | |||
SlideShow slideShowOrig = SlideShowFactory.create(file, password, readOnly); | |||
return (HSLFSlideShow)slideShowOrig; | |||
} | |||
} |
@@ -33,6 +33,7 @@ import java.util.List; | |||
import java.util.Map; | |||
import java.util.function.Supplier; | |||
import org.apache.poi.POIDocument; | |||
import org.apache.poi.common.usermodel.GenericRecord; | |||
import org.apache.poi.common.usermodel.fonts.FontInfo; | |||
import org.apache.poi.ddf.EscherBSERecord; | |||
@@ -40,6 +41,9 @@ import org.apache.poi.ddf.EscherContainerRecord; | |||
import org.apache.poi.ddf.EscherOptRecord; | |||
import org.apache.poi.hpsf.ClassID; | |||
import org.apache.poi.hpsf.ClassIDPredefined; | |||
import org.apache.poi.hpsf.DocumentSummaryInformation; | |||
import org.apache.poi.hpsf.PropertySet; | |||
import org.apache.poi.hpsf.SummaryInformation; | |||
import org.apache.poi.hpsf.extractor.HPSFPropertiesExtractor; | |||
import org.apache.poi.hslf.exceptions.CorruptPowerPointFileException; | |||
import org.apache.poi.hslf.exceptions.HSLFException; | |||
@@ -47,6 +51,7 @@ import org.apache.poi.hslf.model.HeadersFooters; | |||
import org.apache.poi.hslf.model.MovieShape; | |||
import org.apache.poi.hslf.record.*; | |||
import org.apache.poi.hslf.record.SlideListWithText.SlideAtomsSet; | |||
import org.apache.poi.poifs.crypt.EncryptionInfo; | |||
import org.apache.poi.poifs.filesystem.DirectoryNode; | |||
import org.apache.poi.poifs.filesystem.Ole10Native; | |||
import org.apache.poi.poifs.filesystem.POIFSFileSystem; | |||
@@ -66,7 +71,7 @@ import org.apache.poi.util.Units; | |||
* TODO: - figure out how to match notes to their correct sheet (will involve | |||
* understanding DocSlideList and DocNotesList) - handle Slide creation cleaner | |||
*/ | |||
public final class HSLFSlideShow implements SlideShow<HSLFShape,HSLFTextParagraph>, Closeable, GenericRecord { | |||
public final class HSLFSlideShow extends POIDocument implements SlideShow<HSLFShape,HSLFTextParagraph>, Closeable, GenericRecord { | |||
//arbitrarily selected; may need to increase | |||
private static final int MAX_RECORD_LENGTH = 10_000_000; | |||
@@ -111,6 +116,8 @@ public final class HSLFSlideShow implements SlideShow<HSLFShape,HSLFTextParagrap | |||
* @param hslfSlideShow the HSLFSlideShow to base on | |||
*/ | |||
public HSLFSlideShow(HSLFSlideShowImpl hslfSlideShow) { | |||
super(hslfSlideShow.getDirectory()); | |||
loadSavePhase.set(LoadSavePhase.INIT); | |||
// Get useful things from our base slideshow | |||
@@ -1080,7 +1087,7 @@ public final class HSLFSlideShow implements SlideShow<HSLFShape,HSLFTextParagrap | |||
public HPSFPropertiesExtractor getMetadataTextExtractor() { | |||
return new HPSFPropertiesExtractor(getSlideShowImpl()); | |||
} | |||
int addToObjListAtom(RecordContainer exObj) { | |||
ExObjList lst = getDocumentRecord().getExObjList(true); | |||
ExObjListAtom objAtom = lst.getExObjListAtom(); | |||
@@ -1097,7 +1104,7 @@ public final class HSLFSlideShow implements SlideShow<HSLFShape,HSLFTextParagrap | |||
Map<String,ClassID> olemap = new HashMap<>(); | |||
olemap.put(POWERPOINT_DOCUMENT, ClassIDPredefined.POWERPOINT_V8.getClassID()); | |||
// as per BIFF8 spec | |||
olemap.put("Workbook", ClassIDPredefined.EXCEL_V8.getClassID()); | |||
olemap.put("Workbook", ClassIDPredefined.EXCEL_V8.getClassID()); | |||
// Typically from third party programs | |||
olemap.put("WORKBOOK", ClassIDPredefined.EXCEL_V8.getClassID()); | |||
// Typically odd Crystal Reports exports | |||
@@ -1179,4 +1186,94 @@ public final class HSLFSlideShow implements SlideShow<HSLFShape,HSLFTextParagrap | |||
public List<? extends GenericRecord> getGenericChildren() { | |||
return Arrays.asList(_hslfSlideShow.getRecords()); | |||
} | |||
@Override | |||
public void write() throws IOException { | |||
getSlideShowImpl().write(); | |||
} | |||
@Override | |||
public void write(File newFile) throws IOException { | |||
getSlideShowImpl().write(newFile); | |||
} | |||
@Override | |||
public DocumentSummaryInformation getDocumentSummaryInformation() { | |||
return getSlideShowImpl().getDocumentSummaryInformation(); | |||
} | |||
@Override | |||
public SummaryInformation getSummaryInformation() { | |||
return getSlideShowImpl().getSummaryInformation(); | |||
} | |||
@Override | |||
public void createInformationProperties() { | |||
getSlideShowImpl().createInformationProperties(); | |||
} | |||
@Override | |||
public void readProperties() { | |||
getSlideShowImpl().readProperties(); | |||
} | |||
@Override | |||
protected PropertySet getPropertySet(String setName) throws IOException { | |||
return getSlideShowImpl().getPropertySetImpl(setName); | |||
} | |||
@Override | |||
protected PropertySet getPropertySet(String setName, EncryptionInfo encryptionInfo) throws IOException { | |||
return getSlideShowImpl().getPropertySetImpl(setName, encryptionInfo); | |||
} | |||
@Override | |||
protected void writeProperties() throws IOException { | |||
getSlideShowImpl().writePropertiesImpl(); | |||
} | |||
@Override | |||
public void writeProperties(POIFSFileSystem outFS) throws IOException { | |||
getSlideShowImpl().writeProperties(outFS); | |||
} | |||
@Override | |||
protected void writeProperties(POIFSFileSystem outFS, List<String> writtenEntries) throws IOException { | |||
getSlideShowImpl().writePropertiesImpl(outFS, writtenEntries); | |||
} | |||
@Override | |||
protected void validateInPlaceWritePossible() throws IllegalStateException { | |||
getSlideShowImpl().validateInPlaceWritePossibleImpl(); | |||
} | |||
@Override | |||
public DirectoryNode getDirectory() { | |||
return getSlideShowImpl().getDirectory(); | |||
} | |||
/**
 * Forwards to the delegate's {@code clearDirectoryImpl()} bridge method.
 */
@Override | |||
protected void clearDirectory() { | |||
getSlideShowImpl().clearDirectoryImpl(); | |||
} | |||
/**
 * Forwards to the delegate's {@code initDirectoryImpl()} bridge method.
 *
 * @return the boolean returned by the delegate
 */
@Override | |||
protected boolean initDirectory() { | |||
return getSlideShowImpl().initDirectoryImpl(); | |||
} | |||
/**
 * Forwards to the delegate's {@code replaceDirectoryImpl(newDirectory)} bridge method.
 *
 * @param newDirectory handed unchanged to the delegate
 */
@Override | |||
protected void replaceDirectory(DirectoryNode newDirectory) { | |||
getSlideShowImpl().replaceDirectoryImpl(newDirectory); | |||
} | |||
/**
 * Forwards to {@code getSlideShowImpl().getEncryptedPropertyStreamName()}.
 *
 * @return the stream name reported by the delegate
 */
@Override | |||
protected String getEncryptedPropertyStreamName() { | |||
return getSlideShowImpl().getEncryptedPropertyStreamName(); | |||
} | |||
/**
 * Forwards to {@code getSlideShowImpl().getEncryptionInfo()}.
 *
 * @return whatever the delegate returns
 * @throws IOException if the delegate call throws it
 */
@Override | |||
public EncryptionInfo getEncryptionInfo() throws IOException { | |||
return getSlideShowImpl().getEncryptionInfo(); | |||
} | |||
} |
@@ -36,6 +36,7 @@ import java.util.NavigableMap; | |||
import java.util.TreeMap; | |||
import org.apache.poi.POIDocument; | |||
import org.apache.poi.hpsf.PropertySet; | |||
import org.apache.poi.hslf.exceptions.CorruptPowerPointFileException; | |||
import org.apache.poi.hslf.exceptions.HSLFException; | |||
import org.apache.poi.hslf.exceptions.OldPowerPointFormatException; | |||
@@ -714,8 +715,6 @@ public final class HSLFSlideShowImpl extends POIDocument implements Closeable { | |||
} | |||
/* ******************* adding methods follow ********************* */ | |||
/** | |||
@@ -850,6 +849,38 @@ public final class HSLFSlideShowImpl extends POIDocument implements Closeable { | |||
return "EncryptedSummary"; | |||
} | |||
/**
 * Package-private bridge that calls the inherited {@code writeProperties()}.
 * NOTE(review): presumably exists so a same-package wrapper can reach the
 * superclass method - confirm against the callers.
 *
 * @throws IOException if the superclass call throws it
 */
void writePropertiesImpl() throws IOException { | |||
super.writeProperties(); | |||
} | |||
/**
 * Package-private bridge that calls the inherited {@code getPropertySet(setName)}.
 *
 * @param setName handed unchanged to the superclass method
 * @return whatever the superclass method returns
 * @throws IOException if the superclass call throws it
 */
PropertySet getPropertySetImpl(String setName) throws IOException { | |||
return super.getPropertySet(setName); | |||
} | |||
/**
 * Package-private bridge that calls the inherited
 * {@code getPropertySet(setName, encryptionInfo)}.
 *
 * @param setName handed unchanged to the superclass method
 * @param encryptionInfo handed unchanged to the superclass method
 * @return whatever the superclass method returns
 * @throws IOException if the superclass call throws it
 */
PropertySet getPropertySetImpl(String setName, EncryptionInfo encryptionInfo) throws IOException { | |||
return super.getPropertySet(setName, encryptionInfo); | |||
} | |||
/**
 * Package-private bridge that calls the inherited
 * {@code writeProperties(outFS, writtenEntries)}.
 *
 * @param outFS handed unchanged to the superclass method
 * @param writtenEntries handed unchanged to the superclass method
 * @throws IOException if the superclass call throws it
 */
void writePropertiesImpl(POIFSFileSystem outFS, List<String> writtenEntries) throws IOException { | |||
super.writeProperties(outFS, writtenEntries); | |||
} | |||
/**
 * Package-private bridge that calls the inherited {@code validateInPlaceWritePossible()}.
 *
 * @throws IllegalStateException if the superclass call throws it
 */
void validateInPlaceWritePossibleImpl() throws IllegalStateException { | |||
super.validateInPlaceWritePossible(); | |||
} | |||
/**
 * Package-private bridge that calls the inherited {@code clearDirectory()}.
 */
void clearDirectoryImpl() { | |||
super.clearDirectory(); | |||
} | |||
/**
 * Package-private bridge that calls the inherited {@code initDirectory()}.
 *
 * @return the boolean returned by the superclass method
 */
boolean initDirectoryImpl() { | |||
return super.initDirectory(); | |||
} | |||
/**
 * Package-private bridge that calls the inherited {@code replaceDirectory(newDirectory)}.
 *
 * @param newDirectory handed unchanged to the superclass method
 */
void replaceDirectoryImpl(DirectoryNode newDirectory) { | |||
super.replaceDirectory(newDirectory); | |||
} | |||
private static class BufAccessBAOS extends ByteArrayOutputStream { | |||
public byte[] getBuf() { | |||
return buf; |
@@ -1,61 +0,0 @@ | |||
/* ==================================================================== | |||
Licensed to the Apache Software Foundation (ASF) under one or more | |||
contributor license agreements. See the NOTICE file distributed with | |||
this work for additional information regarding copyright ownership. | |||
The ASF licenses this file to You under the Apache License, Version 2.0 | |||
(the "License"); you may not use this file except in compliance with | |||
the License. You may obtain a copy of the License at | |||
http://www.apache.org/licenses/LICENSE-2.0 | |||
Unless required by applicable law or agreed to in writing, software | |||
distributed under the License is distributed on an "AS IS" BASIS, | |||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
See the License for the specific language governing permissions and | |||
limitations under the License. | |||
==================================================================== */ | |||
package org.apache.poi.hsmf.extractor; | |||
import org.apache.poi.hsmf.MAPIMessage; | |||
import org.apache.poi.poifs.filesystem.DirectoryNode; | |||
import org.apache.poi.poifs.filesystem.POIFSFileSystem; | |||
import org.apache.poi.util.Removal; | |||
import java.io.File; | |||
import java.io.IOException; | |||
import java.io.InputStream; | |||
/** | |||
* A text extractor for HSMF (Outlook) .msg files. | |||
* Outputs in a format somewhat like a plain text email. | |||
* | |||
* @deprecated use {@link OutlookTextExtractor} instead | |||
*/ | |||
@Deprecated | |||
@Removal(version = "5.0.0") | |||
public class OutlookTextExtactor extends OutlookTextExtractor { | |||
// Every constructor below only adapts its argument and hands it to the
// OutlookTextExtractor superclass; this subclass adds no state of its own.

/** Wraps an already-parsed message. */
public OutlookTextExtactor(MAPIMessage msg) { | |||
super(msg); | |||
} | |||
/** Builds a {@code MAPIMessage} from the given directory node and wraps it. */
public OutlookTextExtactor(DirectoryNode poifsDir) throws IOException { | |||
super(new MAPIMessage(poifsDir)); | |||
} | |||
/** Builds a {@code MAPIMessage} from the given file system and wraps it. */
public OutlookTextExtactor(POIFSFileSystem fs) throws IOException { | |||
super(new MAPIMessage(fs)); | |||
} | |||
/** Builds a {@code MAPIMessage} from the given stream and wraps it. */
public OutlookTextExtactor(InputStream inp) throws IOException { | |||
super(new MAPIMessage(inp)); | |||
} | |||
/**
 * Command-line entry point: treats each argument as a file name and prints
 * the extracted text of that file to standard output.
 */
public static void main(String[] args) throws Exception { | |||
for (String filename : args) { | |||
// try-with-resources closes the extractor and the file system after each file
try (POIFSFileSystem poifs = new POIFSFileSystem(new File(filename)); | |||
OutlookTextExtractor extractor = new OutlookTextExtractor(poifs)) { | |||
System.out.println(extractor.getText()); | |||
} | |||
} | |||
} | |||
} | |||
@@ -42,9 +42,12 @@ import org.apache.poi.util.LocaleUtil; | |||
* | |||
* @since 4.1.2 | |||
*/ | |||
public class OutlookTextExtractor extends POIOLE2TextExtractor { | |||
public class OutlookTextExtractor implements POIOLE2TextExtractor { | |||
private final MAPIMessage msg; | |||
private boolean doCloseFilesystem = true; | |||
public OutlookTextExtractor(MAPIMessage msg) { | |||
super(msg); | |||
this.msg = msg; | |||
} | |||
public OutlookTextExtractor(DirectoryNode poifsDir) throws IOException { | |||
@@ -76,14 +79,13 @@ public class OutlookTextExtractor extends POIOLE2TextExtractor { | |||
* Returns the underlying MAPI message | |||
*/ | |||
public MAPIMessage getMAPIMessage() { | |||
return (MAPIMessage) document; | |||
return msg; | |||
} | |||
/** | |||
* Outputs something a little like a RFC822 email | |||
*/ | |||
public String getText() { | |||
MAPIMessage msg = (MAPIMessage) document; | |||
StringBuilder s = new StringBuilder(); | |||
// See if we can get a suitable encoding for any | |||
@@ -201,4 +203,24 @@ public class OutlookTextExtractor extends POIOLE2TextExtractor { | |||
} | |||
s.append("\n"); | |||
} | |||
/**
 * Returns the {@code msg} field, i.e. the message this extractor was constructed with.
 */
@Override | |||
public MAPIMessage getDocument() { | |||
return msg; | |||
} | |||
/**
 * Stores the flag later reported by {@code isCloseFilesystem()}.
 *
 * @param doCloseFilesystem new value of the flag
 */
@Override | |||
public void setCloseFilesystem(boolean doCloseFilesystem) { | |||
this.doCloseFilesystem = doCloseFilesystem; | |||
} | |||
/**
 * Returns the flag stored by {@code setCloseFilesystem(boolean)}.
 * NOTE(review): presumably consulted by the interface's close logic - confirm
 * against POIOLE2TextExtractor.
 */
@Override | |||
public boolean isCloseFilesystem() { | |||
return doCloseFilesystem; | |||
} | |||
/**
 * Returns the {@code msg} field - the same object that {@code getDocument()} returns.
 */
@Override | |||
public MAPIMessage getFilesystem() { | |||
return msg; | |||
} | |||
} |
@@ -31,13 +31,14 @@ import org.apache.poi.poifs.filesystem.POIFSFileSystem; | |||
* Class to extract the text from old (Word 6 / Word 95) Word Documents. | |||
* | |||
* This should only be used on the older files, for most uses you | |||
* should call {@link WordExtractor} which deals properly | |||
* should call {@link WordExtractor} which deals properly | |||
* with HWPF. | |||
* | |||
* @author Nick Burch | |||
*/ | |||
public final class Word6Extractor extends POIOLE2TextExtractor { | |||
public final class Word6Extractor implements POIOLE2TextExtractor { | |||
private HWPFOldDocument doc; | |||
private boolean doCloseFilesystem = true; | |||
/** | |||
* Create a new Word Extractor | |||
@@ -49,12 +50,11 @@ public final class Word6Extractor extends POIOLE2TextExtractor { | |||
/** | |||
* Create a new Word Extractor | |||
* | |||
* | |||
* @param fs | |||
* POIFSFileSystem containing the word file | |||
*/ | |||
public Word6Extractor( POIFSFileSystem fs ) throws IOException | |||
{ | |||
public Word6Extractor( POIFSFileSystem fs ) throws IOException { | |||
this( fs.getRoot() ); | |||
} | |||
@@ -62,14 +62,11 @@ public final class Word6Extractor extends POIOLE2TextExtractor { | |||
* @deprecated Use {@link #Word6Extractor(DirectoryNode)} instead | |||
*/ | |||
@Deprecated | |||
public Word6Extractor( DirectoryNode dir, POIFSFileSystem fs ) | |||
throws IOException | |||
{ | |||
public Word6Extractor( DirectoryNode dir, POIFSFileSystem fs ) throws IOException { | |||
this( dir ); | |||
} | |||
public Word6Extractor( DirectoryNode dir ) throws IOException | |||
{ | |||
public Word6Extractor( DirectoryNode dir ) throws IOException { | |||
this( new HWPFOldDocument( dir ) ); | |||
} | |||
@@ -78,7 +75,6 @@ public final class Word6Extractor extends POIOLE2TextExtractor { | |||
* @param doc The HWPFOldDocument to extract from | |||
*/ | |||
public Word6Extractor(HWPFOldDocument doc) { | |||
super(doc); | |||
this.doc = doc; | |||
} | |||
@@ -101,7 +97,7 @@ public final class Word6Extractor extends POIOLE2TextExtractor { | |||
ret = new String[doc.getTextTable().getTextPieces().size()]; | |||
for(int i=0; i<ret.length; i++) { | |||
ret[i] = doc.getTextTable().getTextPieces().get(i).getStringBuilder().toString(); | |||
// Fix the line endings | |||
ret[i] = ret[i].replaceAll("\r", "\ufffe"); | |||
ret[i] = ret[i].replaceAll("\ufffe","\r\n"); | |||
@@ -111,25 +107,40 @@ public final class Word6Extractor extends POIOLE2TextExtractor { | |||
return ret; | |||
} | |||
public String getText() | |||
{ | |||
try | |||
{ | |||
public String getText() { | |||
try { | |||
WordToTextConverter wordToTextConverter = new WordToTextConverter(); | |||
wordToTextConverter.processDocument( doc ); | |||
return wordToTextConverter.getText(); | |||
} | |||
catch ( Exception exc ) | |||
{ | |||
} catch ( Exception exc ) { | |||
// fall-back | |||
StringBuilder text = new StringBuilder(); | |||
for ( String t : getParagraphText() ) | |||
{ | |||
for ( String t : getParagraphText() ) { | |||
text.append( t ); | |||
} | |||
return text.toString(); | |||
} | |||
} | |||
/**
 * Returns the {@code doc} field, i.e. the old-format document this extractor reads from.
 */
@Override | |||
public HWPFOldDocument getDocument() { | |||
return doc; | |||
} | |||
/**
 * Stores the flag later reported by {@code isCloseFilesystem()}.
 *
 * @param doCloseFilesystem new value of the flag
 */
@Override | |||
public void setCloseFilesystem(boolean doCloseFilesystem) { | |||
this.doCloseFilesystem = doCloseFilesystem; | |||
} | |||
/**
 * Returns the flag stored by {@code setCloseFilesystem(boolean)}.
 * NOTE(review): presumably consulted by the interface's close logic - confirm
 * against POIOLE2TextExtractor.
 */
@Override | |||
public boolean isCloseFilesystem() { | |||
return doCloseFilesystem; | |||
} | |||
/**
 * Returns the {@code doc} field - the same object that {@code getDocument()} returns.
 */
@Override | |||
public HWPFOldDocument getFilesystem() { | |||
return doc; | |||
} | |||
} |
@@ -17,7 +17,6 @@ | |||
package org.apache.poi.hwpf.extractor; | |||
import java.io.FileInputStream; | |||
import java.io.IOException; | |||
import java.io.InputStream; | |||
@@ -39,8 +38,9 @@ import org.apache.poi.poifs.filesystem.POIFSFileSystem; | |||
* | |||
* @author Nick Burch | |||
*/ | |||
public final class WordExtractor extends POIOLE2TextExtractor { | |||
private HWPFDocument doc; | |||
public final class WordExtractor implements POIOLE2TextExtractor { | |||
private final HWPFDocument doc; | |||
private boolean doCloseFilesystem = true; | |||
/** | |||
* Create a new Word Extractor | |||
@@ -73,29 +73,9 @@ public final class WordExtractor extends POIOLE2TextExtractor { | |||
* The HWPFDocument to extract from | |||
*/ | |||
public WordExtractor( HWPFDocument doc ) { | |||
super( doc ); | |||
this.doc = doc; | |||
} | |||
/** | |||
* Command line extractor, so people will stop moaning that they can't just | |||
* run this. | |||
*/ | |||
public static void main( String[] args ) throws IOException { | |||
if ( args.length == 0 ) { | |||
System.err.println( "Use:" ); | |||
System.err | |||
.println( " java org.apache.poi.hwpf.extractor.WordExtractor <filename>" ); | |||
System.exit( 1 ); | |||
} | |||
// Process the first argument as a file | |||
InputStream fin = new FileInputStream( args[0] ); | |||
try (WordExtractor extractor = new WordExtractor(fin)) { | |||
System.out.println(extractor.getText()); | |||
} | |||
} | |||
/** | |||
* Get the text from the word file, as an array with one String per | |||
* paragraph | |||
@@ -142,7 +122,7 @@ public final class WordExtractor extends POIOLE2TextExtractor { | |||
return getParagraphText( r ); | |||
} | |||
protected static String[] getParagraphText( Range r ) { | |||
static String[] getParagraphText( Range r ) { | |||
String[] ret; | |||
ret = new String[r.numParagraphs()]; | |||
for ( int i = 0; i < ret.length; i++ ) { | |||
@@ -287,8 +267,27 @@ public final class WordExtractor extends POIOLE2TextExtractor { | |||
/** | |||
* Removes any fields (eg macros, page markers etc) from the string. | |||
*/ | |||
public static String stripFields( String text ) | |||
{ | |||
public static String stripFields( String text ) { | |||
return Range.stripFields( text ); | |||
} | |||
/**
 * Returns the {@code doc} field, i.e. the document this extractor was constructed with.
 */
@Override | |||
public HWPFDocument getDocument() { | |||
return doc; | |||
} | |||
/**
 * Stores the flag later reported by {@code isCloseFilesystem()}.
 *
 * @param doCloseFilesystem new value of the flag
 */
@Override | |||
public void setCloseFilesystem(boolean doCloseFilesystem) { | |||
this.doCloseFilesystem = doCloseFilesystem; | |||
} | |||
/**
 * Returns the flag stored by {@code setCloseFilesystem(boolean)}.
 * NOTE(review): presumably consulted by the interface's close logic - confirm
 * against POIOLE2TextExtractor.
 */
@Override | |||
public boolean isCloseFilesystem() { | |||
return doCloseFilesystem; | |||
} | |||
/**
 * Returns the {@code doc} field - the same object that {@code getDocument()} returns.
 */
@Override | |||
public HWPFDocument getFilesystem() { | |||
return doc; | |||
} | |||
} |
@@ -19,12 +19,9 @@ package org.apache.poi.hdgf.extractor; | |||
import static org.junit.Assert.assertEquals; | |||
import static org.junit.Assert.assertNotNull; | |||
import static org.junit.Assert.assertTrue; | |||
import java.io.ByteArrayOutputStream; | |||
import java.io.IOException; | |||
import java.io.InputStream; | |||
import java.io.PrintStream; | |||
import org.apache.poi.POIDataSamples; | |||
import org.apache.poi.hdgf.HDGFDiagram; | |||
@@ -32,7 +29,7 @@ import org.apache.poi.poifs.filesystem.POIFSFileSystem; | |||
import org.junit.Test; | |||
public final class TestVisioExtractor { | |||
private static POIDataSamples _dgTests = POIDataSamples.getDiagramInstance(); | |||
private static final POIDataSamples _dgTests = POIDataSamples.getDiagramInstance(); | |||
private final String defFilename = "Test_Visio-Some_Random_Text.vsd"; | |||
private final int defTextChunks = 5; | |||
@@ -63,7 +60,7 @@ public final class TestVisioExtractor { | |||
is3.close(); | |||
HDGFDiagram hdgf3 = new HDGFDiagram(poifs3); | |||
VisioTextExtractor extractor3 = new VisioTextExtractor(hdgf3); | |||
assertNotNull(extractor3); | |||
assertNotNull(extractor3.getAllText()); | |||
@@ -97,7 +94,7 @@ public final class TestVisioExtractor { | |||
@Test | |||
public void testProblemFiles() throws Exception { | |||
String[] files = { | |||
"44594.vsd", "44594-2.vsd", | |||
"44594.vsd", "44594-2.vsd", | |||
"ShortChunk1.vsd", "ShortChunk2.vsd", "ShortChunk3.vsd", | |||
"NegativeChunkLength.vsd", "NegativeChunkLength2.vsd" | |||
}; | |||
@@ -108,31 +105,6 @@ public final class TestVisioExtractor { | |||
} | |||
} | |||
@Test | |||
public void testMain() throws Exception { | |||
PrintStream oldOut = System.out; | |||
ByteArrayOutputStream baos = new ByteArrayOutputStream(); | |||
PrintStream capture = new PrintStream(baos); | |||
System.setOut(capture); | |||
String path = _dgTests.getFile(defFilename).getPath(); | |||
VisioTextExtractor.main(new String[] {path}); | |||
// Put things back | |||
System.setOut(oldOut); | |||
// Check | |||
capture.flush(); | |||
String text = baos.toString(); | |||
// YK: stdout can contain lots of other stuff if logging is sent to console | |||
// ( -Dorg.apache.poi.util.POILogger=org.apache.poi.util.SystemOutLogger) | |||
assertTrue( text.contains( | |||
"text\nView\n" + | |||
"Test View\nI am a test view\n" + | |||
"Some random text, on a page\n" | |||
)); | |||
} | |||
private VisioTextExtractor openExtractor(String fileName) throws IOException { | |||
try (InputStream is = _dgTests.openResourceAsStream(fileName)) { | |||
return new VisioTextExtractor(is); |
@@ -42,7 +42,6 @@ import org.apache.poi.hsmf.datatypes.PropertyValue; | |||
import org.apache.poi.hsmf.datatypes.PropertyValue.LongPropertyValue; | |||
import org.apache.poi.hsmf.datatypes.PropertyValue.TimePropertyValue; | |||
import org.apache.poi.hsmf.dev.HSMFDump; | |||
import org.apache.poi.hsmf.extractor.OutlookTextExtactor; | |||
import org.apache.poi.hsmf.extractor.OutlookTextExtractor; | |||
import org.apache.poi.poifs.filesystem.POIFSFileSystem; | |||
import org.apache.poi.util.LocaleUtil; | |||
@@ -74,23 +73,23 @@ public final class TestFixedSizedProperties { | |||
fsMessageFails = new POIFSFileSystem(samples.getFile(messageFails)); | |||
mapiMessageSucceeds = new MAPIMessage(fsMessageSucceeds); | |||
mapiMessageFails = new MAPIMessage(fsMessageFails); | |||
mapiMessageFails = new MAPIMessage(fsMessageFails); | |||
messageDateFormat = new SimpleDateFormat("E, d MMM yyyy HH:mm:ss", Locale.ROOT); | |||
messageDateFormat.setTimeZone(LocaleUtil.TIMEZONE_UTC); | |||
messageDateFormat.setTimeZone(LocaleUtil.TIMEZONE_UTC); | |||
userTimeZone = LocaleUtil.getUserTimeZone(); | |||
LocaleUtil.setUserTimeZone(LocaleUtil.TIMEZONE_UTC); | |||
} | |||
/**
 * Class-level teardown: restores the user time zone that the setup code
 * saved before switching to UTC, then closes both shared file systems.
 */
@AfterClass | |||
public static void closeFS() throws Exception { | |||
LocaleUtil.setUserTimeZone(userTimeZone); | |||
fsMessageSucceeds.close(); | |||
fsMessageFails.close(); | |||
} | |||
/** | |||
* Check we can find a sensible number of properties on a few | |||
* of our test files | |||
@@ -98,21 +97,21 @@ public final class TestFixedSizedProperties { | |||
@Test | |||
public void testPropertiesFound() { | |||
Map<MAPIProperty,List<PropertyValue>> props; | |||
// Each message must expose more than 10 properties on its main chunks;
// the map's toString() is used as the assertion failure message.
props = mapiMessageSucceeds.getMainChunks().getProperties(); | |||
assertTrue(props.toString(), props.size() > 10); | |||
props = mapiMessageFails.getMainChunks().getProperties(); | |||
assertTrue(props.toString(), props.size() > 10); | |||
} | |||
/** | |||
* Check we find properties of a variety of different types | |||
*/ | |||
@Test | |||
public void testPropertyValueTypes() { | |||
Chunks mainChunks = mapiMessageSucceeds.getMainChunks(); | |||
// Ask to have the values looked up | |||
Map<MAPIProperty,List<PropertyValue>> props = mainChunks.getProperties(); | |||
HashSet<Class<? extends PropertyValue>> seenTypes = | |||
@@ -126,7 +125,7 @@ public final class TestFixedSizedProperties { | |||
assertTrue(seenTypes.toString(), seenTypes.contains(LongPropertyValue.class)); | |||
assertTrue(seenTypes.toString(), seenTypes.contains(TimePropertyValue.class)); | |||
assertFalse(seenTypes.toString(), seenTypes.contains(ChunkBasedPropertyValue.class)); | |||
// Ask for the raw values | |||
seenTypes.clear(); | |||
for (PropertyValue pv : mainChunks.getRawProperties().values()) { | |||
@@ -144,31 +143,21 @@ public final class TestFixedSizedProperties { | |||
@Test | |||
public void testReadMessageDateSucceedsWithOutlookTextExtractor() throws Exception { | |||
OutlookTextExtractor ext = new OutlookTextExtractor(mapiMessageSucceeds); | |||
ext.setFilesystem(null); // Don't close re-used test resources here | |||
ext.setCloseFilesystem(false); | |||
String text = ext.getText(); | |||
assertContains(text, "Date: Fri, 22 Jun 2012 18:32:54 +0000\n"); | |||
ext.close(); | |||
} | |||
@Test | |||
public void testReadMessageDateSucceedsWithOutlookTextExtactor() throws Exception { | |||
OutlookTextExtactor ext = new OutlookTextExtactor(mapiMessageSucceeds); | |||
ext.setFilesystem(null); // Don't close re-used test resources here | |||
String text = ext.getText(); | |||
assertContains(text, "Date: Fri, 22 Jun 2012 18:32:54 +0000\n"); | |||
ext.close(); | |||
} | |||
/** | |||
* Test to see if we can read the Date Chunk with OutlookTextExtractor. | |||
*/ | |||
@Test | |||
public void testReadMessageDateFailsWithOutlookTextExtractor() throws Exception { | |||
OutlookTextExtractor ext = new OutlookTextExtractor(mapiMessageFails); | |||
ext.setFilesystem(null); // Don't close re-used test resources here | |||
ext.setCloseFilesystem(false); | |||
String text = ext.getText(); | |||
assertContains(text, "Date: Thu, 21 Jun 2012 14:14:04 +0000\n"); | |||
ext.close(); | |||
@@ -182,7 +171,7 @@ public final class TestFixedSizedProperties { | |||
PrintStream stream = new PrintStream(new ByteArrayOutputStream()); | |||
HSMFDump dump = new HSMFDump(fsMessageSucceeds); | |||
dump.dump(stream); | |||
} | |||
} | |||
/** | |||
* Test to see if we can read the Date Chunk with HSMFDump. | |||
@@ -202,19 +191,19 @@ public final class TestFixedSizedProperties { | |||
// Check via the message date | |||
Calendar clientSubmitTime = mapiMessageSucceeds.getMessageDate(); | |||
assertEquals( | |||
"Fri, 22 Jun 2012 18:32:54", | |||
"Fri, 22 Jun 2012 18:32:54", | |||
messageDateFormat.format(clientSubmitTime.getTime())); | |||
// Fetch the property value directly | |||
Map<MAPIProperty,List<PropertyValue>> props = | |||
mapiMessageSucceeds.getMainChunks().getProperties(); | |||
List<PropertyValue> pv = props.get(MAPIProperty.CLIENT_SUBMIT_TIME); | |||
List<PropertyValue> pv = props.get(MAPIProperty.CLIENT_SUBMIT_TIME); | |||
assertNotNull(pv); | |||
assertEquals(1, pv.size()); | |||
clientSubmitTime = (Calendar)pv.get(0).getValue(); | |||
assertEquals( | |||
"Fri, 22 Jun 2012 18:32:54", | |||
"Fri, 22 Jun 2012 18:32:54", | |||
messageDateFormat.format(clientSubmitTime.getTime())); | |||
} | |||
} |
@@ -20,7 +20,6 @@ package org.apache.poi.hsmf.extractor; | |||
import static org.apache.poi.POITestCase.assertContains; | |||
import static org.apache.poi.POITestCase.assertNotContained; | |||
import static org.junit.Assert.assertEquals; | |||
import static org.junit.Assert.assertTrue; | |||
import java.io.FileInputStream; | |||
import java.text.SimpleDateFormat; | |||
@@ -57,68 +56,62 @@ public final class TestOutlookTextExtractor { | |||
@Test | |||
public void testQuick() throws Exception { | |||
POIFSFileSystem poifs = new POIFSFileSystem(samples.getFile("quick.msg"), true); | |||
MAPIMessage msg = new MAPIMessage(poifs); | |||
OutlookTextExtractor ext = new OutlookTextExtractor(msg); | |||
String text = ext.getText(); | |||
assertContains(text, "From: Kevin Roast\n"); | |||
assertContains(text, "To: Kevin Roast <kevin.roast@alfresco.org>\n"); | |||
assertNotContained(text, "CC:"); | |||
assertNotContained(text, "BCC:"); | |||
assertNotContained(text, "Attachment:"); | |||
assertContains(text, "Subject: Test the content transformer\n"); | |||
Calendar cal = LocaleUtil.getLocaleCalendar(2007, 5, 14, 9, 42, 55); | |||
SimpleDateFormat f = new SimpleDateFormat("E, d MMM yyyy HH:mm:ss Z", Locale.ROOT); | |||
f.setTimeZone(LocaleUtil.getUserTimeZone()); | |||
String dateText = f.format(cal.getTime()); | |||
assertContains(text, "Date: " + dateText + "\n"); | |||
assertContains(text, "The quick brown fox jumps over the lazy dog"); | |||
ext.close(); | |||
poifs.close(); | |||
try (POIFSFileSystem poifs = new POIFSFileSystem(samples.getFile("quick.msg"), true); | |||
MAPIMessage msg = new MAPIMessage(poifs); | |||
OutlookTextExtractor ext = new OutlookTextExtractor(msg)) { | |||
String text = ext.getText(); | |||
assertContains(text, "From: Kevin Roast\n"); | |||
assertContains(text, "To: Kevin Roast <kevin.roast@alfresco.org>\n"); | |||
assertNotContained(text, "CC:"); | |||
assertNotContained(text, "BCC:"); | |||
assertNotContained(text, "Attachment:"); | |||
assertContains(text, "Subject: Test the content transformer\n"); | |||
Calendar cal = LocaleUtil.getLocaleCalendar(2007, 5, 14, 9, 42, 55); | |||
SimpleDateFormat f = new SimpleDateFormat("E, d MMM yyyy HH:mm:ss Z", Locale.ROOT); | |||
f.setTimeZone(LocaleUtil.getUserTimeZone()); | |||
String dateText = f.format(cal.getTime()); | |||
assertContains(text, "Date: " + dateText + "\n"); | |||
assertContains(text, "The quick brown fox jumps over the lazy dog"); | |||
} | |||
} | |||
@Test | |||
public void testSimple() throws Exception { | |||
POIFSFileSystem poifs = new POIFSFileSystem(samples.getFile("simple_test_msg.msg"), true); | |||
MAPIMessage msg = new MAPIMessage(poifs); | |||
OutlookTextExtractor ext = new OutlookTextExtractor(msg); | |||
String text = ext.getText(); | |||
assertContains(text, "From: Travis Ferguson\n"); | |||
assertContains(text, "To: travis@overwrittenstack.com\n"); | |||
assertNotContained(text, "CC:"); | |||
assertNotContained(text, "BCC:"); | |||
assertContains(text, "Subject: test message\n"); | |||
assertContains(text, "Date: Fri, 6 Jul 2007 05:27:17 +0000\n"); | |||
assertContains(text, "This is a test message."); | |||
ext.close(); | |||
poifs.close(); | |||
try (POIFSFileSystem poifs = new POIFSFileSystem(samples.getFile("simple_test_msg.msg"), true); | |||
MAPIMessage msg = new MAPIMessage(poifs); | |||
OutlookTextExtractor ext = new OutlookTextExtractor(msg)) { | |||
String text = ext.getText(); | |||
assertContains(text, "From: Travis Ferguson\n"); | |||
assertContains(text, "To: travis@overwrittenstack.com\n"); | |||
assertNotContained(text, "CC:"); | |||
assertNotContained(text, "BCC:"); | |||
assertContains(text, "Subject: test message\n"); | |||
assertContains(text, "Date: Fri, 6 Jul 2007 05:27:17 +0000\n"); | |||
assertContains(text, "This is a test message."); | |||
} | |||
} | |||
@Test | |||
public void testConstructors() throws Exception { | |||
FileInputStream fis = new FileInputStream(samples.getFile("simple_test_msg.msg")); | |||
OutlookTextExtractor ext = new OutlookTextExtractor(fis); | |||
String inp = ext.getText(); | |||
ext.close(); | |||
fis.close(); | |||
POIFSFileSystem poifs = new POIFSFileSystem(samples.getFile("simple_test_msg.msg"), true); | |||
ext = new OutlookTextExtractor(poifs); | |||
String poifsTxt = ext.getText(); | |||
ext.close(); | |||
poifs.close(); | |||
fis = new FileInputStream(samples.getFile("simple_test_msg.msg")); | |||
ext = new OutlookTextExtractor(new MAPIMessage(fis)); | |||
String mapi = ext.getText(); | |||
ext.close(); | |||
fis.close(); | |||
String inp; | |||
try (FileInputStream fis = new FileInputStream(samples.getFile("simple_test_msg.msg")); | |||
OutlookTextExtractor ext = new OutlookTextExtractor(fis)) { | |||
inp = ext.getText(); | |||
} | |||
String poifsTxt; | |||
try (POIFSFileSystem poifs = new POIFSFileSystem(samples.getFile("simple_test_msg.msg"), true); | |||
OutlookTextExtractor ext = new OutlookTextExtractor(poifs)){ | |||
poifsTxt = ext.getText(); | |||
} | |||
String mapi; | |||
try (FileInputStream fis = new FileInputStream(samples.getFile("simple_test_msg.msg")); | |||
OutlookTextExtractor ext = new OutlookTextExtractor(new MAPIMessage(fis))) { | |||
mapi = ext.getText(); | |||
} | |||
assertEquals(inp, poifsTxt); | |||
assertEquals(inp, mapi); | |||
@@ -142,25 +135,22 @@ public final class TestOutlookTextExtractor { | |||
"example_sent_regular.msg", "example_sent_unicode.msg" | |||
}; | |||
for (String file : files) { | |||
POIFSFileSystem poifs = new POIFSFileSystem(samples.getFile(file), true); | |||
MAPIMessage msg = new MAPIMessage(poifs); | |||
OutlookTextExtractor ext = new OutlookTextExtractor(msg); | |||
String text = ext.getText(); | |||
assertContains(text, "From: Mike Farman\n"); | |||
assertContains(text, "To: 'Ashutosh Dandavate' <ashutosh.dandavate@alfresco.com>; " + | |||
"'Paul Holmes-Higgin' <paul.hh@alfresco.com>; 'Mike Farman' <mikef@alfresco.com>\n"); | |||
assertContains(text, "CC: 'nickb@alfresco.com' <nickb@alfresco.com>; " + | |||
"'nick.burch@alfresco.com' <nick.burch@alfresco.com>; 'Roy Wetherall' <roy.wetherall@alfresco.com>\n"); | |||
assertContains(text, "BCC: 'David Caruana' <dave.caruana@alfresco.com>; " + | |||
"'Vonka Jan' <jan.vonka@alfresco.com>\n"); | |||
assertContains(text, "Subject: This is a test message please ignore\n"); | |||
assertContains(text, "Date:"); | |||
assertContains(text, "The quick brown fox jumps over the lazy dog"); | |||
ext.close(); | |||
poifs.close(); | |||
try (POIFSFileSystem poifs = new POIFSFileSystem(samples.getFile(file), true); | |||
MAPIMessage msg = new MAPIMessage(poifs); | |||
OutlookTextExtractor ext = new OutlookTextExtractor(msg)) { | |||
String text = ext.getText(); | |||
assertContains(text, "From: Mike Farman\n"); | |||
assertContains(text, "To: 'Ashutosh Dandavate' <ashutosh.dandavate@alfresco.com>; " + | |||
"'Paul Holmes-Higgin' <paul.hh@alfresco.com>; 'Mike Farman' <mikef@alfresco.com>\n"); | |||
assertContains(text, "CC: 'nickb@alfresco.com' <nickb@alfresco.com>; " + | |||
"'nick.burch@alfresco.com' <nick.burch@alfresco.com>; 'Roy Wetherall' <roy.wetherall@alfresco.com>\n"); | |||
assertContains(text, "BCC: 'David Caruana' <dave.caruana@alfresco.com>; " + | |||
"'Vonka Jan' <jan.vonka@alfresco.com>\n"); | |||
assertContains(text, "Subject: This is a test message please ignore\n"); | |||
assertContains(text, "Date:"); | |||
assertContains(text, "The quick brown fox jumps over the lazy dog"); | |||
} | |||
} | |||
} | |||
@@ -182,25 +172,21 @@ public final class TestOutlookTextExtractor { | |||
"example_received_regular.msg", "example_received_unicode.msg" | |||
}; | |||
for (String file : files) { | |||
POIFSFileSystem poifs = new POIFSFileSystem(samples.getFile(file), true); | |||
MAPIMessage msg = new MAPIMessage(poifs); | |||
OutlookTextExtractor ext = new OutlookTextExtractor(msg); | |||
String text = ext.getText(); | |||
assertContains(text, "From: Mike Farman\n"); | |||
assertContains(text, "To: 'Ashutosh Dandavate' <ashutosh.dandavate@alfresco.com>; " + | |||
"'Paul Holmes-Higgin' <paul.hh@alfresco.com>; 'Mike Farman' <mikef@alfresco.com>\n"); | |||
assertContains(text, "CC: nickb@alfresco.com; " + | |||
"nick.burch@alfresco.com; 'Roy Wetherall' <roy.wetherall@alfresco.com>\n"); | |||
assertNotContained(text, "BCC:"); | |||
assertContains(text, "Subject: This is a test message please ignore\n"); | |||
assertContains(text, "Date: Mon, 11 Jan 2010 16:2"); // Exact times differ slightly | |||
assertContains(text, "The quick brown fox jumps over the lazy dog"); | |||
ext.close(); | |||
poifs.close(); | |||
try (POIFSFileSystem poifs = new POIFSFileSystem(samples.getFile(file), true); | |||
MAPIMessage msg = new MAPIMessage(poifs); | |||
OutlookTextExtractor ext = new OutlookTextExtractor(msg)) { | |||
String text = ext.getText(); | |||
assertContains(text, "From: Mike Farman\n"); | |||
assertContains(text, "To: 'Ashutosh Dandavate' <ashutosh.dandavate@alfresco.com>; " + | |||
"'Paul Holmes-Higgin' <paul.hh@alfresco.com>; 'Mike Farman' <mikef@alfresco.com>\n"); | |||
assertContains(text, "CC: nickb@alfresco.com; " + | |||
"nick.burch@alfresco.com; 'Roy Wetherall' <roy.wetherall@alfresco.com>\n"); | |||
assertNotContained(text, "BCC:"); | |||
assertContains(text, "Subject: This is a test message please ignore\n"); | |||
assertContains(text, "Date: Mon, 11 Jan 2010 16:2"); // Exact times differ slightly | |||
assertContains(text, "The quick brown fox jumps over the lazy dog"); | |||
} | |||
} | |||
} | |||
@@ -210,85 +196,59 @@ public final class TestOutlookTextExtractor { | |||
@SuppressWarnings("JavadocReference") | |||
@Test | |||
public void testWithAttachments() throws Exception { | |||
POIFSFileSystem poifs = new POIFSFileSystem(samples.getFile("attachment_test_msg.msg"), true); | |||
MAPIMessage msg = new MAPIMessage(poifs); | |||
OutlookTextExtractor ext = new OutlookTextExtractor(msg); | |||
// Check the normal bits | |||
String text = ext.getText(); | |||
assertContains(text, "From: Nicolas1"); | |||
assertContains(text, "To: 'nicolas1.23456@free.fr'"); | |||
assertNotContained(text, "CC:"); | |||
assertNotContained(text, "BCC:"); | |||
assertContains(text, "Subject: test"); | |||
assertContains(text, "Date: Wed, 22 Apr"); | |||
assertContains(text, "Attachment: test-unicode.doc\n"); | |||
assertContains(text, "Attachment: pj1.txt\n"); | |||
assertContains(text, "contenu"); | |||
// Embedded bits are checked in | |||
// TestExtractorFactory | |||
ext.close(); | |||
poifs.close(); | |||
try (POIFSFileSystem poifs = new POIFSFileSystem(samples.getFile("attachment_test_msg.msg"), true); | |||
MAPIMessage msg = new MAPIMessage(poifs); | |||
OutlookTextExtractor ext = new OutlookTextExtractor(msg)) { | |||
// Check the normal bits | |||
String text = ext.getText(); | |||
assertContains(text, "From: Nicolas1"); | |||
assertContains(text, "To: 'nicolas1.23456@free.fr'"); | |||
assertNotContained(text, "CC:"); | |||
assertNotContained(text, "BCC:"); | |||
assertContains(text, "Subject: test"); | |||
assertContains(text, "Date: Wed, 22 Apr"); | |||
assertContains(text, "Attachment: test-unicode.doc\n"); | |||
assertContains(text, "Attachment: pj1.txt\n"); | |||
assertContains(text, "contenu"); | |||
// Embedded bits are checked in | |||
// TestExtractorFactory | |||
} | |||
} | |||
@Test | |||
public void testWithAttachedMessage() throws Exception { | |||
POIFSFileSystem poifs = new POIFSFileSystem(samples.getFile("58214_with_attachment.msg"), true); | |||
MAPIMessage msg = new MAPIMessage(poifs); | |||
OutlookTextExtractor ext = new OutlookTextExtractor(msg); | |||
String text = ext.getText(); | |||
// Check we got bits from the main message | |||
assertContains(text, "Master mail"); | |||
assertContains(text, "ante in lacinia euismod"); | |||
try (POIFSFileSystem poifs = new POIFSFileSystem(samples.getFile("58214_with_attachment.msg"), true); | |||
MAPIMessage msg = new MAPIMessage(poifs); | |||
OutlookTextExtractor ext = new OutlookTextExtractor(msg)) { | |||
String text = ext.getText(); | |||
// But not the attached message | |||
assertNotContained(text, "Test mail attachment"); | |||
assertNotContained(text, "Lorem ipsum dolor sit"); | |||
// Check we got bits from the main message | |||
assertContains(text, "Master mail"); | |||
assertContains(text, "ante in lacinia euismod"); | |||
ext.close(); | |||
poifs.close(); | |||
// But not the attached message | |||
assertNotContained(text, "Test mail attachment"); | |||
assertNotContained(text, "Lorem ipsum dolor sit"); | |||
} | |||
} | |||
@Test | |||
public void testEncodings() throws Exception { | |||
POIFSFileSystem poifs = new POIFSFileSystem(samples.getFile("chinese-traditional.msg"), true); | |||
MAPIMessage msg = new MAPIMessage(poifs); | |||
OutlookTextExtractor ext = new OutlookTextExtractor(msg); | |||
String text = ext.getText(); | |||
// Check the english bits | |||
assertContains(text, "From: Tests Chang@FT"); | |||
assertContains(text, "tests.chang@fengttt.com"); | |||
// And check some chinese bits | |||
assertContains(text, "(\u5f35\u6bd3\u502b)"); | |||
assertContains(text, "( MSG \u683c\u5f0f\u6e2c\u8a66 )"); | |||
try (POIFSFileSystem poifs = new POIFSFileSystem(samples.getFile("chinese-traditional.msg"), true); | |||
MAPIMessage msg = new MAPIMessage(poifs); | |||
OutlookTextExtractor ext = new OutlookTextExtractor(msg)) { | |||
String text = ext.getText(); | |||
ext.close(); | |||
poifs.close(); | |||
} | |||
// Check the english bits | |||
assertContains(text, "From: Tests Chang@FT"); | |||
assertContains(text, "tests.chang@fengttt.com"); | |||
@Test | |||
public void testEncodingsDeprecatedClass() throws Exception { | |||
POIFSFileSystem poifs = new POIFSFileSystem(samples.getFile("chinese-traditional.msg"), true); | |||
MAPIMessage msg = new MAPIMessage(poifs); | |||
OutlookTextExtactor ext = new OutlookTextExtactor(msg); | |||
assertTrue("OutlookTextExtactor instanceof OutlookTextExtractor", ext instanceof OutlookTextExtractor); | |||
String text = ext.getText(); | |||
// Check the english bits | |||
assertContains(text, "From: Tests Chang@FT"); | |||
assertContains(text, "tests.chang@fengttt.com"); | |||
// And check some chinese bits | |||
assertContains(text, "(\u5f35\u6bd3\u502b)"); | |||
assertContains(text, "( MSG \u683c\u5f0f\u6e2c\u8a66 )"); | |||
ext.close(); | |||
poifs.close(); | |||
// And check some chinese bits | |||
assertContains(text, "(\u5f35\u6bd3\u502b)"); | |||
assertContains(text, "( MSG \u683c\u5f0f\u6e2c\u8a66 )"); | |||
} | |||
} | |||
} |
@@ -17,16 +17,16 @@ | |||
package org.apache.poi.hwpf.extractor; | |||
import org.apache.poi.POIDataSamples; | |||
import org.apache.poi.extractor.POITextExtractor; | |||
import org.apache.poi.extractor.OLE2ExtractorFactory; | |||
import org.apache.poi.poifs.filesystem.POIFSFileSystem; | |||
import org.junit.Test; | |||
import static org.junit.Assert.assertNotNull; | |||
import java.io.IOException; | |||
import java.io.InputStream; | |||
import static org.junit.Assert.assertNotNull; | |||
import org.apache.poi.POIDataSamples; | |||
import org.apache.poi.extractor.ExtractorFactory; | |||
import org.apache.poi.extractor.POITextExtractor; | |||
import org.apache.poi.poifs.filesystem.POIFSFileSystem; | |||
import org.junit.Test; | |||
/** | |||
* Tests for bugs with the WordExtractor | |||
@@ -61,7 +61,7 @@ public final class TestWordExtractorBugs { | |||
@Test | |||
public void testBug60374() throws Exception { | |||
POIFSFileSystem fs = new POIFSFileSystem(SAMPLES.openResourceAsStream("cn.orthodox.www_divenbog_APRIL_30-APRIL.DOC")); | |||
final POITextExtractor extractor = OLE2ExtractorFactory.createExtractor(fs); | |||
final POITextExtractor extractor = ExtractorFactory.createExtractor(fs); | |||
// Check it gives text without error | |||
assertNotNull(extractor.getText()); |
@@ -25,7 +25,7 @@ import java.io.IOException; | |||
import java.io.InputStream; | |||
import org.apache.poi.POIDataSamples; | |||
import org.apache.poi.hpsf.*; | |||
import org.apache.poi.hpsf.Thumbnail; | |||
import org.apache.poi.hssf.HSSFTestDataSamples; | |||
import org.apache.poi.hssf.extractor.ExcelExtractor; | |||
import org.apache.poi.hssf.usermodel.HSSFWorkbook; | |||
@@ -101,42 +101,31 @@ public final class TestHPSFPropertiesExtractor { | |||
@Test | |||
public void testConstructors() throws IOException { | |||
POIFSFileSystem fs; | |||
HSSFWorkbook wb; | |||
try { | |||
fs = new POIFSFileSystem(_samples.openResourceAsStream("TestUnicode.xls")); | |||
wb = new HSSFWorkbook(fs); | |||
} catch (IOException e) { | |||
throw new RuntimeException(e); | |||
} | |||
ExcelExtractor excelExt = new ExcelExtractor(wb); | |||
final String fsText; | |||
HPSFPropertiesExtractor fsExt = new HPSFPropertiesExtractor(fs); | |||
fsExt.setFilesystem(null); // Don't close re-used test resources! | |||
try { | |||
fsText = fsExt.getText(); | |||
} finally { | |||
fsExt.close(); | |||
} | |||
final String hwText; | |||
HPSFPropertiesExtractor hwExt = new HPSFPropertiesExtractor(wb); | |||
hwExt.setFilesystem(null); // Don't close re-used test resources! | |||
try { | |||
hwText = hwExt.getText(); | |||
} finally { | |||
hwExt.close(); | |||
} | |||
final String eeText; | |||
HPSFPropertiesExtractor eeExt = new HPSFPropertiesExtractor(excelExt); | |||
eeExt.setFilesystem(null); // Don't close re-used test resources! | |||
try { | |||
eeText = eeExt.getText(); | |||
} finally { | |||
eeExt.close(); | |||
wb.close(); | |||
try (POIFSFileSystem fs = new POIFSFileSystem(_samples.openResourceAsStream("TestUnicode.xls")); | |||
HSSFWorkbook wb = new HSSFWorkbook(fs); | |||
ExcelExtractor excelExt = new ExcelExtractor(wb)) { | |||
try (HPSFPropertiesExtractor fsExt = new HPSFPropertiesExtractor(fs)) { | |||
// Don't close re-used test resources! | |||
fsExt.setCloseFilesystem(false); | |||
fsText = fsExt.getText(); | |||
} | |||
try (HPSFPropertiesExtractor hwExt = new HPSFPropertiesExtractor(wb)) { | |||
// Don't close re-used test resources! | |||
hwExt.setCloseFilesystem(false); | |||
hwText = hwExt.getText(); | |||
} | |||
try (HPSFPropertiesExtractor eeExt = new HPSFPropertiesExtractor(excelExt)) { | |||
// Don't close re-used test resources! | |||
eeExt.setCloseFilesystem(false); | |||
eeText = eeExt.getText(); | |||
} | |||
} | |||
assertEquals(fsText, hwText); |
@@ -43,9 +43,7 @@ public final class TestExcelExtractor { | |||
private static ExcelExtractor createExtractor(String sampleFileName) throws IOException { | |||
File file = HSSFTestDataSamples.getSampleFile(sampleFileName); | |||
POIFSFileSystem fs = new POIFSFileSystem(file); | |||
ExcelExtractor extractor = new ExcelExtractor(fs); | |||
extractor.setFilesystem(fs); | |||
return extractor; | |||
return new ExcelExtractor(fs); | |||
} | |||
@Test | |||
@@ -223,16 +221,16 @@ public final class TestExcelExtractor { | |||
extractor.setIncludeBlankCells(false); | |||
extractor.setIncludeSheetNames(false); | |||
String text = extractor.getText(); | |||
// Note - not all the formats in the file | |||
// actually quite match what they claim to | |||
// be, as some are auto-local builtins... | |||
assertStartsWith(text, "Dates, all 24th November 2006\n"); | |||
assertContains(text, "yyyy/mm/dd\t2006/11/24\n"); | |||
assertContains(text, "yyyy-mm-dd\t2006-11-24\n"); | |||
assertContains(text, "dd-mm-yy\t24-11-06\n"); | |||
assertContains(text, "nn.nn\t10.52\n"); | |||
assertContains(text, "nn.nnn\t10.520\n"); | |||
assertContains(text, "\u00a3nn.nn\t\u00a310.52\n"); | |||
@@ -247,7 +245,7 @@ public final class TestExcelExtractor { | |||
@Test | |||
public void testWithEmbeded() throws Exception { | |||
POIFSFileSystem fs = null; | |||
HSSFWorkbook wbA = null, wbB = null; | |||
ExcelExtractor exA = null, exB = null; | |||
@@ -257,7 +255,7 @@ public final class TestExcelExtractor { | |||
DirectoryNode objPool = (DirectoryNode) fs.getRoot().getEntry("ObjectPool"); | |||
DirectoryNode dirA = (DirectoryNode) objPool.getEntry("_1269427460"); | |||
DirectoryNode dirB = (DirectoryNode) objPool.getEntry("_1269427461"); | |||
wbA = new HSSFWorkbook(dirA, fs, true); | |||
exA = new ExcelExtractor(wbA); | |||
wbB = new HSSFWorkbook(dirB, fs, true); | |||
@@ -299,10 +297,10 @@ public final class TestExcelExtractor { | |||
exB = new ExcelExtractor(wbB); | |||
assertEquals("Sheet1\nTest excel file\nThis is the first file\nSheet2\nSheet3\n", exA.getText()); | |||
assertEquals("Sample Excel", exA.getSummaryInformation().getTitle()); | |||
assertEquals("Sheet1\nAnother excel file\nThis is the second file\nSheet2\nSheet3\n", exB.getText()); | |||
assertEquals("Sample Excel 2", exB.getSummaryInformation().getTitle()); | |||
// And the base file too | |||
ex = new ExcelExtractor(fs); | |||
assertEquals("Sheet1\nI have lots of embeded files in me\nSheet2\nSheet3\n", ex.getText()); |