diff options
author | Andreas Beeker <kiwiwings@apache.org> | 2020-08-13 21:08:24 +0000 |
---|---|---|
committer | Andreas Beeker <kiwiwings@apache.org> | 2020-08-13 21:08:24 +0000 |
commit | dfdf9e6d6f470b82ad2a6b77e3059dd0df23905b (patch) | |
tree | cf5cdd45adcf98e078beb6e3841cfc7a502a7dd6 /src/scratchpad | |
parent | 4bf968d6bd96d51347380f3127e64dbb525c664d (diff) | |
download | poi-dfdf9e6d6f470b82ad2a6b77e3059dd0df23905b.tar.gz poi-dfdf9e6d6f470b82ad2a6b77e3059dd0df23905b.zip |
#64411 - Provide JigSaw modules
- rework extractors - see bugzilla entry for more information
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1880839 13f79535-47bb-0310-9956-ffa450edef68
Diffstat (limited to 'src/scratchpad')
14 files changed, 486 insertions, 722 deletions
diff --git a/src/scratchpad/src/org/apache/poi/extractor/ole2/OLE2ScratchpadExtractorFactory.java b/src/scratchpad/src/org/apache/poi/extractor/ole2/OLE2ScratchpadExtractorFactory.java index 73d9f74844..16711d0b11 100644 --- a/src/scratchpad/src/org/apache/poi/extractor/ole2/OLE2ScratchpadExtractorFactory.java +++ b/src/scratchpad/src/org/apache/poi/extractor/ole2/OLE2ScratchpadExtractorFactory.java @@ -17,44 +17,66 @@ package org.apache.poi.extractor.ole2; import java.io.ByteArrayInputStream; +import java.io.File; import java.io.FileNotFoundException; import java.io.IOException; import java.io.InputStream; -import java.util.Iterator; import java.util.List; +import java.util.stream.StreamSupport; +import org.apache.poi.extractor.ExtractorFactory; +import org.apache.poi.extractor.ExtractorProvider; import org.apache.poi.extractor.POIOLE2TextExtractor; import org.apache.poi.extractor.POITextExtractor; -import org.apache.poi.extractor.OLE2ExtractorFactory; import org.apache.poi.hdgf.extractor.VisioTextExtractor; import org.apache.poi.hpbf.extractor.PublisherTextExtractor; +import org.apache.poi.hslf.usermodel.HSLFShape; import org.apache.poi.hslf.usermodel.HSLFSlideShow; +import org.apache.poi.hslf.usermodel.HSLFTextParagraph; import org.apache.poi.hsmf.MAPIMessage; import org.apache.poi.hsmf.datatypes.AttachmentChunks; import org.apache.poi.hsmf.extractor.OutlookTextExtractor; import org.apache.poi.hssf.extractor.ExcelExtractor; +import org.apache.poi.hssf.record.crypto.Biff8EncryptionKey; import org.apache.poi.hwpf.OldWordFileFormatException; import org.apache.poi.hwpf.extractor.Word6Extractor; import org.apache.poi.hwpf.extractor.WordExtractor; import org.apache.poi.poifs.filesystem.DirectoryEntry; import org.apache.poi.poifs.filesystem.DirectoryNode; import org.apache.poi.poifs.filesystem.Entry; +import org.apache.poi.poifs.filesystem.FileMagic; +import org.apache.poi.poifs.filesystem.POIFSFileSystem; import org.apache.poi.sl.extractor.SlideShowExtractor; import org.apache.poi.sl.usermodel.SlideShowFactory; import org.apache.poi.util.POILogFactory; import org.apache.poi.util.POILogger; /** - * Scratchpad-specific logic for {@link OLE2ExtractorFactory} and + * Scratchpad-specific logic for {@link ExtractorFactory} and * {@link org.apache.poi.extractor.ExtractorFactory}, which permit the other two to run with * no Scratchpad jar (though without functionality!) * <p>Note - should not be used standalone, always use via the other * two classes</p> */ @SuppressWarnings("WeakerAccess") -public class OLE2ScratchpadExtractorFactory { +public class OLE2ScratchpadExtractorFactory implements ExtractorProvider { private static final POILogger logger = POILogFactory.getLogger(OLE2ScratchpadExtractorFactory.class); + @Override + public boolean accepts(FileMagic fm) { + return FileMagic.OLE2 == fm; + } + + @Override + public POITextExtractor create(File file, String password) throws IOException { + return create(new POIFSFileSystem(file, true).getRoot(), password); + } + + @Override + public POITextExtractor create(InputStream inputStream, String password) throws IOException { + return create(new POIFSFileSystem(inputStream).getRoot(), password); + } + /** * Look for certain entries in the stream, to figure it * out what format is desired @@ -66,48 +88,54 @@ public class OLE2ScratchpadExtractorFactory { * * @throws IOException when the format specific extraction fails because of invalid entires */ - public static POITextExtractor createExtractor(DirectoryNode poifsDir) throws IOException { - if (poifsDir.hasEntry("WordDocument")) { - // Old or new style word document? - try { - return new WordExtractor(poifsDir); - } catch (OldWordFileFormatException e) { - return new Word6Extractor(poifsDir); + public POITextExtractor create(DirectoryNode poifsDir, String password) throws IOException { + final String oldPW = Biff8EncryptionKey.getCurrentUserPassword(); + try { + Biff8EncryptionKey.setCurrentUserPassword(password); + if (poifsDir.hasEntry("WordDocument")) { + // Old or new style word document? + try { + return new WordExtractor(poifsDir); + } catch (OldWordFileFormatException e) { + return new Word6Extractor(poifsDir); + } } - } - if (poifsDir.hasEntry(HSLFSlideShow.POWERPOINT_DOCUMENT)) { - return new SlideShowExtractor(SlideShowFactory.create(poifsDir)); - } + if (poifsDir.hasEntry(HSLFSlideShow.POWERPOINT_DOCUMENT)) { + return new SlideShowExtractor<HSLFShape, HSLFTextParagraph>(SlideShowFactory.create(poifsDir)); + } - if (poifsDir.hasEntry("VisioDocument")) { - return new VisioTextExtractor(poifsDir); - } + if (poifsDir.hasEntry("VisioDocument")) { + return new VisioTextExtractor(poifsDir); + } - if (poifsDir.hasEntry("Quill")) { - return new PublisherTextExtractor(poifsDir); - } + if (poifsDir.hasEntry("Quill")) { + return new PublisherTextExtractor(poifsDir); + } - final String[] outlookEntryNames = new String[] { - // message bodies, saved as plain text (PtypString) - // The first short (0x1000, 0x0047, 0x0037) refer to the Property ID (see [MS-OXPROPS].pdf) - // the second short (0x001e, 0x001f, 0x0102) refer to the type of data stored in this entry - // https://msdn.microsoft.com/endatatypes.Ex-us/library/cc433490(v=exchg.80).aspx - // @see org.apache.poi.hsmf.Types.MAPIType - "__substg1.0_1000001E", //PidTagBody ASCII - "__substg1.0_1000001F", //PidTagBody Unicode - "__substg1.0_0047001E", //PidTagMessageSubmissionId ASCII - "__substg1.0_0047001F", //PidTagMessageSubmissionId Unicode - "__substg1.0_0037001E", //PidTagSubject ASCII - "__substg1.0_0037001F", //PidTagSubject Unicode - }; - for (String entryName : outlookEntryNames) { - if (poifsDir.hasEntry(entryName)) { - return new OutlookTextExtractor(poifsDir); + final String[] outlookEntryNames = new String[]{ + // message bodies, saved as plain text (PtypString) + // The first short (0x1000, 0x0047, 0x0037) refer to the Property ID (see [MS-OXPROPS].pdf) + // the second short (0x001e, 0x001f, 0x0102) refer to the type of data stored in this entry + // https://msdn.microsoft.com/endatatypes.Ex-us/library/cc433490(v=exchg.80).aspx + // @see org.apache.poi.hsmf.Types.MAPIType + "__substg1.0_1000001E", //PidTagBody ASCII + "__substg1.0_1000001F", //PidTagBody Unicode + "__substg1.0_0047001E", //PidTagMessageSubmissionId ASCII + "__substg1.0_0047001F", //PidTagMessageSubmissionId Unicode + "__substg1.0_0037001E", //PidTagSubject ASCII + "__substg1.0_0037001F", //PidTagSubject Unicode + }; + for (String entryName : outlookEntryNames) { + if (poifsDir.hasEntry(entryName)) { + return new OutlookTextExtractor(poifsDir); + } } + } finally { + Biff8EncryptionKey.setCurrentUserPassword(oldPW); } - throw new IllegalArgumentException("No supported documents found in the OLE2 stream"); + return null; } /** @@ -120,10 +148,9 @@ public class OLE2ScratchpadExtractorFactory { * @param ext the extractor holding the directory to start parsing * @param dirs a list to be filled with directory references holding embedded * @param nonPOIFS a list to be filled with streams which aren't based on POIFS entries - * - * @throws IOException when the format specific extraction fails because of invalid entires */ - public static void identifyEmbeddedResources(POIOLE2TextExtractor ext, List<Entry> dirs, List<InputStream> nonPOIFS) throws IOException { + @Override + public void identifyEmbeddedResources(POIOLE2TextExtractor ext, List<Entry> dirs, List<InputStream> nonPOIFS) { // Find all the embedded directories DirectoryEntry root = ext.getRoot(); if (root == null) { @@ -132,25 +159,16 @@ public class OLE2ScratchpadExtractorFactory { if (ext instanceof ExcelExtractor) { // These are in MBD... under the root - Iterator<Entry> it = root.getEntries(); - while (it.hasNext()) { - Entry entry = it.next(); - if (entry.getName().startsWith("MBD")) { - dirs.add(entry); - } - } + StreamSupport.stream(root.spliterator(), false) + .filter(entry -> entry.getName().startsWith("MBD")) + .forEach(dirs::add); } else if (ext instanceof WordExtractor) { // These are in ObjectPool -> _... under the root try { - DirectoryEntry op = (DirectoryEntry) - root.getEntry("ObjectPool"); - Iterator<Entry> it = op.getEntries(); - while(it.hasNext()) { - Entry entry = it.next(); - if(entry.getName().startsWith("_")) { - dirs.add(entry); - } - } + DirectoryEntry op = (DirectoryEntry) root.getEntry("ObjectPool"); + StreamSupport.stream(op.spliterator(), false) + .filter(entry -> entry.getName().startsWith("_")) + .forEach(dirs::add); } catch(FileNotFoundException e) { logger.log(POILogger.INFO, "Ignoring FileNotFoundException while extracting Word document", e.getLocalizedMessage()); // ignored here diff --git a/src/scratchpad/src/org/apache/poi/hdgf/extractor/VisioTextExtractor.java b/src/scratchpad/src/org/apache/poi/hdgf/extractor/VisioTextExtractor.java index d21a7e5d6a..570eaacf4c 100644 --- a/src/scratchpad/src/org/apache/poi/hdgf/extractor/VisioTextExtractor.java +++ b/src/scratchpad/src/org/apache/poi/hdgf/extractor/VisioTextExtractor.java @@ -17,7 +17,6 @@ package org.apache.poi.hdgf.extractor; -import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; import java.util.ArrayList; @@ -38,11 +37,11 @@ import org.apache.poi.poifs.filesystem.POIFSFileSystem; * Can operate on the command line (outputs to stdout), or * can return the text for you (example: for use with Lucene). */ -public final class VisioTextExtractor extends POIOLE2TextExtractor { +public final class VisioTextExtractor implements POIOLE2TextExtractor { private HDGFDiagram hdgf; + private boolean doCloseFilesystem = true; public VisioTextExtractor(HDGFDiagram hdgf) { - super(hdgf); this.hdgf = hdgf; } public VisioTextExtractor(POIFSFileSystem fs) throws IOException { @@ -91,9 +90,7 @@ public final class VisioTextExtractor extends POIOLE2TextExtractor { // Capture the text, as long as it isn't // simply an empty string String str = cmd.getValue().toString(); - if(str.isEmpty() || "\n".equals(str)) { - // Ignore empty strings - } else { + if (!(str.isEmpty() || "\n".equals(str))) { text.add( str ); } } @@ -121,21 +118,23 @@ public final class VisioTextExtractor extends POIOLE2TextExtractor { return text.toString(); } - public static void main(String[] args) throws Exception { - if(args.length == 0) { - System.err.println("Use:"); - System.err.println(" VisioTextExtractor <file.vsd>"); - System.exit(1); - } + @Override + public HDGFDiagram getDocument() { + return hdgf; + } - try (FileInputStream fis = new FileInputStream(args[0])) { - VisioTextExtractor extractor = - new VisioTextExtractor(fis); + @Override + public void setCloseFilesystem(boolean doCloseFilesystem) { + this.doCloseFilesystem = doCloseFilesystem; + } - // Print not PrintLn as already has \n added to it - System.out.print(extractor.getText()); + @Override + public boolean isCloseFilesystem() { + return doCloseFilesystem; + } - extractor.close(); - } + @Override + public HDGFDiagram getFilesystem() { + return hdgf; } } diff --git a/src/scratchpad/src/org/apache/poi/hpbf/extractor/PublisherTextExtractor.java b/src/scratchpad/src/org/apache/poi/hpbf/extractor/PublisherTextExtractor.java index bd442b8da6..ac7ed74153 100644 --- a/src/scratchpad/src/org/apache/poi/hpbf/extractor/PublisherTextExtractor.java +++ b/src/scratchpad/src/org/apache/poi/hpbf/extractor/PublisherTextExtractor.java @@ -17,35 +17,37 @@ package org.apache.poi.hpbf.extractor; -import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; import org.apache.poi.extractor.POIOLE2TextExtractor; import org.apache.poi.hpbf.HPBFDocument; import org.apache.poi.hpbf.model.qcbits.QCBit; -import org.apache.poi.hpbf.model.qcbits.QCTextBit; import org.apache.poi.hpbf.model.qcbits.QCPLCBit.Type12; +import org.apache.poi.hpbf.model.qcbits.QCTextBit; import org.apache.poi.poifs.filesystem.DirectoryNode; import org.apache.poi.poifs.filesystem.POIFSFileSystem; /** * Extract text from HPBF Publisher files */ -public final class PublisherTextExtractor extends POIOLE2TextExtractor { - private HPBFDocument doc; +public final class PublisherTextExtractor implements POIOLE2TextExtractor { + private final HPBFDocument doc; private boolean hyperlinksByDefault; + private boolean doCloseFilesystem = true; public PublisherTextExtractor(HPBFDocument doc) { - super(doc); this.doc = doc; } + public PublisherTextExtractor(DirectoryNode dir) throws IOException { this(new HPBFDocument(dir)); } + public PublisherTextExtractor(POIFSFileSystem fs) throws IOException { this(new HPBFDocument(fs)); } + public PublisherTextExtractor(InputStream is) throws IOException { this(new POIFSFileSystem(is)); } @@ -66,7 +68,7 @@ public final class PublisherTextExtractor extends POIOLE2TextExtractor { // Get the text from the Quill Contents QCBit[] bits = doc.getQuillContents().getBits(); for (QCBit bit1 : bits) { - if (bit1 != null && bit1 instanceof QCTextBit) { + if (bit1 instanceof QCTextBit) { QCTextBit t = (QCTextBit) bit1; text.append(t.getText().replace('\r', '\n')); } @@ -79,7 +81,7 @@ public final class PublisherTextExtractor extends POIOLE2TextExtractor { // how to tie that together. if(hyperlinksByDefault) { for (QCBit bit : bits) { - if (bit != null && bit instanceof Type12) { + if (bit instanceof Type12) { Type12 hyperlinks = (Type12) bit; for (int j = 0; j < hyperlinks.getNumberOfHyperlinks(); j++) { text.append("<"); @@ -96,19 +98,23 @@ public final class PublisherTextExtractor extends POIOLE2TextExtractor { return text.toString(); } + @Override + public HPBFDocument getDocument() { + return doc; + } + + @Override + public void setCloseFilesystem(boolean doCloseFilesystem) { + this.doCloseFilesystem = doCloseFilesystem; + } - public static void main(String[] args) throws Exception { - if(args.length == 0) { - System.err.println("Use:"); - System.err.println(" PublisherTextExtractor <file.pub>"); - } + @Override + public boolean isCloseFilesystem() { + return doCloseFilesystem; + } - for (String arg : args) { - try (FileInputStream fis = new FileInputStream(arg)) { - PublisherTextExtractor te = new PublisherTextExtractor(fis); - System.out.println(te.getText()); - te.close(); - } - } + @Override + public HPBFDocument getFilesystem() { + return doc; } } diff --git a/src/scratchpad/src/org/apache/poi/hslf/extractor/PowerPointExtractor.java b/src/scratchpad/src/org/apache/poi/hslf/extractor/PowerPointExtractor.java deleted file mode 100644 index 650f809253..0000000000 --- a/src/scratchpad/src/org/apache/poi/hslf/extractor/PowerPointExtractor.java +++ /dev/null @@ -1,279 +0,0 @@ -/* ==================================================================== - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -==================================================================== */ - -package org.apache.poi.hslf.extractor; - -import java.io.File; -import java.io.IOException; -import java.io.InputStream; -import java.util.List; - -import org.apache.poi.EncryptedDocumentException; -import org.apache.poi.extractor.POIOLE2TextExtractor; -import org.apache.poi.hslf.usermodel.HSLFObjectShape; -import org.apache.poi.hslf.usermodel.HSLFShape; -import org.apache.poi.hslf.usermodel.HSLFSlideShow; -import org.apache.poi.hslf.usermodel.HSLFSlideShowImpl; -import org.apache.poi.hslf.usermodel.HSLFTextParagraph; -import org.apache.poi.hssf.record.crypto.Biff8EncryptionKey; -import org.apache.poi.poifs.filesystem.DirectoryNode; -import org.apache.poi.poifs.filesystem.POIFSFileSystem; -import org.apache.poi.sl.extractor.SlideShowExtractor; -import org.apache.poi.sl.usermodel.SlideShow; -import org.apache.poi.sl.usermodel.SlideShowFactory; -import org.apache.poi.util.Removal; - -/** - * This class can be used to extract text from a PowerPoint file. Can optionally - * also get the notes from one. - * - * @deprecated in POI 4.0.0, use {@link SlideShowExtractor} instead - */ -@SuppressWarnings("WeakerAccess") -@Deprecated -@Removal(version="5.0.0") -public final class PowerPointExtractor extends POIOLE2TextExtractor { - private final SlideShowExtractor<HSLFShape,HSLFTextParagraph> delegate; - - private boolean slidesByDefault = true; - private boolean notesByDefault; - private boolean commentsByDefault; - private boolean masterByDefault; - - /** - * Basic extractor. Returns all the text, and optionally all the notes - */ - public static void main(String[] args) throws IOException { - if (args.length < 1) { - System.err.println("Usage:"); - System.err.println("\tPowerPointExtractor [-notes] <file>"); - System.exit(1); - } - - boolean notes = false; - boolean comments = false; - boolean master = true; - - String file; - if (args.length > 1) { - notes = true; - file = args[1]; - if (args.length > 2) { - comments = true; - } - } else { - file = args[0]; - } - - try (PowerPointExtractor ppe = new PowerPointExtractor(file)) { - System.out.println(ppe.getText(true, notes, comments, master)); - } - } - - public PowerPointExtractor(final HSLFSlideShow slideShow) { - super(slideShow.getSlideShowImpl()); - setFilesystem(slideShow); - delegate = new SlideShowExtractor<>(slideShow); - } - - /** - * Creates a PowerPointExtractor, from a file - * - * @param fileName The name of the file to extract from - */ - public PowerPointExtractor(String fileName) throws IOException { - this(createHSLF(new File(fileName), Biff8EncryptionKey.getCurrentUserPassword(), true)); - } - - /** - * Creates a PowerPointExtractor, from an Input Stream - * - * @param iStream The input stream containing the PowerPoint document - */ - public PowerPointExtractor(InputStream iStream) throws IOException { - this(createHSLF(iStream, Biff8EncryptionKey.getCurrentUserPassword())); - } - - /** - * Creates a PowerPointExtractor, from an open POIFSFileSystem - * - * @param fs the POIFSFileSystem containing the PowerPoint document - */ - public PowerPointExtractor(POIFSFileSystem fs) throws IOException { - this(createHSLF(fs, Biff8EncryptionKey.getCurrentUserPassword())); - } - - /** - * Creates a PowerPointExtractor, from a specific place - * inside an open {@link POIFSFileSystem} - * - * @param dir the POIFS Directory containing the PowerPoint document - */ - public PowerPointExtractor(DirectoryNode dir) throws IOException { - this(new HSLFSlideShow(dir)); - } - - /** - * Creates a PowerPointExtractor, from a HSLFSlideShow - * - * @param ss the HSLFSlideShow to extract text from - */ - public PowerPointExtractor(HSLFSlideShowImpl ss) { - this(new HSLFSlideShow(ss)); - } - - /** - * Should a call to getText() return slide text? Default is yes - */ - public void setSlidesByDefault(final boolean slidesByDefault) { - this.slidesByDefault = slidesByDefault; - delegate.setSlidesByDefault(slidesByDefault); - } - - /** - * Should a call to getText() return notes text? Default is no - */ - public void setNotesByDefault(final boolean notesByDefault) { - this.notesByDefault = notesByDefault; - delegate.setNotesByDefault(notesByDefault); - } - - /** - * Should a call to getText() return comments text? Default is no - */ - public void setCommentsByDefault(final boolean commentsByDefault) { - this.commentsByDefault = commentsByDefault; - delegate.setCommentsByDefault(commentsByDefault); - } - - /** - * Should a call to getText() return text from master? Default is no - */ - public void setMasterByDefault(final boolean masterByDefault) { - this.masterByDefault = masterByDefault; - delegate.setMasterByDefault(masterByDefault); - } - - /** - * Fetches all the slide text from the slideshow, but not the notes, unless - * you've called setSlidesByDefault() and setNotesByDefault() to change this - */ - @Override - public String getText() { - return delegate.getText(); - } - - /** - * Fetches text from the slideshow, be it slide text or note text. Because - * the final block of text in a TextRun normally have their last \n - * stripped, we add it back - * - * @param getSlideText fetch slide text - * @param getNoteText fetch note text - */ - public String getText(boolean getSlideText, boolean getNoteText) { - return getText(getSlideText,getNoteText,commentsByDefault,masterByDefault); - } - - public String getText(boolean getSlideText, boolean getNoteText, boolean getCommentText, boolean getMasterText) { - delegate.setSlidesByDefault(getSlideText); - delegate.setNotesByDefault(getNoteText); - delegate.setCommentsByDefault(getCommentText); - delegate.setMasterByDefault(getMasterText); - try { - return delegate.getText(); - } finally { - delegate.setSlidesByDefault(slidesByDefault); - delegate.setNotesByDefault(notesByDefault); - delegate.setCommentsByDefault(commentsByDefault); - delegate.setMasterByDefault(masterByDefault); - } - } - - /** - * Fetches all the notes text from the slideshow, but not the slide text - */ - public String getNotes() { - return getText(false, true, false, false); - } - - @SuppressWarnings("unchecked") - public List<HSLFObjectShape> getOLEShapes() { - return (List<HSLFObjectShape>)delegate.getOLEShapes(); - } - - /** - * Helper method to avoid problems with compiling code in Eclipse - * - * Eclipse javac has some bugs with complex casts, this method tries - * to work around this. - * - * @param fs The {@link POIFSFileSystem} to read the document from - * @param password The password that should be used or null if no password is necessary. - * - * @return The created SlideShow - * - * @throws IOException if an error occurs while reading the data - */ - private static HSLFSlideShow createHSLF(POIFSFileSystem fs, String password) throws IOException, EncryptedDocumentException { - // Note: don't change the code here, it is required for Eclipse to compile the code - SlideShow slideShowOrig = SlideShowFactory.create(fs, password); - return (HSLFSlideShow)slideShowOrig; - } - - /** - * Helper method to avoid problems with compiling code in Eclipse - * - * Eclipse javac has some bugs with complex casts, this method tries - * to work around this. - * - * @param inp The {@link InputStream} to read data from. - * @param password The password that should be used or null if no password is necessary. - * - * @return The created SlideShow - * - * @throws IOException if an error occurs while reading the data - * @throws EncryptedDocumentException If the wrong password is given for a protected file - */ - private static HSLFSlideShow createHSLF(InputStream inp, String password) throws IOException, EncryptedDocumentException { - // Note: don't change the code here, it is required for Eclipse to compile the code - SlideShow slideShowOrig = SlideShowFactory.create(inp, password); - return (HSLFSlideShow)slideShowOrig; - } - - /** - * Helper method to avoid problems with compiling code in Eclipse - * - * Eclipse javac has some bugs with complex casts, this method tries - * to work around this. - * - * @param file The file to read data from. - * @param password The password that should be used or null if no password is necessary. - * @param readOnly If the SlideShow should be opened in read-only mode to avoid writing back - * changes when the document is closed. - * - * @return The created SlideShow - * - * @throws IOException if an error occurs while reading the data - * @throws EncryptedDocumentException If the wrong password is given for a protected file - */ - private static HSLFSlideShow createHSLF(File file, String password, boolean readOnly) throws IOException, EncryptedDocumentException { - // Note: don't change the code here, it is required for Eclipse to compile the code - SlideShow slideShowOrig = SlideShowFactory.create(file, password, readOnly); - return (HSLFSlideShow)slideShowOrig; - } -} diff --git a/src/scratchpad/src/org/apache/poi/hslf/usermodel/HSLFSlideShow.java b/src/scratchpad/src/org/apache/poi/hslf/usermodel/HSLFSlideShow.java index 150326b6d0..8370f6c282 100644 --- a/src/scratchpad/src/org/apache/poi/hslf/usermodel/HSLFSlideShow.java +++ b/src/scratchpad/src/org/apache/poi/hslf/usermodel/HSLFSlideShow.java @@ -33,6 +33,7 @@ import java.util.List; import java.util.Map; import java.util.function.Supplier; +import org.apache.poi.POIDocument; import org.apache.poi.common.usermodel.GenericRecord; import org.apache.poi.common.usermodel.fonts.FontInfo; import org.apache.poi.ddf.EscherBSERecord; @@ -40,6 +41,9 @@ import org.apache.poi.ddf.EscherContainerRecord; import org.apache.poi.ddf.EscherOptRecord; import org.apache.poi.hpsf.ClassID; import org.apache.poi.hpsf.ClassIDPredefined; +import org.apache.poi.hpsf.DocumentSummaryInformation; +import org.apache.poi.hpsf.PropertySet; +import org.apache.poi.hpsf.SummaryInformation; import org.apache.poi.hpsf.extractor.HPSFPropertiesExtractor; import org.apache.poi.hslf.exceptions.CorruptPowerPointFileException; import org.apache.poi.hslf.exceptions.HSLFException; @@ -47,6 +51,7 @@ import org.apache.poi.hslf.model.HeadersFooters; import org.apache.poi.hslf.model.MovieShape; import org.apache.poi.hslf.record.*; import org.apache.poi.hslf.record.SlideListWithText.SlideAtomsSet; +import org.apache.poi.poifs.crypt.EncryptionInfo; import org.apache.poi.poifs.filesystem.DirectoryNode; import org.apache.poi.poifs.filesystem.Ole10Native; import org.apache.poi.poifs.filesystem.POIFSFileSystem; @@ -66,7 +71,7 @@ import org.apache.poi.util.Units; * TODO: - figure out how to match notes to their correct sheet (will involve * understanding DocSlideList and DocNotesList) - handle Slide creation cleaner */ -public final class HSLFSlideShow implements SlideShow<HSLFShape,HSLFTextParagraph>, Closeable, GenericRecord { +public final class HSLFSlideShow extends POIDocument implements SlideShow<HSLFShape,HSLFTextParagraph>, Closeable, GenericRecord { //arbitrarily selected; may need to increase private static final int MAX_RECORD_LENGTH = 10_000_000; @@ -111,6 +116,8 @@ public final class HSLFSlideShow implements SlideShow<HSLFShape,HSLFTextParagrap * @param hslfSlideShow the HSLFSlideShow to base on */ public HSLFSlideShow(HSLFSlideShowImpl hslfSlideShow) { + super(hslfSlideShow.getDirectory()); + loadSavePhase.set(LoadSavePhase.INIT); // Get useful things from our base slideshow @@ -1080,7 +1087,7 @@ public final class HSLFSlideShow implements SlideShow<HSLFShape,HSLFTextParagrap public HPSFPropertiesExtractor getMetadataTextExtractor() { return new HPSFPropertiesExtractor(getSlideShowImpl()); } - + int addToObjListAtom(RecordContainer exObj) { ExObjList lst = getDocumentRecord().getExObjList(true); ExObjListAtom objAtom = lst.getExObjListAtom(); @@ -1097,7 +1104,7 @@ public final class HSLFSlideShow implements SlideShow<HSLFShape,HSLFTextParagrap Map<String,ClassID> olemap = new HashMap<>(); olemap.put(POWERPOINT_DOCUMENT, ClassIDPredefined.POWERPOINT_V8.getClassID()); // as per BIFF8 spec - olemap.put("Workbook", ClassIDPredefined.EXCEL_V8.getClassID()); + olemap.put("Workbook", ClassIDPredefined.EXCEL_V8.getClassID()); // Typically from third party programs olemap.put("WORKBOOK", ClassIDPredefined.EXCEL_V8.getClassID()); // Typically odd Crystal Reports exports @@ -1179,4 +1186,94 @@ public final class HSLFSlideShow implements SlideShow<HSLFShape,HSLFTextParagrap public List<? extends GenericRecord> getGenericChildren() { return Arrays.asList(_hslfSlideShow.getRecords()); } + + @Override + public void write() throws IOException { + getSlideShowImpl().write(); + } + + @Override + public void write(File newFile) throws IOException { + getSlideShowImpl().write(newFile); + } + + @Override + public DocumentSummaryInformation getDocumentSummaryInformation() { + return getSlideShowImpl().getDocumentSummaryInformation(); + } + + @Override + public SummaryInformation getSummaryInformation() { + return getSlideShowImpl().getSummaryInformation(); + } + + @Override + public void createInformationProperties() { + getSlideShowImpl().createInformationProperties(); + } + + @Override + public void readProperties() { + getSlideShowImpl().readProperties(); + } + + @Override + protected PropertySet getPropertySet(String setName) throws IOException { + return getSlideShowImpl().getPropertySetImpl(setName); + } + + @Override + protected PropertySet getPropertySet(String setName, EncryptionInfo encryptionInfo) throws IOException { + return getSlideShowImpl().getPropertySetImpl(setName, encryptionInfo); + } + + @Override + protected void writeProperties() throws IOException { + getSlideShowImpl().writePropertiesImpl(); + } + + @Override + public void writeProperties(POIFSFileSystem outFS) throws IOException { + getSlideShowImpl().writeProperties(outFS); + } + + @Override + protected void writeProperties(POIFSFileSystem outFS, List<String> writtenEntries) throws IOException { + getSlideShowImpl().writePropertiesImpl(outFS, writtenEntries); + } + + @Override + protected void validateInPlaceWritePossible() throws IllegalStateException { + getSlideShowImpl().validateInPlaceWritePossibleImpl(); + } + + @Override + public DirectoryNode getDirectory() { + return getSlideShowImpl().getDirectory(); + } + + @Override + protected void clearDirectory() { + getSlideShowImpl().clearDirectoryImpl(); + } + + @Override + protected boolean initDirectory() { + return getSlideShowImpl().initDirectoryImpl(); + } + + @Override + protected void replaceDirectory(DirectoryNode newDirectory) { + getSlideShowImpl().replaceDirectoryImpl(newDirectory); + } + + @Override + protected String getEncryptedPropertyStreamName() { + return getSlideShowImpl().getEncryptedPropertyStreamName(); + } + + @Override + public EncryptionInfo getEncryptionInfo() throws IOException { + return getSlideShowImpl().getEncryptionInfo(); + } } diff --git a/src/scratchpad/src/org/apache/poi/hslf/usermodel/HSLFSlideShowImpl.java b/src/scratchpad/src/org/apache/poi/hslf/usermodel/HSLFSlideShowImpl.java index 6f1c633ea3..d616180245 100644 --- a/src/scratchpad/src/org/apache/poi/hslf/usermodel/HSLFSlideShowImpl.java +++ b/src/scratchpad/src/org/apache/poi/hslf/usermodel/HSLFSlideShowImpl.java @@ -36,6 +36,7 @@ import java.util.NavigableMap; import java.util.TreeMap; import org.apache.poi.POIDocument; +import org.apache.poi.hpsf.PropertySet; import org.apache.poi.hslf.exceptions.CorruptPowerPointFileException; import org.apache.poi.hslf.exceptions.HSLFException; import org.apache.poi.hslf.exceptions.OldPowerPointFormatException; @@ -714,8 +715,6 @@ public final class HSLFSlideShowImpl extends POIDocument implements Closeable { } - - /* ******************* adding methods follow ********************* */ /** @@ -850,6 +849,38 @@ public final class HSLFSlideShowImpl extends POIDocument implements Closeable { return "EncryptedSummary"; } + void writePropertiesImpl() throws IOException { + super.writeProperties(); + } + + PropertySet getPropertySetImpl(String setName) throws IOException { + return super.getPropertySet(setName); + } + + PropertySet getPropertySetImpl(String setName, EncryptionInfo encryptionInfo) throws IOException { + return super.getPropertySet(setName, encryptionInfo); + } + + void writePropertiesImpl(POIFSFileSystem outFS, List<String> writtenEntries) throws IOException { + super.writeProperties(outFS, writtenEntries); + } + + void validateInPlaceWritePossibleImpl() throws IllegalStateException { + super.validateInPlaceWritePossible(); + } + + void clearDirectoryImpl() { + super.clearDirectory(); + } + + boolean initDirectoryImpl() { + return super.initDirectory(); + } + + void replaceDirectoryImpl(DirectoryNode newDirectory) { + super.replaceDirectory(newDirectory); + } + private static class BufAccessBAOS extends ByteArrayOutputStream { public byte[] getBuf() { return buf; diff --git a/src/scratchpad/src/org/apache/poi/hsmf/extractor/OutlookTextExtactor.java b/src/scratchpad/src/org/apache/poi/hsmf/extractor/OutlookTextExtactor.java deleted file mode 100644 index 09132f639b..0000000000 --- a/src/scratchpad/src/org/apache/poi/hsmf/extractor/OutlookTextExtactor.java +++ /dev/null @@ -1,61 +0,0 @@ -/* ==================================================================== - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -==================================================================== */ -package org.apache.poi.hsmf.extractor; - -import org.apache.poi.hsmf.MAPIMessage; -import org.apache.poi.poifs.filesystem.DirectoryNode; -import org.apache.poi.poifs.filesystem.POIFSFileSystem; -import org.apache.poi.util.Removal; - -import java.io.File; -import java.io.IOException; -import java.io.InputStream; - -/** - * A text extractor for HSMF (Outlook) .msg files. - * Outputs in a format somewhat like a plain text email. - * - * @deprecated use @{link OutlookTextExtractor} instead - */ -@Deprecated -@Removal(version = "5.0.0") -public class OutlookTextExtactor extends OutlookTextExtractor { - public OutlookTextExtactor(MAPIMessage msg) { - super(msg); - } - - public OutlookTextExtactor(DirectoryNode poifsDir) throws IOException { - super(new MAPIMessage(poifsDir)); - } - - public OutlookTextExtactor(POIFSFileSystem fs) throws IOException { - super(new MAPIMessage(fs)); - } - - public OutlookTextExtactor(InputStream inp) throws IOException { - super(new MAPIMessage(inp)); - } - - public static void main(String[] args) throws Exception { - for (String filename : args) { - try (POIFSFileSystem poifs = new POIFSFileSystem(new File(filename)); - OutlookTextExtractor extractor = new OutlookTextExtractor(poifs)) { - System.out.println(extractor.getText()); - } - } - } -} diff --git a/src/scratchpad/src/org/apache/poi/hsmf/extractor/OutlookTextExtractor.java b/src/scratchpad/src/org/apache/poi/hsmf/extractor/OutlookTextExtractor.java index e11f005fe3..a818d03280 100644 --- a/src/scratchpad/src/org/apache/poi/hsmf/extractor/OutlookTextExtractor.java +++ b/src/scratchpad/src/org/apache/poi/hsmf/extractor/OutlookTextExtractor.java @@ -42,9 +42,12 @@ import org.apache.poi.util.LocaleUtil; * * @since 4.1.2 */ -public class OutlookTextExtractor extends POIOLE2TextExtractor { +public class OutlookTextExtractor implements POIOLE2TextExtractor { + private final MAPIMessage msg; + private boolean doCloseFilesystem = true; + public OutlookTextExtractor(MAPIMessage msg) { - super(msg); + this.msg = msg; } public OutlookTextExtractor(DirectoryNode poifsDir) throws IOException { @@ -76,14 +79,13 @@ public class OutlookTextExtractor extends POIOLE2TextExtractor { * Returns the underlying MAPI message */ public MAPIMessage getMAPIMessage() { - return (MAPIMessage) document; + return msg; } /** * Outputs something a little like a RFC822 email */ public String getText() { - MAPIMessage msg = (MAPIMessage) document; StringBuilder s = new StringBuilder(); // See if we can get a suitable encoding for any @@ -201,4 +203,24 @@ public class OutlookTextExtractor extends POIOLE2TextExtractor { } s.append("\n"); } + + @Override + public MAPIMessage getDocument() { + return msg; + } + + @Override + public void setCloseFilesystem(boolean doCloseFilesystem) { + this.doCloseFilesystem = doCloseFilesystem; + } + + @Override + public boolean isCloseFilesystem() { + return doCloseFilesystem; + } + + @Override + public MAPIMessage getFilesystem() { + return msg; + } } diff --git a/src/scratchpad/src/org/apache/poi/hwpf/extractor/Word6Extractor.java b/src/scratchpad/src/org/apache/poi/hwpf/extractor/Word6Extractor.java index 1d509dd1c6..526d21be35 100644 --- a/src/scratchpad/src/org/apache/poi/hwpf/extractor/Word6Extractor.java +++ b/src/scratchpad/src/org/apache/poi/hwpf/extractor/Word6Extractor.java @@ -31,13 +31,14 @@ import org.apache.poi.poifs.filesystem.POIFSFileSystem; * Class to extract the text from old (Word 6 / Word 95) Word Documents. * * This should only be used on the older files, for most uses you - * should call {@link WordExtractor} which deals properly + * should call {@link WordExtractor} which deals properly * with HWPF. * * @author Nick Burch */ -public final class Word6Extractor extends POIOLE2TextExtractor { +public final class Word6Extractor implements POIOLE2TextExtractor { private HWPFOldDocument doc; + private boolean doCloseFilesystem = true; /** * Create a new Word Extractor @@ -49,12 +50,11 @@ public final class Word6Extractor extends POIOLE2TextExtractor { /** * Create a new Word Extractor - * + * * @param fs * POIFSFileSystem containing the word file */ - public Word6Extractor( POIFSFileSystem fs ) throws IOException - { + public Word6Extractor( POIFSFileSystem fs ) throws IOException { this( fs.getRoot() ); } @@ -62,14 +62,11 @@ public final class Word6Extractor extends POIOLE2TextExtractor { * @deprecated Use {@link #Word6Extractor(DirectoryNode)} instead */ @Deprecated - public Word6Extractor( DirectoryNode dir, POIFSFileSystem fs ) - throws IOException - { + public Word6Extractor( DirectoryNode dir, POIFSFileSystem fs ) throws IOException { this( dir ); } - public Word6Extractor( DirectoryNode dir ) throws IOException - { + public Word6Extractor( DirectoryNode dir ) throws IOException { this( new HWPFOldDocument( dir ) ); } @@ -78,7 +75,6 @@ public final class Word6Extractor extends POIOLE2TextExtractor { * @param doc The HWPFOldDocument to extract from */ public Word6Extractor(HWPFOldDocument doc) { - super(doc); this.doc = doc; } @@ -101,7 +97,7 @@ public final class Word6Extractor extends POIOLE2TextExtractor { ret = new String[doc.getTextTable().getTextPieces().size()]; for(int i=0; i<ret.length; i++) { ret[i] = doc.getTextTable().getTextPieces().get(i).getStringBuilder().toString(); - + // Fix the line endings ret[i] = ret[i].replaceAll("\r", "\ufffe"); ret[i] = ret[i].replaceAll("\ufffe","\r\n"); @@ -111,25 +107,40 @@ public final class Word6Extractor extends POIOLE2TextExtractor { return ret; } - public String getText() - { - try - { + public String getText() { + try { WordToTextConverter wordToTextConverter = new WordToTextConverter(); wordToTextConverter.processDocument( doc ); return wordToTextConverter.getText(); - } - catch ( Exception exc ) - { + } catch ( Exception exc ) { // fall-back StringBuilder text = new StringBuilder(); - for ( String t : getParagraphText() ) - { + for ( String t : getParagraphText() ) { text.append( t ); } return text.toString(); } } + + @Override + public HWPFOldDocument getDocument() { + return doc; + } + + @Override + public void setCloseFilesystem(boolean doCloseFilesystem) { + this.doCloseFilesystem = doCloseFilesystem; + } + + @Override + public boolean isCloseFilesystem() { + return doCloseFilesystem; + } + + @Override + public HWPFOldDocument getFilesystem() { + return doc; + } } diff --git a/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java b/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java index b0216a03e3..1cd5d0d654 100644 --- a/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java +++ b/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java @@ -17,7 +17,6 @@ package org.apache.poi.hwpf.extractor; -import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; @@ -39,8 +38,9 @@ import org.apache.poi.poifs.filesystem.POIFSFileSystem; * * @author Nick Burch */ -public final class WordExtractor extends POIOLE2TextExtractor { - private HWPFDocument doc; +public final class WordExtractor implements POIOLE2TextExtractor { + private final HWPFDocument doc; + private boolean doCloseFilesystem = true; /** * Create a new Word Extractor @@ -73,30 +73,10 @@ public final class WordExtractor extends POIOLE2TextExtractor { * The HWPFDocument to extract from */ public WordExtractor( HWPFDocument doc ) { - super( doc ); this.doc = doc; } /** - * Command line extractor, so people will stop moaning that they can't just - * run this. - */ - public static void main( String[] args ) throws IOException { - if ( args.length == 0 ) { - System.err.println( "Use:" ); - System.err - .println( " java org.apache.poi.hwpf.extractor.WordExtractor <filename>" ); - System.exit( 1 ); - } - - // Process the first argument as a file - InputStream fin = new FileInputStream( args[0] ); - try (WordExtractor extractor = new WordExtractor(fin)) { - System.out.println(extractor.getText()); - } - } - - /** * Get the text from the word file, as an array with one String per * paragraph */ @@ -142,7 +122,7 @@ public final class WordExtractor extends POIOLE2TextExtractor { return getParagraphText( r ); } - protected static String[] getParagraphText( Range r ) { + static String[] getParagraphText( Range r ) { String[] ret; ret = new String[r.numParagraphs()]; for ( int i = 0; i < ret.length; i++ ) { @@ -287,8 +267,27 @@ public final class WordExtractor extends POIOLE2TextExtractor { /** * Removes any fields (eg macros, page markers etc) from the string. */ - public static String stripFields( String text ) - { + public static String stripFields( String text ) { return Range.stripFields( text ); } + + @Override + public HWPFDocument getDocument() { + return doc; + } + + @Override + public void setCloseFilesystem(boolean doCloseFilesystem) { + this.doCloseFilesystem = doCloseFilesystem; + } + + @Override + public boolean isCloseFilesystem() { + return doCloseFilesystem; + } + + @Override + public HWPFDocument getFilesystem() { + return doc; + } } diff --git a/src/scratchpad/testcases/org/apache/poi/hdgf/extractor/TestVisioExtractor.java b/src/scratchpad/testcases/org/apache/poi/hdgf/extractor/TestVisioExtractor.java index a1db11c170..aaa1cb4be8 100644 --- a/src/scratchpad/testcases/org/apache/poi/hdgf/extractor/TestVisioExtractor.java +++ b/src/scratchpad/testcases/org/apache/poi/hdgf/extractor/TestVisioExtractor.java @@ -19,12 +19,9 @@ package org.apache.poi.hdgf.extractor; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertNotNull; -import static org.junit.Assert.assertTrue; -import java.io.ByteArrayOutputStream; import java.io.IOException; import java.io.InputStream; -import java.io.PrintStream; import org.apache.poi.POIDataSamples; import org.apache.poi.hdgf.HDGFDiagram; @@ -32,7 +29,7 @@ import org.apache.poi.poifs.filesystem.POIFSFileSystem; import org.junit.Test; public final class TestVisioExtractor { - private static POIDataSamples _dgTests = POIDataSamples.getDiagramInstance(); + private static final POIDataSamples _dgTests = POIDataSamples.getDiagramInstance(); private final String defFilename = "Test_Visio-Some_Random_Text.vsd"; private final int defTextChunks = 5; @@ -63,7 +60,7 @@ public final class TestVisioExtractor { is3.close(); HDGFDiagram hdgf3 = new HDGFDiagram(poifs3); - + VisioTextExtractor extractor3 = new VisioTextExtractor(hdgf3); assertNotNull(extractor3); assertNotNull(extractor3.getAllText()); @@ -97,7 +94,7 @@ public final class TestVisioExtractor { @Test public void testProblemFiles() throws Exception { String[] files = { - "44594.vsd", "44594-2.vsd", + "44594.vsd", "44594-2.vsd", "ShortChunk1.vsd", "ShortChunk2.vsd", "ShortChunk3.vsd", "NegativeChunkLength.vsd", "NegativeChunkLength2.vsd" }; @@ -108,31 +105,6 @@ public final class TestVisioExtractor { } } - @Test - public void testMain() throws Exception { - PrintStream oldOut = System.out; - ByteArrayOutputStream baos = new ByteArrayOutputStream(); - PrintStream capture = new PrintStream(baos); - System.setOut(capture); - - String path = _dgTests.getFile(defFilename).getPath(); - VisioTextExtractor.main(new String[] {path}); - - // Put things back - System.setOut(oldOut); - - // Check - capture.flush(); - String text = baos.toString(); - // YK: stdout can contain lots of other stuff if logging is sent to console - // ( -Dorg.apache.poi.util.POILogger=org.apache.poi.util.SystemOutLogger) - assertTrue( text.contains( - "text\nView\n" + - "Test View\nI am a test view\n" + - "Some random text, on a page\n" - )); - } - private VisioTextExtractor openExtractor(String fileName) throws IOException { try (InputStream is = _dgTests.openResourceAsStream(fileName)) { return new VisioTextExtractor(is); diff --git a/src/scratchpad/testcases/org/apache/poi/hsmf/TestFixedSizedProperties.java b/src/scratchpad/testcases/org/apache/poi/hsmf/TestFixedSizedProperties.java index e38ef007ad..007fff036a 100644 --- a/src/scratchpad/testcases/org/apache/poi/hsmf/TestFixedSizedProperties.java +++ b/src/scratchpad/testcases/org/apache/poi/hsmf/TestFixedSizedProperties.java @@ -42,7 +42,6 @@ import org.apache.poi.hsmf.datatypes.PropertyValue; import org.apache.poi.hsmf.datatypes.PropertyValue.LongPropertyValue; import org.apache.poi.hsmf.datatypes.PropertyValue.TimePropertyValue; import org.apache.poi.hsmf.dev.HSMFDump; -import org.apache.poi.hsmf.extractor.OutlookTextExtactor; import org.apache.poi.hsmf.extractor.OutlookTextExtractor; import org.apache.poi.poifs.filesystem.POIFSFileSystem; import org.apache.poi.util.LocaleUtil; @@ -74,23 +73,23 @@ public final class TestFixedSizedProperties { fsMessageFails = new POIFSFileSystem(samples.getFile(messageFails)); mapiMessageSucceeds = new MAPIMessage(fsMessageSucceeds); - mapiMessageFails = new MAPIMessage(fsMessageFails); - + mapiMessageFails = new MAPIMessage(fsMessageFails); + messageDateFormat = new SimpleDateFormat("E, d MMM yyyy HH:mm:ss", Locale.ROOT); - messageDateFormat.setTimeZone(LocaleUtil.TIMEZONE_UTC); + messageDateFormat.setTimeZone(LocaleUtil.TIMEZONE_UTC); userTimeZone = LocaleUtil.getUserTimeZone(); LocaleUtil.setUserTimeZone(LocaleUtil.TIMEZONE_UTC); } - - + + @AfterClass public static void closeFS() throws Exception { LocaleUtil.setUserTimeZone(userTimeZone); fsMessageSucceeds.close(); fsMessageFails.close(); } - + /** * Check we can find a sensible number of properties on a few * of our test files @@ -98,21 +97,21 @@ public final class TestFixedSizedProperties { @Test public void testPropertiesFound() { Map<MAPIProperty,List<PropertyValue>> props; - + props = mapiMessageSucceeds.getMainChunks().getProperties(); assertTrue(props.toString(), props.size() > 10); - + props = mapiMessageFails.getMainChunks().getProperties(); assertTrue(props.toString(), props.size() > 10); } - + /** * Check we find properties of a variety of different types */ @Test public void testPropertyValueTypes() { Chunks mainChunks = mapiMessageSucceeds.getMainChunks(); - + // Ask to have the values looked up Map<MAPIProperty,List<PropertyValue>> props = mainChunks.getProperties(); HashSet<Class<? extends PropertyValue>> seenTypes = @@ -126,7 +125,7 @@ public final class TestFixedSizedProperties { assertTrue(seenTypes.toString(), seenTypes.contains(LongPropertyValue.class)); assertTrue(seenTypes.toString(), seenTypes.contains(TimePropertyValue.class)); assertFalse(seenTypes.toString(), seenTypes.contains(ChunkBasedPropertyValue.class)); - + // Ask for the raw values seenTypes.clear(); for (PropertyValue pv : mainChunks.getRawProperties().values()) { @@ -144,31 +143,21 @@ public final class TestFixedSizedProperties { @Test public void testReadMessageDateSucceedsWithOutlookTextExtractor() throws Exception { OutlookTextExtractor ext = new OutlookTextExtractor(mapiMessageSucceeds); - ext.setFilesystem(null); // Don't close re-used test resources here - + ext.setCloseFilesystem(false); + String text = ext.getText(); assertContains(text, "Date: Fri, 22 Jun 2012 18:32:54 +0000\n"); ext.close(); } - @Test - public void testReadMessageDateSucceedsWithOutlookTextExtactor() throws Exception { - OutlookTextExtactor ext = new OutlookTextExtactor(mapiMessageSucceeds); - ext.setFilesystem(null); // Don't close re-used test resources here - - String text = ext.getText(); - assertContains(text, "Date: Fri, 22 Jun 2012 18:32:54 +0000\n"); - ext.close(); - } - /** * Test to see if we can read the Date Chunk with OutlookTextExtractor. */ @Test public void testReadMessageDateFailsWithOutlookTextExtractor() throws Exception { OutlookTextExtractor ext = new OutlookTextExtractor(mapiMessageFails); - ext.setFilesystem(null); // Don't close re-used test resources here - + ext.setCloseFilesystem(false); + String text = ext.getText(); assertContains(text, "Date: Thu, 21 Jun 2012 14:14:04 +0000\n"); ext.close(); @@ -182,7 +171,7 @@ public final class TestFixedSizedProperties { PrintStream stream = new PrintStream(new ByteArrayOutputStream()); HSMFDump dump = new HSMFDump(fsMessageSucceeds); dump.dump(stream); - } + } /** * Test to see if we can read the Date Chunk with HSMFDump. @@ -202,19 +191,19 @@ public final class TestFixedSizedProperties { // Check via the message date Calendar clientSubmitTime = mapiMessageSucceeds.getMessageDate(); assertEquals( - "Fri, 22 Jun 2012 18:32:54", + "Fri, 22 Jun 2012 18:32:54", messageDateFormat.format(clientSubmitTime.getTime())); - + // Fetch the property value directly Map<MAPIProperty,List<PropertyValue>> props = mapiMessageSucceeds.getMainChunks().getProperties(); - List<PropertyValue> pv = props.get(MAPIProperty.CLIENT_SUBMIT_TIME); + List<PropertyValue> pv = props.get(MAPIProperty.CLIENT_SUBMIT_TIME); assertNotNull(pv); assertEquals(1, pv.size()); - + clientSubmitTime = (Calendar)pv.get(0).getValue(); assertEquals( - "Fri, 22 Jun 2012 18:32:54", + "Fri, 22 Jun 2012 18:32:54", messageDateFormat.format(clientSubmitTime.getTime())); } } diff --git a/src/scratchpad/testcases/org/apache/poi/hsmf/extractor/TestOutlookTextExtractor.java b/src/scratchpad/testcases/org/apache/poi/hsmf/extractor/TestOutlookTextExtractor.java index 4d8bfb693f..2767228501 100644 --- a/src/scratchpad/testcases/org/apache/poi/hsmf/extractor/TestOutlookTextExtractor.java +++ b/src/scratchpad/testcases/org/apache/poi/hsmf/extractor/TestOutlookTextExtractor.java @@ -20,7 +20,6 @@ package org.apache.poi.hsmf.extractor; import static org.apache.poi.POITestCase.assertContains; import static org.apache.poi.POITestCase.assertNotContained; import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertTrue; import java.io.FileInputStream; import java.text.SimpleDateFormat; @@ -57,68 +56,62 @@ public final class TestOutlookTextExtractor { @Test public void testQuick() throws Exception { - POIFSFileSystem poifs = new POIFSFileSystem(samples.getFile("quick.msg"), true); - MAPIMessage msg = new MAPIMessage(poifs); - - OutlookTextExtractor ext = new OutlookTextExtractor(msg); - String text = ext.getText(); - - assertContains(text, "From: Kevin Roast\n"); - assertContains(text, "To: Kevin Roast <kevin.roast@alfresco.org>\n"); - assertNotContained(text, "CC:"); - assertNotContained(text, "BCC:"); - assertNotContained(text, "Attachment:"); - assertContains(text, "Subject: Test the content transformer\n"); - Calendar cal = LocaleUtil.getLocaleCalendar(2007, 5, 14, 9, 42, 55); - SimpleDateFormat f = new SimpleDateFormat("E, d MMM yyyy HH:mm:ss Z", Locale.ROOT); - f.setTimeZone(LocaleUtil.getUserTimeZone()); - String dateText = f.format(cal.getTime()); - assertContains(text, "Date: " + dateText + "\n"); - assertContains(text, "The quick brown fox jumps over the lazy dog"); - - ext.close(); - poifs.close(); + try (POIFSFileSystem poifs = new POIFSFileSystem(samples.getFile("quick.msg"), true); + MAPIMessage msg = new MAPIMessage(poifs); + OutlookTextExtractor ext = new OutlookTextExtractor(msg)) { + String text = ext.getText(); + + assertContains(text, "From: Kevin Roast\n"); + assertContains(text, "To: Kevin Roast <kevin.roast@alfresco.org>\n"); + assertNotContained(text, "CC:"); + assertNotContained(text, "BCC:"); + assertNotContained(text, "Attachment:"); + assertContains(text, "Subject: Test the content transformer\n"); + Calendar cal = LocaleUtil.getLocaleCalendar(2007, 5, 14, 9, 42, 55); + SimpleDateFormat f = new SimpleDateFormat("E, d MMM yyyy HH:mm:ss Z", Locale.ROOT); + f.setTimeZone(LocaleUtil.getUserTimeZone()); + String dateText = f.format(cal.getTime()); + assertContains(text, "Date: " + dateText + "\n"); + assertContains(text, "The quick brown fox jumps over the lazy dog"); + } } @Test public void testSimple() throws Exception { - POIFSFileSystem poifs = new POIFSFileSystem(samples.getFile("simple_test_msg.msg"), true); - MAPIMessage msg = new MAPIMessage(poifs); - - OutlookTextExtractor ext = new OutlookTextExtractor(msg); - String text = ext.getText(); - - assertContains(text, "From: Travis Ferguson\n"); - assertContains(text, "To: travis@overwrittenstack.com\n"); - assertNotContained(text, "CC:"); - assertNotContained(text, "BCC:"); - assertContains(text, "Subject: test message\n"); - assertContains(text, "Date: Fri, 6 Jul 2007 05:27:17 +0000\n"); - assertContains(text, "This is a test message."); - - ext.close(); - poifs.close(); + try (POIFSFileSystem poifs = new POIFSFileSystem(samples.getFile("simple_test_msg.msg"), true); + MAPIMessage msg = new MAPIMessage(poifs); + OutlookTextExtractor ext = new OutlookTextExtractor(msg)) { + String text = ext.getText(); + + assertContains(text, "From: Travis Ferguson\n"); + assertContains(text, "To: travis@overwrittenstack.com\n"); + assertNotContained(text, "CC:"); + assertNotContained(text, "BCC:"); + assertContains(text, "Subject: test message\n"); + assertContains(text, "Date: Fri, 6 Jul 2007 05:27:17 +0000\n"); + assertContains(text, "This is a test message."); + } } @Test public void testConstructors() throws Exception { - FileInputStream fis = new FileInputStream(samples.getFile("simple_test_msg.msg")); - OutlookTextExtractor ext = new OutlookTextExtractor(fis); - String inp = ext.getText(); - ext.close(); - fis.close(); - - POIFSFileSystem poifs = new POIFSFileSystem(samples.getFile("simple_test_msg.msg"), true); - ext = new OutlookTextExtractor(poifs); - String poifsTxt = ext.getText(); - ext.close(); - poifs.close(); - - fis = new FileInputStream(samples.getFile("simple_test_msg.msg")); - ext = new OutlookTextExtractor(new MAPIMessage(fis)); - String mapi = ext.getText(); - ext.close(); - fis.close(); + String inp; + try (FileInputStream fis = new FileInputStream(samples.getFile("simple_test_msg.msg")); + OutlookTextExtractor ext = new OutlookTextExtractor(fis)) { + inp = ext.getText(); + } + + String poifsTxt; + try (POIFSFileSystem poifs = new POIFSFileSystem(samples.getFile("simple_test_msg.msg"), true); + OutlookTextExtractor ext = new OutlookTextExtractor(poifs)){ + poifsTxt = ext.getText(); + } + + String mapi; + try (FileInputStream fis = new FileInputStream(samples.getFile("simple_test_msg.msg")); + OutlookTextExtractor ext = new OutlookTextExtractor(new MAPIMessage(fis))) { + mapi = ext.getText(); + } assertEquals(inp, poifsTxt); assertEquals(inp, mapi); @@ -142,25 +135,22 @@ public final class TestOutlookTextExtractor { "example_sent_regular.msg", "example_sent_unicode.msg" }; for (String file : files) { - POIFSFileSystem poifs = new POIFSFileSystem(samples.getFile(file), true); - MAPIMessage msg = new MAPIMessage(poifs); - - OutlookTextExtractor ext = new OutlookTextExtractor(msg); - String text = ext.getText(); - - assertContains(text, "From: Mike Farman\n"); - assertContains(text, "To: 'Ashutosh Dandavate' <ashutosh.dandavate@alfresco.com>; " + - "'Paul Holmes-Higgin' <paul.hh@alfresco.com>; 'Mike Farman' <mikef@alfresco.com>\n"); - assertContains(text, "CC: 'nickb@alfresco.com' <nickb@alfresco.com>; " + - "'nick.burch@alfresco.com' <nick.burch@alfresco.com>; 'Roy Wetherall' <roy.wetherall@alfresco.com>\n"); - assertContains(text, "BCC: 'David Caruana' <dave.caruana@alfresco.com>; " + - "'Vonka Jan' <jan.vonka@alfresco.com>\n"); - assertContains(text, "Subject: This is a test message please ignore\n"); - assertContains(text, "Date:"); - assertContains(text, "The quick brown fox jumps over the lazy dog"); - - ext.close(); - poifs.close(); + try (POIFSFileSystem poifs = new POIFSFileSystem(samples.getFile(file), true); + MAPIMessage msg = new MAPIMessage(poifs); + OutlookTextExtractor ext = new OutlookTextExtractor(msg)) { + String text = ext.getText(); + + assertContains(text, "From: Mike Farman\n"); + assertContains(text, "To: 'Ashutosh Dandavate' <ashutosh.dandavate@alfresco.com>; " + + "'Paul Holmes-Higgin' <paul.hh@alfresco.com>; 'Mike Farman' <mikef@alfresco.com>\n"); + assertContains(text, "CC: 'nickb@alfresco.com' <nickb@alfresco.com>; " + + "'nick.burch@alfresco.com' <nick.burch@alfresco.com>; 'Roy Wetherall' <roy.wetherall@alfresco.com>\n"); + assertContains(text, "BCC: 'David Caruana' <dave.caruana@alfresco.com>; " + + "'Vonka Jan' <jan.vonka@alfresco.com>\n"); + assertContains(text, "Subject: This is a test message please ignore\n"); + assertContains(text, "Date:"); + assertContains(text, "The quick brown fox jumps over the lazy dog"); + } } } @@ -182,25 +172,21 @@ public final class TestOutlookTextExtractor { "example_received_regular.msg", "example_received_unicode.msg" }; for (String file : files) { - POIFSFileSystem poifs = new POIFSFileSystem(samples.getFile(file), true); - MAPIMessage msg = new MAPIMessage(poifs); - - - OutlookTextExtractor ext = new OutlookTextExtractor(msg); - String text = ext.getText(); - - assertContains(text, "From: Mike Farman\n"); - assertContains(text, "To: 'Ashutosh Dandavate' <ashutosh.dandavate@alfresco.com>; " + - "'Paul Holmes-Higgin' <paul.hh@alfresco.com>; 'Mike Farman' <mikef@alfresco.com>\n"); - assertContains(text, "CC: nickb@alfresco.com; " + - "nick.burch@alfresco.com; 'Roy Wetherall' <roy.wetherall@alfresco.com>\n"); - assertNotContained(text, "BCC:"); - assertContains(text, "Subject: This is a test message please ignore\n"); - assertContains(text, "Date: Mon, 11 Jan 2010 16:2"); // Exact times differ slightly - assertContains(text, "The quick brown fox jumps over the lazy dog"); - - ext.close(); - poifs.close(); + try (POIFSFileSystem poifs = new POIFSFileSystem(samples.getFile(file), true); + MAPIMessage msg = new MAPIMessage(poifs); + OutlookTextExtractor ext = new OutlookTextExtractor(msg)) { + String text = ext.getText(); + + assertContains(text, "From: Mike Farman\n"); + assertContains(text, "To: 'Ashutosh Dandavate' <ashutosh.dandavate@alfresco.com>; " + + "'Paul Holmes-Higgin' <paul.hh@alfresco.com>; 'Mike Farman' <mikef@alfresco.com>\n"); + assertContains(text, "CC: nickb@alfresco.com; " + + "nick.burch@alfresco.com; 'Roy Wetherall' <roy.wetherall@alfresco.com>\n"); + assertNotContained(text, "BCC:"); + assertContains(text, "Subject: This is a test message please ignore\n"); + assertContains(text, "Date: Mon, 11 Jan 2010 16:2"); // Exact times differ slightly + assertContains(text, "The quick brown fox jumps over the lazy dog"); + } } } @@ -210,85 +196,59 @@ public final class TestOutlookTextExtractor { @SuppressWarnings("JavadocReference") @Test public void testWithAttachments() throws Exception { - POIFSFileSystem poifs = new POIFSFileSystem(samples.getFile("attachment_test_msg.msg"), true); - MAPIMessage msg = new MAPIMessage(poifs); - OutlookTextExtractor ext = new OutlookTextExtractor(msg); - - // Check the normal bits - String text = ext.getText(); - - assertContains(text, "From: Nicolas1"); - assertContains(text, "To: 'nicolas1.23456@free.fr'"); - assertNotContained(text, "CC:"); - assertNotContained(text, "BCC:"); - assertContains(text, "Subject: test"); - assertContains(text, "Date: Wed, 22 Apr"); - assertContains(text, "Attachment: test-unicode.doc\n"); - assertContains(text, "Attachment: pj1.txt\n"); - assertContains(text, "contenu"); - - // Embeded bits are checked in - // TestExtractorFactory - - ext.close(); - poifs.close(); + try (POIFSFileSystem poifs = new POIFSFileSystem(samples.getFile("attachment_test_msg.msg"), true); + MAPIMessage msg = new MAPIMessage(poifs); + OutlookTextExtractor ext = new OutlookTextExtractor(msg)) { + + // Check the normal bits + String text = ext.getText(); + + assertContains(text, "From: Nicolas1"); + assertContains(text, "To: 'nicolas1.23456@free.fr'"); + assertNotContained(text, "CC:"); + assertNotContained(text, "BCC:"); + assertContains(text, "Subject: test"); + assertContains(text, "Date: Wed, 22 Apr"); + assertContains(text, "Attachment: test-unicode.doc\n"); + assertContains(text, "Attachment: pj1.txt\n"); + assertContains(text, "contenu"); + + // Embeded bits are checked in + // TestExtractorFactory + } } @Test public void testWithAttachedMessage() throws Exception { - POIFSFileSystem poifs = new POIFSFileSystem(samples.getFile("58214_with_attachment.msg"), true); - MAPIMessage msg = new MAPIMessage(poifs); - OutlookTextExtractor ext = new OutlookTextExtractor(msg); - String text = ext.getText(); - - // Check we got bits from the main message - assertContains(text, "Master mail"); - assertContains(text, "ante in lacinia euismod"); + try (POIFSFileSystem poifs = new POIFSFileSystem(samples.getFile("58214_with_attachment.msg"), true); + MAPIMessage msg = new MAPIMessage(poifs); + OutlookTextExtractor ext = new OutlookTextExtractor(msg)) { + String text = ext.getText(); - // But not the attached message - assertNotContained(text, "Test mail attachment"); - assertNotContained(text, "Lorem ipsum dolor sit"); + // Check we got bits from the main message + assertContains(text, "Master mail"); + assertContains(text, "ante in lacinia euismod"); - ext.close(); - poifs.close(); + // But not the attached message + assertNotContained(text, "Test mail attachment"); + assertNotContained(text, "Lorem ipsum dolor sit"); + } } @Test public void testEncodings() throws Exception { - POIFSFileSystem poifs = new POIFSFileSystem(samples.getFile("chinese-traditional.msg"), true); - MAPIMessage msg = new MAPIMessage(poifs); - OutlookTextExtractor ext = new OutlookTextExtractor(msg); - String text = ext.getText(); - - // Check the english bits - assertContains(text, "From: Tests Chang@FT"); - assertContains(text, "tests.chang@fengttt.com"); - - // And check some chinese bits - assertContains(text, "(\u5f35\u6bd3\u502b)"); - assertContains(text, "( MSG \u683c\u5f0f\u6e2c\u8a66 )"); + try (POIFSFileSystem poifs = new POIFSFileSystem(samples.getFile("chinese-traditional.msg"), true); + MAPIMessage msg = new MAPIMessage(poifs); + OutlookTextExtractor ext = new OutlookTextExtractor(msg)) { + String text = ext.getText(); - ext.close(); - poifs.close(); - } + // Check the english bits + assertContains(text, "From: Tests Chang@FT"); + assertContains(text, "tests.chang@fengttt.com"); - @Test - public void testEncodingsDeprecatedClass() throws Exception { - POIFSFileSystem poifs = new POIFSFileSystem(samples.getFile("chinese-traditional.msg"), true); - MAPIMessage msg = new MAPIMessage(poifs); - OutlookTextExtactor ext = new OutlookTextExtactor(msg); - assertTrue("OutlookTextExtactor instanceof OutlookTextExtractor", ext instanceof OutlookTextExtractor); - String text = ext.getText(); - - // Check the english bits - assertContains(text, "From: Tests Chang@FT"); - assertContains(text, "tests.chang@fengttt.com"); - - // And check some chinese bits - assertContains(text, "(\u5f35\u6bd3\u502b)"); - assertContains(text, "( MSG \u683c\u5f0f\u6e2c\u8a66 )"); - - ext.close(); - poifs.close(); + // And check some chinese bits + assertContains(text, "(\u5f35\u6bd3\u502b)"); + assertContains(text, "( MSG \u683c\u5f0f\u6e2c\u8a66 )"); + } } } diff --git a/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordExtractorBugs.java b/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordExtractorBugs.java index c605130a67..1962f2facf 100644 --- a/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordExtractorBugs.java +++ b/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordExtractorBugs.java @@ -17,16 +17,16 @@ package org.apache.poi.hwpf.extractor; -import org.apache.poi.POIDataSamples; -import org.apache.poi.extractor.POITextExtractor; -import org.apache.poi.extractor.OLE2ExtractorFactory; -import org.apache.poi.poifs.filesystem.POIFSFileSystem; -import org.junit.Test; +import static org.junit.Assert.assertNotNull; import java.io.IOException; import java.io.InputStream; -import static org.junit.Assert.assertNotNull; +import org.apache.poi.POIDataSamples; +import org.apache.poi.extractor.ExtractorFactory; +import org.apache.poi.extractor.POITextExtractor; +import org.apache.poi.poifs.filesystem.POIFSFileSystem; +import org.junit.Test; /** * Tests for bugs with the WordExtractor @@ -61,7 +61,7 @@ public final class TestWordExtractorBugs { @Test public void testBug60374() throws Exception { POIFSFileSystem fs = new POIFSFileSystem(SAMPLES.openResourceAsStream("cn.orthodox.www_divenbog_APRIL_30-APRIL.DOC")); - final POITextExtractor extractor = OLE2ExtractorFactory.createExtractor(fs); + final POITextExtractor extractor = ExtractorFactory.createExtractor(fs); // Check it gives text without error assertNotNull(extractor.getText()); |