From 56254a17c477d059d1eb716ba60f251d10f89b8a Mon Sep 17 00:00:00 2001 From: Dominik Stadler Date: Wed, 4 Oct 2017 19:54:21 +0000 Subject: [PATCH] Add some more code from the separate integration test project to be able to publish the femaining functionality as separate project at some point git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1811144 13f79535-47bb-0310-9956-ffa450edef68 --- .../org/apache/poi/BaseIntegrationTest.java | 138 ++++++++++++++++++ .../org/apache/poi/TestAllFiles.java | 8 +- .../apache/poi/stress/FileHandlerFactory.java | 120 +++++++++++++++ 3 files changed, 262 insertions(+), 4 deletions(-) create mode 100644 src/integrationtest/org/apache/poi/BaseIntegrationTest.java create mode 100644 src/integrationtest/org/apache/poi/stress/FileHandlerFactory.java diff --git a/src/integrationtest/org/apache/poi/BaseIntegrationTest.java b/src/integrationtest/org/apache/poi/BaseIntegrationTest.java new file mode 100644 index 0000000000..291618f680 --- /dev/null +++ b/src/integrationtest/org/apache/poi/BaseIntegrationTest.java @@ -0,0 +1,138 @@ +/* ==================================================================== + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +==================================================================== */ +package org.apache.poi; + +import org.apache.poi.hslf.exceptions.OldPowerPointFormatException; +import org.apache.poi.hssf.OldExcelFormatException; +import org.apache.poi.hwpf.OldWordFileFormatException; +import org.apache.poi.poifs.filesystem.OfficeXmlFileException; +import org.apache.poi.stress.*; +import org.junit.Assume; + +import java.io.*; +import java.util.zip.ZipException; + +import static org.junit.Assert.assertNotNull; + +public class BaseIntegrationTest { + private final File rootDir; + private String file; + private FileHandler handler; + + public BaseIntegrationTest(File rootDir, String file, FileHandler handler) { + this.rootDir = rootDir; + this.file = file; + this.handler = handler; + } + + public void test() throws Exception { + assertNotNull("Unknown file extension for file: " + file + ": " + TestAllFiles.getExtension(file), handler); + + File inputFile = new File(rootDir, file); + try { + handleFile(inputFile); + } catch (OfficeXmlFileException e) { + // check if the file-extension is wrong + if(!e.getMessage().contains("data appears to be in the Office 2007")) { + throw e; + } + + // use XWPF instead of HWPF and XSSF instead of HSSF as the file seems to have the wrong extension + handleWrongExtension(inputFile, e); + } catch (OldWordFileFormatException | OldExcelFormatException | OldPowerPointFormatException e) { + // at least perform extracting tests on these old files + } catch (OldFileFormatException e) { + // Not even text extraction is supported for these: handler.handleExtracting(inputFile); + //noinspection ConstantConditions + Assume.assumeFalse("File " + file + " excluded because it is unsupported old Excel format", true); + } catch (EncryptedDocumentException e) { + // Do not try to read encrypted files + //noinspection ConstantConditions + Assume.assumeFalse("File " + file + " excluded because it is password-encrypted", true); + } catch (ZipException e) { + // some files are corrupted + if (e.getMessage().equals("unexpected EOF")) { + //noinspection ConstantConditions + Assume.assumeFalse("File " + file + " excluded because the Zip file is incomplete", true); + } + + throw e; + } catch (IOException e) { + // sometimes binary format has XML-format-extension... + if(e.getMessage().contains("rong file format or file extension for OO XML file")) { + handleWrongExtension(inputFile, e); + } else { + throw e; + } + } catch (IllegalArgumentException e) { + // ignore errors for documents with incorrect extension + String message = e.getMessage(); + if(message != null && (message.equals("The document is really a RTF file") || + message.equals("The document is really a PDF file") || + message.equals("The document is really a HTML file"))) { + //noinspection ConstantConditions + Assume.assumeFalse("File " + file + " excluded because it is actually a PDF/RTF file", true); + } + + if(e.getMessage().equals("The document is really a OOXML file")) { + handleWrongExtension(inputFile, e); + } else { + throw e; + } + } + + try { + handler.handleExtracting(inputFile); + } catch (EncryptedDocumentException e) { + // Do not try to read encrypted files + //noinspection ConstantConditions + Assume.assumeFalse("File " + file + " excluded because it is password-encrypted", true); + } + } + + void handleWrongExtension(File inputFile, Exception e) throws Exception { + // use XWPF instead of HWPF and XSSF instead of HSSF as the file seems to have the wrong extension + if (handler instanceof HWPFFileHandler) { + handler = TestAllFiles.HANDLERS.get(".docx"); + handleFile(inputFile); + } else if (handler instanceof HSSFFileHandler) { + handler = TestAllFiles.HANDLERS.get(".xlsx"); + handleFile(inputFile); + } else if (handler instanceof HSLFFileHandler) { + handler = TestAllFiles.HANDLERS.get(".pptx"); + handleFile(inputFile); + // and the other way around, use HWPF instead of XWPF and so forth + } else if(handler instanceof XWPFFileHandler) { + handler = TestAllFiles.HANDLERS.get(".doc"); + handleFile(inputFile); + } else if(handler instanceof XSSFFileHandler) { + handler = TestAllFiles.HANDLERS.get(".xls"); + handleFile(inputFile); + } else if(handler instanceof XSLFFileHandler) { + handler = TestAllFiles.HANDLERS.get(".ppt"); + handleFile(inputFile); + } else { + throw e; + } + } + + private void handleFile(File inputFile) throws Exception { + try (InputStream newStream = new BufferedInputStream(new FileInputStream(inputFile), 64*1024)) { + handler.handleFile(newStream, inputFile.getAbsolutePath()); + } + } +} diff --git a/src/integrationtest/org/apache/poi/TestAllFiles.java b/src/integrationtest/org/apache/poi/TestAllFiles.java index 37aa464ee1..812e6c6b4d 100644 --- a/src/integrationtest/org/apache/poi/TestAllFiles.java +++ b/src/integrationtest/org/apache/poi/TestAllFiles.java @@ -91,13 +91,13 @@ import org.junit.runners.Parameterized.Parameters; public class TestAllFiles { private static final File ROOT_DIR = new File("test-data"); - static final String[] SCAN_EXCLUDES = new String[] { "**/.svn/**", "lost+found", "**/.git/**" }; + public static final String[] SCAN_EXCLUDES = new String[] { "**/.svn/**", "lost+found", "**/.git/**" }; private static final Map FILE_PASSWORD; // map file extensions to the actual mappers - static final Map HANDLERS = new HashMap<>(); + public static final Map HANDLERS = new HashMap<>(); static { // Excel HANDLERS.put(".xls", new HSSFFileHandler()); @@ -443,7 +443,7 @@ public class TestAllFiles { handler.handleAdditional(inputFile); } - static String getExtension(String file) { + public static String getExtension(String file) { int pos = file.lastIndexOf('.'); if(pos == -1 || pos == file.length()-1) { return file; @@ -452,7 +452,7 @@ public class TestAllFiles { return file.substring(pos).toLowerCase(Locale.ROOT); } - private static class NullFileHandler implements FileHandler { + public static class NullFileHandler implements FileHandler { @Override public void handleFile(InputStream stream, String path) throws Exception { } diff --git a/src/integrationtest/org/apache/poi/stress/FileHandlerFactory.java b/src/integrationtest/org/apache/poi/stress/FileHandlerFactory.java new file mode 100644 index 0000000000..8be52b35a6 --- /dev/null +++ b/src/integrationtest/org/apache/poi/stress/FileHandlerFactory.java @@ -0,0 +1,120 @@ +/* ==================================================================== + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +==================================================================== */ +package org.apache.poi.stress; + +import java.util.HashMap; +import java.util.Map; +import java.util.regex.Pattern; + +public class FileHandlerFactory { + // map from patterns for mimetypes to the FileHandlers that should be able to + // work with that file + // use a Set to have a defined order of applying the matches + private static final Map MIME_TYPES = new HashMap<>(); + static { + ////////////////// Word + + MIME_TYPES.put(Pattern.compile("application/vnd.ms-word.document.macroenabled.12"), new XWPFFileHandler()); + MIME_TYPES.put(Pattern.compile("application/vnd.ms-word.template.macroenabled.12"), new XWPFFileHandler()); + + // application/msword + MIME_TYPES.put(Pattern.compile(".*msword.*"), new HWPFFileHandler()); + // application/vnd.ms-word + MIME_TYPES.put(Pattern.compile(".*ms-word.*"), new HWPFFileHandler()); + + // application/vnd.openxmlformats-officedocument.wordprocessingml.document + MIME_TYPES.put(Pattern.compile(".*wordprocessingml.*"), new XWPFFileHandler()); + + ////////////////// Excel + MIME_TYPES.put(Pattern.compile("application/vnd.ms-excel.addin.macroEnabled.12"), new XSSFFileHandler()); + MIME_TYPES.put(Pattern.compile("application/vnd.ms-excel.sheet.binary.macroEnabled.12"), new XSSFFileHandler()); + + // application/msexcel + MIME_TYPES.put(Pattern.compile(".*msexcel.*"), new HSSFFileHandler()); + // application/vnd.ms-excel + MIME_TYPES.put(Pattern.compile(".*ms-excel.*"), new HSSFFileHandler()); + + // application/vnd.openxmlformats-officedocument.spreadsheetml.sheet + MIME_TYPES.put(Pattern.compile(".*spreadsheetml.*"), new XSSFFileHandler()); + + ////////////////// Powerpoint + + // application/vnd.ms-powerpoint + MIME_TYPES.put(Pattern.compile("application/vnd.ms-powerpoint"), new HSLFFileHandler()); + // application/vnd.ms-officetheme + MIME_TYPES.put(Pattern.compile("application/vnd.ms-officetheme"), new HSLFFileHandler()); + + // application/vnd.openxmlformats-officedocument.presentationml.presentation + MIME_TYPES.put(Pattern.compile(".*presentationml.*"), new XSLFFileHandler()); + // application/vnd.ms-powerpoint.presentation.macroenabled.12 + MIME_TYPES.put(Pattern.compile("application/vnd.ms-powerpoint.presentation.macroenabled.12"), new XSLFFileHandler()); + // application/vnd.ms-powerpoint.slideshow.macroenabled.12 + MIME_TYPES.put(Pattern.compile("application/vnd.ms-powerpoint.slideshow.macroenabled.12"), new XSLFFileHandler()); + + ////////////////// Mail/TNEF + + // application/vnd.ms-tnef + MIME_TYPES.put(Pattern.compile(".*ms-tnef.*"), new HMEFFileHandler()); + + // application/vnd.ms-outlook + MIME_TYPES.put(Pattern.compile("application/vnd.ms-outlook"), new HSMFFileHandler()); + + ////////////////// Visio + + // application/vnd.visio + MIME_TYPES.put(Pattern.compile("application/vnd.visio.*"), new HDGFFileHandler()); + + // application/vnd.ms-visio.drawing + MIME_TYPES.put(Pattern.compile(".*vnd.ms-visio\\."), new XDGFFileHandler()); + + //application/vnd.ms-visio.viewer + MIME_TYPES.put(Pattern.compile(".*visio.*"), new HDGFFileHandler()); + + + ////////////////// Publisher + + // application/x-mspublisher + MIME_TYPES.put(Pattern.compile("application/x-mspublisher"), new HPBFFileHandler()); + + + ////////////////// Others + + // special type used by Tika + MIME_TYPES.put(Pattern.compile("application/x-tika-ooxml.*"), new OPCFileHandler()); + // special type used by Tika + MIME_TYPES.put(Pattern.compile("application/x-tika-msoffice.*"), new POIFSFileHandler()); + + // application/x-tika-old-excel + MIME_TYPES.put(Pattern.compile("application/x-tika-old-excel"), new POIFSFileHandler()); + + // application/vnd.openxmlformats-officedocument.drawingml.chart+xml + // ?!MIME_TYPES.put(Pattern.compile(".*drawingml.*"), ".dwg"); + + // application/vnd.openxmlformats-officedocument.vmlDrawing + // ?!MIME_TYPES.put(Pattern.compile(".*vmlDrawing.*"), ".dwg"); + } + + public static FileHandler getHandler(String mimeType) { + for(Map.Entry entry : MIME_TYPES.entrySet()) { + if(entry.getKey().matcher(mimeType).matches()) { + return entry.getValue(); + } + } + + return null; + } +} -- 2.39.5