From 9deca1afc73da9ace1f0062c491a35322accd4ca Mon Sep 17 00:00:00 2001 From: Nick Burch Date: Sun, 10 Apr 2016 11:16:49 +0000 Subject: [PATCH] VBA extraction support from bug #52949 from Barry Lagerweij git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1738418 13f79535-47bb-0310-9956-ffa450edef68 --- .../poi/poifs/macros/VBAMacroExtractor.java | 188 ++++++++++++ .../poi/util/RLEDecompressingInputStream.java | 273 ++++++++++++++++++ 2 files changed, 461 insertions(+) create mode 100644 src/java/org/apache/poi/poifs/macros/VBAMacroExtractor.java create mode 100644 src/java/org/apache/poi/util/RLEDecompressingInputStream.java diff --git a/src/java/org/apache/poi/poifs/macros/VBAMacroExtractor.java b/src/java/org/apache/poi/poifs/macros/VBAMacroExtractor.java new file mode 100644 index 0000000000..a13f76db54 --- /dev/null +++ b/src/java/org/apache/poi/poifs/macros/VBAMacroExtractor.java @@ -0,0 +1,188 @@ +/* ==================================================================== + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+==================================================================== */ + +package org.apache.poi.poifs.macros; + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.PushbackInputStream; +import java.nio.charset.Charset; +import java.util.HashMap; +import java.util.Map; +import java.util.zip.ZipEntry; +import java.util.zip.ZipInputStream; + +import org.apache.poi.poifs.eventfilesystem.POIFSReader; +import org.apache.poi.poifs.eventfilesystem.POIFSReaderEvent; +import org.apache.poi.poifs.eventfilesystem.POIFSReaderListener; +import org.apache.poi.poifs.filesystem.DocumentInputStream; +import org.apache.poi.util.IOUtils; +import org.apache.poi.util.RLEDecompressingInputStream; + +/** + * This class is able to extract the source of all VBA Modules of an Excel file. + */ +public class VBAMacroExtractor { + + /** + * Extract macros from XLSM or XLS file. Automatically detects ZIP (XLSM, DOCX, etc) files. + * + * @param in + * @return + * @throws IOException + */ + public Map extractMacros(InputStream in) throws IOException { + PushbackInputStream bpin = new PushbackInputStream(in, 2); + byte[] header = new byte[2]; + if (bpin.read(header) != 2) { + throw new IllegalArgumentException("Invalid InputStream: cannot read 2 bytes"); + } + bpin.unread(header); + if (header[0] == 'P' && header[1] == 'K') { + ZipInputStream zis = new ZipInputStream(bpin); + ZipEntry zipEntry; + while ((zipEntry = zis.getNextEntry()) != null) { + if ("xl/vbaProject.bin".equals(zipEntry.getName())) { + try { + return extractMacrosFromPOIFSInputStream(zis); + } finally { + zis.closeEntry(); + } + } + } + return null; + } else { + return extractMacrosFromPOIFSInputStream(bpin); + } + } + + /** + * Extracts all macros from all modules of the provided input stream. The stream is assumed to be in POIFS format (i.e. 
XLS file itself or + * vbaProject.bin from OOXML files) + * + * @param in + * @return + * @throws IOException + */ + public Map extractMacrosFromPOIFSInputStream(InputStream in) throws IOException { + class Module { + + Integer offset; + byte[] buf; + } + class ModuleMap extends HashMap { + + Charset charset = Charset.forName("Cp1252"); // default charset + } + try { + final ModuleMap modules = new ModuleMap(); + POIFSReader dirReader = new POIFSReader(); + dirReader.registerListener(new POIFSReaderListener() { + + public void processPOIFSReaderEvent(POIFSReaderEvent event) { + try { + String name = event.getName(); + if (event.getPath().toString().endsWith("\\VBA")) { + if ("dir".equals(name)) { + // process DIR + RLEDecompressingInputStream in = new RLEDecompressingInputStream(event.getStream()); + String streamName = null; + while (true) { + int id = in.readShort(); + if (id == -1 || id == 0x0010) { + break; // EOF or TERMINATOR + } + int len = in.readInt(); + switch (id) { + case 0x0009: // PROJECTVERSION + in.skip(6); + break; + case 0x0003: // PROJECTCODEPAGE + int codepage = in.readShort(); + modules.charset = Charset.forName("Cp" + codepage); + break; + case 0x001A: // STREAMNAME + byte[] streamNameBuf = new byte[len]; + int count = in.read(streamNameBuf); + streamName = new String(streamNameBuf, 0, count, modules.charset); + break; + case 0x0031: // MODULEOFFSET + int moduleOffset = in.readInt(); + Module module = modules.get(streamName); + if (module != null) { + ByteArrayOutputStream out = new ByteArrayOutputStream(); + RLEDecompressingInputStream stream = new RLEDecompressingInputStream(new ByteArrayInputStream( + module.buf, moduleOffset, module.buf.length - moduleOffset)); + IOUtils.copy(stream, out); + stream.close(); + out.close(); + module.buf = out.toByteArray(); + } else { + module = new Module(); + module.offset = moduleOffset; + modules.put(streamName, module); + } + break; + default: + in.skip(len); + break; + } + } + } else if 
(!name.startsWith("__SRP") && !name.startsWith("_VBA_PROJECT")) { + // process module, skip __SRP and _VBA_PROJECT since these do not contain macros + Module module = modules.get(name); + final DocumentInputStream stream = event.getStream(); + final InputStream in; + if (module == null) { + // no DIR stream with offsets yet, so store the compressed bytes for later + module = new Module(); + modules.put(name, module); + in = stream; + } else { + // we know the offset already, so decompress immediately on-the-fly + stream.skip(module.offset); + in = new RLEDecompressingInputStream(stream); + } + final ByteArrayOutputStream out = new ByteArrayOutputStream(); + IOUtils.copy(in, out); + in.close(); + out.close(); + module.buf = out.toByteArray(); + } + } + } catch (IOException e) { + throw new RuntimeException(e); + } + } + }); + dirReader.read(in); + Map moduleSources = new HashMap(); + for (Map.Entry entry : modules.entrySet()) { + Module module = entry.getValue(); + if (module.buf != null && module.buf.length > 0) { // Skip empty modules + moduleSources.put(entry.getKey(), new String(module.buf, modules.charset)); + } + } + return moduleSources; + } catch (IOException e) { + e.printStackTrace(); + throw e; + } + } +} diff --git a/src/java/org/apache/poi/util/RLEDecompressingInputStream.java b/src/java/org/apache/poi/util/RLEDecompressingInputStream.java new file mode 100644 index 0000000000..c482fc6cd5 --- /dev/null +++ b/src/java/org/apache/poi/util/RLEDecompressingInputStream.java @@ -0,0 +1,273 @@ +/* ==================================================================== + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. 
/**
 * Wrapper of InputStream which provides Run Length Encoding (RLE)
 * decompression on the fly. Uses MS-OVBA decompression algorithm. See
 * http://download.microsoft.com/download/2/4/8/24862317-78F0-4C4B-B355-C7B2C1D997DB/[MS-OVBA].pdf
 * <p>
 * The wrapped stream must start with the signature byte 0x01, followed by any
 * number of chunks. Each chunk is decoded completely into an internal 4096 byte
 * buffer from which the read methods are served.
 */
public class RLEDecompressingInputStream extends InputStream {

    /**
     * Bitmasks for performance
     */
    private static final int[] POWER2 = new int[] {
        0x0001, 0x0002, 0x0004, 0x0008, // 0 - 3
        0x0010, 0x0020, 0x0040, 0x0080, // 4 - 7
        0x0100, 0x0200, 0x0400, 0x0800, // 8 - 11
        0x1000, 0x2000, 0x4000, 0x8000  // 12 - 15
    };

    /** the wrapped inputstream */
    private final InputStream in;

    /** a byte buffer with size 4096 for storing a single chunk */
    private final byte[] buf;

    /** the current position in the byte buffer for reading */
    private int pos;

    /** the number of bytes in the byte buffer, -1 once the end of the stream was reached */
    private int len;

    /**
     * Creates a new wrapper RLE Decompression InputStream. The signature byte and
     * the first chunk are read from {@code in} right away.
     *
     * @param in the stream holding the compressed container
     * @throws IOException if reading from {@code in} fails
     * @throws IllegalArgumentException if the first byte is not 0x01 or the first chunk header is invalid
     */
    public RLEDecompressingInputStream(InputStream in) throws IOException {
        this.in = in;
        buf = new byte[4096];
        pos = 0;
        int header = in.read();
        if (header != 0x01) {
            throw new IllegalArgumentException(String.format("Header byte 0x01 expected, received 0x%02X", header & 0xFF));
        }
        len = readChunk();
    }

    /**
     * Reads the next decompressed byte.
     *
     * @return the byte as a value from 0 to 255, or -1 at the end of the stream
     */
    @Override
    public int read() throws IOException {
        if (!fillBuffer()) {
            return -1;
        }
        // mask the sign, otherwise 0xFF would be taken for the end of the stream
        return buf[pos++] & 0xFF;
    }

    @Override
    public int read(byte[] b) throws IOException {
        return read(b, 0, b.length);
    }

    /**
     * Reads up to {@code l} decompressed bytes, continuing over chunk boundaries
     * until either {@code l} bytes were delivered or the stream ended.
     *
     * @return the number of bytes stored in {@code b}, or -1 if the stream had ended already
     */
    @Override
    public int read(byte[] b, int off, int l) throws IOException {
        if (off < 0 || l < 0 || l > b.length - off) {
            throw new IndexOutOfBoundsException("off=" + off + ", len=" + l + ", array length=" + b.length);
        }
        if (l == 0) {
            return 0;
        }
        int copied = 0;
        while (copied < l) {
            if (!fillBuffer()) {
                return copied > 0 ? copied : -1;
            }
            int c = Math.min(l - copied, len - pos);
            System.arraycopy(buf, pos, b, off + copied, c);
            pos += c;
            copied += c;
        }
        return copied;
    }

    /**
     * Skips up to {@code n} decompressed bytes.
     *
     * @return the number of bytes actually skipped, which is smaller than {@code n}
     *         if the stream ends first and 0 if {@code n} is not positive
     */
    @Override
    public long skip(long n) throws IOException {
        if (n <= 0) {
            return 0;
        }
        long remaining = n;
        while (remaining > 0 && fillBuffer()) {
            // clamp with what is still to be skipped, not with the original request
            int c = (int) Math.min(remaining, len - pos);
            pos += c;
            remaining -= c;
        }
        return n - remaining;
    }

    @Override
    public int available() {
        return (len > 0 ? len - pos : 0);
    }

    @Override
    public void close() throws IOException {
        in.close();
    }

    /**
     * Makes sure that at least one unread byte is in the buffer, decoding further
     * chunks as needed. Chunks which decode to nothing are passed over.
     *
     * @return false if the end of the stream was reached
     */
    private boolean fillBuffer() throws IOException {
        if (len == -1) {
            return false;
        }
        while (pos >= len) {
            len = readChunk();
            if (len == -1) {
                return false;
            }
        }
        return true;
    }

    /**
     * Reads a single chunk from the underlying inputstream and decodes it into the buffer.
     *
     * @return the number of decompressed bytes now in the buffer, or -1 if the
     *         underlying stream ended before or within the chunk
     * @throws IOException if reading from the underlying stream fails
     * @throws IllegalArgumentException if the chunk header lacks the 0x3000 signature
     * @throws IllegalStateException if a raw chunk is truncated or the chunk data
     *         refers to bytes outside of the buffer
     */
    private int readChunk() throws IOException {
        pos = 0;
        int w = readShort(in);
        if (w == -1) {
            return -1;
        }
        int chunkSize = (w & 0x0FFF) + 1; // plus 3 bytes minus 2 for the length
        if ((w & 0x7000) != 0x3000) {
            throw new IllegalArgumentException(String.format("Chunksize header A should be 0x3000, received 0x%04X", w & 0x7000));
        }
        boolean rawChunk = (w & 0x8000) == 0;
        if (rawChunk) {
            // a single read() is allowed to deliver less than requested
            int filled = 0;
            while (filled < chunkSize) {
                int count = in.read(buf, filled, chunkSize - filled);
                if (count < 0) {
                    throw new IllegalStateException(String.format("Not enough bytes read, expected %d", chunkSize));
                }
                filled += count;
            }
            return chunkSize;
        }

        int inOffset = 0;
        int outOffset = 0;
        while (inOffset < chunkSize) {
            // each flag byte announces up to 8 tokens, lowest bit first
            int tokenFlags = in.read();
            inOffset++;
            if (tokenFlags == -1) {
                break;
            }
            for (int n = 0; n < 8; n++) {
                if (inOffset >= chunkSize) {
                    break;
                }
                if ((tokenFlags & POWER2[n]) == 0) {
                    // literal
                    final int b = in.read();
                    if (b == -1) {
                        return -1;
                    }
                    if (outOffset >= buf.length) {
                        throw new IllegalStateException(String.format("Chunk decompresses to more than %d bytes", buf.length));
                    }
                    buf[outOffset++] = (byte) b;
                    inOffset++;
                } else {
                    // compressed token
                    int token = readShort(in);
                    if (token == -1) {
                        return -1;
                    }
                    inOffset += 2;
                    int copyLenBits = getCopyLenBits(outOffset - 1);
                    int copyOffset = (token >> (copyLenBits)) + 1;
                    int copyLen = (token & (POWER2[copyLenBits] - 1)) + 3;
                    int startPos = outOffset - copyOffset;
                    int endPos = startPos + copyLen;
                    if (startPos < 0 || outOffset + copyLen > buf.length) {
                        throw new IllegalStateException(
                                String.format("Invalid copy token 0x%04X at decompressed offset %d", token, outOffset));
                    }
                    // byte by byte on purpose: source and target ranges may overlap
                    for (int i = startPos; i < endPos; i++) {
                        buf[outOffset++] = buf[i];
                    }
                }
            }
        }
        return outOffset;
    }

    /**
     * Helper method to determine how many bits in the CopyToken are used for the CopyLength.
     *
     * @param offset the position of the last decompressed byte within the chunk
     * @return the number of length bits, from 4 to 12
     */
    static int getCopyLenBits(int offset) {
        for (int n = 11; n >= 4; n--) {
            if ((offset & POWER2[n]) != 0) {
                return 15 - n;
            }
        }
        return 12;
    }

    /**
     * Convenience method for read a 2-bytes short in little endian encoding.
     *
     * @return the value from 0 to 65535, or -1 if the stream ended before 2 bytes were read
     * @throws IOException if reading fails
     */
    public int readShort() throws IOException {
        return readShort(this);
    }

    /**
     * Convenience method for read a 4-bytes int in little endian encoding.
     *
     * @return the value, or -1 if the stream ended before 4 bytes were read; a
     *         stored value of 0xFFFFFFFF yields -1 as well
     * @throws IOException if reading fails
     */
    public int readInt() throws IOException {
        return readInt(this);
    }

    private static int readShort(InputStream stream) throws IOException {
        int b0, b1;
        if ((b0 = stream.read()) == -1) {
            return -1;
        }
        if ((b1 = stream.read()) == -1) {
            return -1;
        }
        return (b0 & 0xFF) | ((b1 & 0xFF) << 8);
    }

    private static int readInt(InputStream stream) throws IOException {
        int b0, b1, b2, b3;
        if ((b0 = stream.read()) == -1) {
            return -1;
        }
        if ((b1 = stream.read()) == -1) {
            return -1;
        }
        if ((b2 = stream.read()) == -1) {
            return -1;
        }
        if ((b3 = stream.read()) == -1) {
            return -1;
        }
        return (b0 & 0xFF) | ((b1 & 0xFF) << 8) | ((b2 & 0xFF) << 16) | ((b3 & 0xFF) << 24);
    }
}