import org.apache.poi.util.CodePageUtil;
import org.apache.poi.util.HexDump;
import org.apache.poi.util.IOUtils;
+import org.apache.poi.util.LittleEndian;
import org.apache.poi.util.RLEDecompressingInputStream;
+import org.apache.poi.util.StringUtil;
/**
* <p>Finds all VBA Macros in an office file (OLE2/POIFS and OOXML/OPC),
public class VBAMacroReader implements Closeable {
protected static final String VBA_PROJECT_OOXML = "vbaProject.bin";
protected static final String VBA_PROJECT_POIFS = "VBA";
- // FIXME: When minimum supported version is Java 7, replace with java.nio.charset.StandardCharsets.UTF_16LE
- private static final Charset UTF_16LE = Charset.forName("UTF-16LE");
-
+
private NPOIFSFileSystem fs;
public VBAMacroReader(InputStream rstream) throws IOException {
}
}
protected static class ModuleMap extends HashMap<String, Module> {
- Charset charset = Charset.forName("Cp1252"); // default charset
+ Charset charset = StringUtil.WIN_1252; // default charset; replaced when the dir stream supplies a PROJECT_CODEPAGE record
}
/**
}
}
- /**
- * Read <tt>length</tt> bytes of MBCS (multi-byte character set) characters from the stream
- *
- * @param stream the inputstream to read from
- * @param length number of bytes to read from stream
- * @param charset the character set encoding of the bytes in the stream
- * @return a java String in the supplied character set
- * @throws IOException If reading from the stream fails
- */
- private static String readString(InputStream stream, int length, Charset charset) throws IOException {
- byte[] buffer = new byte[length];
- int count = stream.read(buffer);
- return new String(buffer, 0, count, charset);
- }
+
/**
* reads module from DIR node in input stream and adds it to the modules map for decompression later
* @param modules a map to store the modules
* @throws IOException If reading data from the stream or from modules fails
*/
- private static void readModule(RLEDecompressingInputStream in, String streamName, ModuleMap modules) throws IOException {
+ private static void readModuleMetadataFromDirStream(RLEDecompressingInputStream in, String streamName, ModuleMap modules) throws IOException {
int moduleOffset = in.readInt();
Module module = modules.get(streamName);
if (module == null) {
}
}
- private static void readModule(DocumentInputStream dis, String name, ModuleMap modules) throws IOException {
+ private static void readModuleFromDocumentStream(DocumentNode documentNode, String name, ModuleMap modules) throws IOException { // reads the stream of module 'name' into its entry in 'modules'
Module module = modules.get(name);
// TODO Refactor this to fetch dir then do the rest
if (module == null) {
// no DIR stream with offsets yet, so store the compressed bytes for later
module = new Module();
modules.put(name, module);
- module.read(dis);
+ InputStream dis = new DocumentInputStream(documentNode);
+ try {
+ module.read(dis);
+ } finally {
+ dis.close();
+ }
} else if (module.buf == null) { //if we haven't already read the bytes for the module keyed off this name...
+
if (module.offset == null) {
//This should not happen. bug 59858
throw new IOException("Module offset for '" + name + "' was never read.");
}
- // we know the offset already, so decompress immediately on-the-fly
- long skippedBytes = dis.skip(module.offset);
- if (skippedBytes != module.offset) {
- throw new IOException("tried to skip " + module.offset + " bytes, but actually skipped " + skippedBytes + " bytes");
+
+ //try the general case, where module.offset is accurate
+ InputStream decompressed = null;
+ InputStream compressed = new DocumentInputStream(documentNode);
+ try {
+ // we know the offset already, so decompress immediately on-the-fly
+ long skippedBytes = compressed.skip(module.offset);
+ if (skippedBytes != module.offset) {
+ throw new IOException("tried to skip " + module.offset + " bytes, but actually skipped " + skippedBytes + " bytes");
+ }
+ decompressed = new RLEDecompressingInputStream(compressed);
+ module.read(decompressed);
+ return; // success: the brute-force fallback below is not needed
+ } catch (IllegalArgumentException e) { // ignored on purpose: fall through to the brute-force search below
+ } catch (IllegalStateException e) { // ignored on purpose: fall through to the brute-force search below
+ } finally {
+ IOUtils.closeQuietly(compressed);
+ IOUtils.closeQuietly(decompressed);
+ }
+
+ //bad module.offset, try brute force
+ compressed = new DocumentInputStream(documentNode); // reopen so the search starts at the beginning of the stream
+ byte[] decompressedBytes = null;
+ try {
+ decompressedBytes = findCompressedStreamWBruteForce(compressed);
+ } finally {
+ IOUtils.closeQuietly(compressed);
+ }
+
+ if (decompressedBytes != null) { // when null, module.read is never called and no error is raised
+ module.read(new ByteArrayInputStream(decompressedBytes));
}
- InputStream stream = new RLEDecompressingInputStream(dis);
- module.read(stream);
- stream.close();
}
}
* @throws IOException If skipping would exceed the available data or skipping did not work.
*/
private static void trySkip(InputStream in, long n) throws IOException {
- long skippedBytes = in.skip(n);
+ long skippedBytes = IOUtils.skipFully(in, n);
if (skippedBytes != n) {
if (skippedBytes < 0) {
throw new IOException(
} else {
throw new IOException(
"Tried skipping " + n + " bytes, but only " + skippedBytes + " bytes were skipped. "
- + "This should never happen.");
+ + "This should never happen with a non-corrupt file.");
}
}
}
// Constants from MS-OVBA: https://msdn.microsoft.com/en-us/library/office/cc313094(v=office.12).aspx
- private static final int EOF = -1;
- private static final int VERSION_INDEPENDENT_TERMINATOR = 0x0010;
- @SuppressWarnings("unused")
- private static final int VERSION_DEPENDENT_TERMINATOR = 0x002B;
- private static final int PROJECTVERSION = 0x0009;
- private static final int PROJECTCODEPAGE = 0x0003;
- private static final int STREAMNAME = 0x001A;
- private static final int MODULEOFFSET = 0x0031;
- @SuppressWarnings("unused")
- private static final int MODULETYPE_PROCEDURAL = 0x0021;
- @SuppressWarnings("unused")
- private static final int MODULETYPE_DOCUMENT_CLASS_OR_DESIGNER = 0x0022;
- @SuppressWarnings("unused")
- private static final int PROJECTLCID = 0x0002;
- @SuppressWarnings("unused")
- private static final int MODULE_NAME = 0x0019;
- @SuppressWarnings("unused")
- private static final int MODULE_NAME_UNICODE = 0x0047;
- @SuppressWarnings("unused")
- private static final int MODULE_DOC_STRING = 0x001c;
private static final int STREAMNAME_RESERVED = 0x0032;
+ private static final int PROJECT_CONSTANTS_RESERVED = 0x003C;
+ private static final int HELP_FILE_PATH_RESERVED = 0x003D;
+ private static final int REFERENCE_NAME_RESERVED = 0x003E;
+ private static final int DOC_STRING_RESERVED = 0x0040;
+ private static final int MODULE_DOCSTRING_RESERVED = 0x0048;
/**
* Reads VBA Project modules from a VBA Project directory located at
* @since 3.15-beta2
*/
protected void readMacros(DirectoryNode macroDir, ModuleMap modules) throws IOException {
+ //bug59858 shows that dirstream may not be in this directory (\MBD00082648\_VBA_PROJECT_CUR\VBA ENTRY NAME)
+ //but may be in another directory (\_VBA_PROJECT_CUR\VBA ENTRY NAME); note that only macroDir itself is searched below
+ //process the dirstream first, so module offsets are recorded before the module streams are read -- "dir" is case insensitive
+ for (String entryName : macroDir.getEntryNames()) {
+ if ("dir".equalsIgnoreCase(entryName)) {
+ processDirStream(macroDir.getEntry(entryName), modules);
+ break; // at most one dir entry is processed
+ }
+ }
+
for (Entry entry : macroDir) {
if (! (entry instanceof DocumentNode)) { continue; }
String name = entry.getName();
DocumentNode document = (DocumentNode)entry;
- DocumentInputStream dis = new DocumentInputStream(document);
+
+ if (! "dir".equalsIgnoreCase(name) && !startsWithIgnoreCase(name, "__SRP")
+ && !startsWithIgnoreCase(name, "_VBA_PROJECT")) {
+ // process module; skip dir (already handled above) and __SRP and _VBA_PROJECT since these do not contain macros
+ readModuleFromDocumentStream(document, name, modules);
+ }
+ }
+ }
+
+ /**
+ * Record types that may appear in the decompressed dir stream, keyed by numeric record id.
+ * <p>
+ * A type declared with two arguments carries a constant length: that many bytes are skipped
+ * after the id when no dedicated handling exists. A type declared with only an id has
+ * {@link #getConstantLength()} == -1, meaning its length has to be read from the stream.
+ */
+ private enum RecordType {
+ // Constants from MS-OVBA: https://msdn.microsoft.com/en-us/library/office/cc313094(v=office.12).aspx
+ MODULE_OFFSET(0x0031),
+ PROJECT_SYS_KIND(0x01),
+ PROJECT_LCID(0x0002),
+ PROJECT_LCID_INVOKE(0x14),
+ PROJECT_CODEPAGE(0x0003),
+ PROJECT_NAME(0x04),
+ PROJECT_DOC_STRING(0x05),
+ PROJECT_HELP_FILE_PATH(0x06),
+ PROJECT_HELP_CONTEXT(0x07, 8),
+ PROJECT_LIB_FLAGS(0x08),
+ PROJECT_VERSION(0x09, 10),
+ PROJECT_CONSTANTS(0x0C),
+ PROJECT_MODULES(0x0F),
+ DIR_STREAM_TERMINATOR(0x10),
+ PROJECT_COOKIE(0x13),
+ MODULE_NAME(0x19),
+ MODULE_NAME_UNICODE(0x47),
+ MODULE_STREAM_NAME(0x1A),
+ MODULE_DOC_STRING(0x1C),
+ MODULE_HELP_CONTEXT(0x1E),
+ MODULE_COOKIE(0x2c),
+ MODULE_TYPE_PROCEDURAL(0x21, 4),
+ MODULE_TYPE_OTHER(0x22, 4),
+ MODULE_PRIVATE(0x28, 4),
+ REFERENCE_NAME(0x16),
+ REFERENCE_REGISTERED(0x0D),
+ REFERENCE_PROJECT(0x0E),
+ REFERENCE_CONTROL_A(0x2F),
+
+ //according to the spec, REFERENCE_CONTROL_B(0x33) should have the
+ //same structure as REFERENCE_CONTROL_A(0x2F).
+ //However, it seems to have the int(length) record structure that most others do.
+ //See 59830.xls for this record.
+ REFERENCE_CONTROL_B(0x33),
+ //REFERENCE_ORIGINAL(0x33),
+
+ MODULE_TERMINATOR(0x002B),
+ //pseudo types: -1 is matched by lookup like any other id, UNKNOWN is what lookup
+ //returns for every id that is not listed above
+ EOF(-1),
+ UNKNOWN(-2);
+
+ /**
+ * Marker stored as the constant length of types that have none. This is a compile-time
+ * constant, which is the only kind of static field an enum constructor may refer to.
+ */
+ private static final int VARIABLE_LENGTH = -1;
+
+ private final int id;
+ private final int constantLength;
+
+ /** Declares a type without a constant length. */
+ RecordType(int id) {
+ this(id, VARIABLE_LENGTH);
+ }
+
+ /** Declares a type with a constant length of <tt>constantLength</tt> bytes. */
+ RecordType(int id, int constantLength) {
+ this.id = id;
+ this.constantLength = constantLength;
+ }
+
+ /**
+ * @return the constant length given at declaration, or -1 if none was given
+ */
+ int getConstantLength() {
+ return constantLength;
+ }
+
+ /**
+ * Finds the type declared with the given id.
+ *
+ * @param id record id as read from the stream
+ * @return the matching type, or {@link #UNKNOWN} if no type declares this id
+ */
+ static RecordType lookup(int id) {
+ for (RecordType type : RecordType.values()) {
+ if (type.id == id) {
+ return type;
+ }
+ }
+ return UNKNOWN;
+ }
+ }
+
+
+ private enum DIR_STATE { // which part of the dir stream processDirStream has reached
+ INFORMATION_RECORD, // state processDirStream starts in
+ REFERENCES_RECORD, // entered when a REFERENCE_NAME record is seen while still in INFORMATION_RECORD
+ MODULES_RECORD // entered when a PROJECT_MODULES record is read
+ }
+
+ /**
+ * Immutable holder for one value that the dir stream stores twice: once in the
+ * project charset ("ascii") and once as UTF-16LE ("unicode").
+ */
+ private static class ASCIIUnicodeStringPair {
+ private final String ascii;
+ private final String unicode;
+
+ /**
+ * @param asciiValue the value as decoded with the project charset
+ * @param unicodeValue the value as decoded from its UTF-16LE form
+ */
+ ASCIIUnicodeStringPair(String asciiValue, String unicodeValue) {
+ ascii = asciiValue;
+ unicode = unicodeValue;
+ }
+
+ /** @return the value as decoded with the project charset */
+ private String getAscii() {
+ return this.ascii;
+ }
+
+ /** @return the value as decoded from its UTF-16LE form */
+ private String getUnicode() {
+ return this.unicode;
+ }
+ }
+
+ /**
+ * Walks the records of the RLE-compressed dir stream. A project code page record replaces
+ * <tt>modules.charset</tt>, and every module offset record is registered in <tt>modules</tt>
+ * under the most recently read module stream name. All other records are read past.
+ *
+ * @param dir the dir stream entry; it is cast to a DocumentNode
+ * @param modules receives the charset and the per-module metadata
+ * @throws IOException if reading fails; the message names the id of the record being processed
+ */
+ private void processDirStream(Entry dir, ModuleMap modules) throws IOException {
+ DocumentNode dirDocumentNode = (DocumentNode)dir;
+ DocumentInputStream dis = new DocumentInputStream(dirDocumentNode);
+ DIR_STATE dirState = DIR_STATE.INFORMATION_RECORD;
+ try {
+ RLEDecompressingInputStream in = new RLEDecompressingInputStream(dis);
+ String streamName = null;
+ int recordId = 0;
try {
- if ("dir".equalsIgnoreCase(name)) {
- // process DIR
- RLEDecompressingInputStream in = new RLEDecompressingInputStream(dis);
- String streamName = null;
- int recordId = 0;
- try {
- while (true) {
- recordId = in.readShort();
- if (EOF == recordId
- || VERSION_INDEPENDENT_TERMINATOR == recordId) {
- break;
+ while (true) {
+ recordId = in.readShort();
+ //lookup maps an id of -1 to EOF, so no separate check for -1 is needed
+ RecordType type = RecordType.lookup(recordId);
+
+ if (type == RecordType.EOF || type == RecordType.DIR_STREAM_TERMINATOR) {
+ break;
+ }
+ switch (type) {
+ case PROJECT_VERSION:
+ trySkip(in, RecordType.PROJECT_VERSION.getConstantLength());
+ break;
+ case PROJECT_CODEPAGE:
+ in.readInt();//record size must == 4
+ int codepage = in.readShort();
+ modules.charset = Charset.forName(CodePageUtil.codepageToEncoding(codepage, true));
+ break;
+ case MODULE_STREAM_NAME:
+ ASCIIUnicodeStringPair pair = readStringPair(in, modules.charset, STREAMNAME_RESERVED);
+ //remembered for the MODULE_OFFSET record that follows
+ streamName = pair.getAscii();
+ break;
+ case PROJECT_DOC_STRING:
+ readStringPair(in, modules.charset, DOC_STRING_RESERVED);
+ break;
+ case PROJECT_HELP_FILE_PATH:
+ readStringPair(in, modules.charset, HELP_FILE_PATH_RESERVED);
+ break;
+ case PROJECT_CONSTANTS:
+ readStringPair(in, modules.charset, PROJECT_CONSTANTS_RESERVED);
+ break;
+ case REFERENCE_NAME:
+ if (dirState == DIR_STATE.INFORMATION_RECORD) {
+ dirState = DIR_STATE.REFERENCES_RECORD;
+ }
+ readStringPair(in, modules.charset, REFERENCE_NAME_RESERVED);
+ break;
+ case MODULE_DOC_STRING :
+ //same layout as the other string pairs; using the shared reader also makes
+ //the error message name the reserved value that is actually expected here
+ readStringPair(in, modules.charset, MODULE_DOCSTRING_RESERVED);
+ // do something with this at some point
+ break;
+ case MODULE_OFFSET:
+ in.readInt();//record size, should be 4
+ readModuleMetadataFromDirStream(in, streamName, modules);
+ break;
+ case PROJECT_MODULES:
+ dirState = DIR_STATE.MODULES_RECORD;
+ in.readInt();//size must == 2
+ in.readShort();//number of modules
+ break;
+ case REFERENCE_CONTROL_A:
+ int szTwiddled = in.readInt();
+ trySkip(in, szTwiddled);
+ int nextRecord = in.readShort();
+ //reference name is optional!
+ if (nextRecord == RecordType.REFERENCE_NAME.id) {
+ readStringPair(in, modules.charset, REFERENCE_NAME_RESERVED);
+ nextRecord = in.readShort();
+ }
+ if (nextRecord != 0x30) {
+ throw new IOException("Expected 0x30 as Reserved3 in a ReferenceControl record");
}
- int recordLength = in.readInt();
- switch (recordId) {
- case PROJECTVERSION:
- trySkip(in, 6);
- break;
- case PROJECTCODEPAGE:
- int codepage = in.readShort();
- modules.charset = Charset.forName(CodePageUtil.codepageToEncoding(codepage, true));
- break;
- case STREAMNAME:
- streamName = readString(in, recordLength, modules.charset);
- int reserved = in.readShort();
- if (reserved != STREAMNAME_RESERVED) {
- throw new IOException("Expected x0032 after stream name before Unicode stream name, but found: "+
- Integer.toHexString(reserved));
- }
- int unicodeNameRecordLength = in.readInt();
- readUnicodeString(in, unicodeNameRecordLength);
- // do something with this at some point
- break;
- case MODULEOFFSET:
- readModule(in, streamName, modules);
- break;
- default:
+ int szExtended = in.readInt();
+ trySkip(in, szExtended);
+ break;
+ case MODULE_TERMINATOR:
+ in.readInt();//reserved, must be 0
+ break;
+ default:
+ //no dedicated handling: skip either the constant length or the length given in the stream
+ if (type.getConstantLength() > -1) {
+ trySkip(in, type.getConstantLength());
+ } else {
+ int recordLength = in.readInt();
trySkip(in, recordLength);
- break;
}
- }
- } catch (final IOException e) {
- throw new IOException(
- "Error occurred while reading macros at section id "
- + recordId + " (" + HexDump.shortToHex(recordId) + ")", e);
+ break;
}
- finally {
- in.close();
- }
- } else if (!startsWithIgnoreCase(name, "__SRP")
- && !startsWithIgnoreCase(name, "_VBA_PROJECT")) {
- // process module, skip __SRP and _VBA_PROJECT since these do not contain macros
- readModule(dis, name, modules);
}
+ } catch (final IOException e) {
+ throw new IOException(
+ "Error occurred while reading macros at section id "
+ + recordId + " (" + HexDump.shortToHex(recordId) + ")", e);
+ } finally {
+ in.close();
}
- finally {
- dis.close();
- }
+ } finally {
+ dis.close();
+ }
+ }
+
+ /**
+ * Reads a value that is stored twice in a row: an int length followed by that many bytes in
+ * <tt>charset</tt>, then a reserved short, then an int length followed by that many bytes of UTF-16LE.
+ *
+ * @param in the decompressed dir stream, positioned at the first length
+ * @param charset charset used to decode the first copy of the value
+ * @param reservedByte value the short between the two copies must have
+ * @return both decoded copies of the value
+ * @throws IOException if the reserved short does not match or the stream cannot be read
+ */
+ private ASCIIUnicodeStringPair readStringPair(RLEDecompressingInputStream in, Charset charset, int reservedByte) throws IOException {
+ int nameLength = in.readInt();
+ String ascii = readString(in, nameLength, charset);
+ int reserved = in.readShort();
+ if (reserved != reservedByte) {
+ throw new IOException("Expected " + Integer.toHexString(reservedByte) + " after name before Unicode name, but found: " +
+ Integer.toHexString(reserved));
+ }
+ int unicodeNameRecordLength = in.readInt();
+ String unicode = readUnicodeString(in, unicodeNameRecordLength);
+ return new ASCIIUnicodeStringPair(ascii, unicode);
+ }
+
+
+ /**
+ * Read <tt>length</tt> bytes of MBCS (multi-byte character set) characters from the stream
+ *
+ * @param stream the inputstream to read from
+ * @param length number of bytes to read from stream
+ * @param charset the character set encoding of the bytes in the stream
+ * @return a java String in the supplied character set
+ * @throws IOException If reading from the stream fails
+ */
+ private static String readString(InputStream stream, int length, Charset charset) throws IOException {
+ byte[] buffer = IOUtils.safelyAllocate(length, 20000);
+ int bytesRead = IOUtils.readFully(stream, buffer);
+ if (bytesRead != length) {
+ throw new IOException("Tried to read: "+length +
+ ", but could only read: "+bytesRead);
}
+ return new String(buffer, 0, length, charset);
}
private String readUnicodeString(RLEDecompressingInputStream in, int unicodeNameRecordLength) throws IOException {
- byte[] buffer = new byte[unicodeNameRecordLength];
- IOUtils.readFully(in, buffer);
- return new String(buffer, UTF_16LE);
+ byte[] buffer = IOUtils.safelyAllocate(unicodeNameRecordLength, 20000);
+ int bytesRead = IOUtils.readFully(in, buffer);
+ if (bytesRead != unicodeNameRecordLength) {
+
+ }
+ return new String(buffer, StringUtil.UTF16LE);
+ }
+
+ /**
+ * Sometimes the offset record in the dirstream is incorrect, but the macro can still be found.
+ * This will try to find the first RLE-compressed stream whose decompressed content has
+ * "Attribute" near its start.
+ * This relies on some, er, heuristics, admittedly.
+ *
+ * @param is full module inputstream to read
+ * @return the first candidate whose first 20 decompressed bytes contain "Attribute"; if there
+ * is none, the result of the last candidate tried, which is <code>null</code> when that
+ * attempt failed or when there was no candidate at all
+ * @throws IOException for a true IOException copying the is to a byte array
+ */
+ private static byte[] findCompressedStreamWBruteForce(InputStream is) throws IOException {
+ //buffer to memory for multiple tries
+ ByteArrayOutputStream bos = new ByteArrayOutputStream();
+ IOUtils.copy(is, bos);
+ byte[] compressed = bos.toByteArray();
+ byte[] decompressed = null;
+ //a candidate is a 0x01 byte followed by a two-byte value, so the last possible start is
+ //length - 3; looking at a later 0x01 would read the two-byte value past the end of the array
+ for (int i = 0; i < compressed.length - 2; i++) {
+ if (compressed[i] != 0x01) {
+ continue;
+ }
+ int w = LittleEndian.getUShort(compressed, i+1);
+ if (w <= 0 || (w & 0x7000) != 0x3000) {
+ continue;
+ }
+ decompressed = tryToDecompress(new ByteArrayInputStream(compressed, i, compressed.length - i));
+ if (decompressed != null && decompressed.length > 9) {
+ //this is a complete hack. The challenge is that there
+ //can be many 0 length or junk streams that are uncompressed
+ //look in the first 20 characters for "Attribute"
+ int firstX = Math.min(20, decompressed.length);
+ String start = new String(decompressed, 0, firstX, StringUtil.WIN_1252);
+ if (start.contains("Attribute")) {
+ return decompressed;
+ }
+ }
+ }
+ return decompressed;
+ }
+
+ private static byte[] tryToDecompress(InputStream is) {
+ ByteArrayOutputStream bos = new ByteArrayOutputStream();
+ try {
+ IOUtils.copy(new RLEDecompressingInputStream(is), bos);
+ } catch (IllegalArgumentException e){
+ return null;
+ } catch (IllegalStateException e) {
+ return null;
+ } catch (IOException e) {
+ return null;
+ }
+ return bos.toByteArray();
}
}