diff options
author | Andreas Beeker <kiwiwings@apache.org> | 2017-05-02 23:32:13 +0000 |
---|---|---|
committer | Andreas Beeker <kiwiwings@apache.org> | 2017-05-02 23:32:13 +0000 |
commit | 24f2dcf6cb4491e1661140319fdf81fcb1bf483f (patch) | |
tree | 47caeb083af911a1e35ac9122642588f88e9ac36 | |
parent | 47a71f0396ea225a5086ea7827e7751e0361ffde (diff) | |
download | poi-24f2dcf6cb4491e1661140319fdf81fcb1bf483f.tar.gz poi-24f2dcf6cb4491e1661140319fdf81fcb1bf483f.zip |
#52372 - OutOfMemoryError parsing a word file
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1793602 13f79535-47bb-0310-9956-ffa450edef68
-rw-r--r-- | src/java/org/apache/poi/hpsf/Section.java | 473 | ||||
-rw-r--r-- | test-data/hpsf/TestBug52372.doc | bin | 0 -> 87040 bytes |
2 files changed, 245 insertions, 228 deletions
diff --git a/src/java/org/apache/poi/hpsf/Section.java b/src/java/org/apache/poi/hpsf/Section.java index 830d94465b..59150afb53 100644 --- a/src/java/org/apache/poi/hpsf/Section.java +++ b/src/java/org/apache/poi/hpsf/Section.java @@ -22,7 +22,11 @@ import java.io.IOException; import java.io.OutputStream; import java.io.UnsupportedEncodingException; import java.util.Date; +import java.util.HashMap; +import java.util.HashSet; +import java.util.LinkedHashMap; import java.util.Map; +import java.util.Set; import java.util.TreeMap; import org.apache.commons.collections4.bidimap.TreeBidiMap; @@ -30,11 +34,16 @@ import org.apache.poi.hpsf.wellknown.PropertyIDMap; import org.apache.poi.hpsf.wellknown.SectionIDMap; import org.apache.poi.util.CodePageUtil; import org.apache.poi.util.LittleEndian; +import org.apache.poi.util.LittleEndianByteArrayInputStream; +import org.apache.poi.util.LittleEndianConsts; +import org.apache.poi.util.POILogFactory; +import org.apache.poi.util.POILogger; /** * Represents a section in a {@link PropertySet}. */ public class Section { + private static final POILogger LOG = POILogFactory.getLogger(Section.class); /** * Maps property IDs to section-private PID strings. These @@ -62,7 +71,7 @@ public class Section { /** * The offset of the section in the stream. */ - private long offset = -1; + private final long _offset; /** * The section's size in bytes. @@ -72,7 +81,7 @@ public class Section { /** * This section's properties. */ - private final Map<Long,Property> properties = new TreeMap<Long,Property>(); + private final Map<Long,Property> properties = new LinkedHashMap<Long,Property>(); /** * This member is {@code true} if the last call to {@link @@ -85,6 +94,7 @@ public class Section { * Creates an empty {@link Section}. */ public Section() { + this._offset = -1; } /** @@ -96,6 +106,7 @@ public class Section { * @param s The section set to copy */ public Section(final Section s) { + this._offset = -1; setFormatID(s.getFormatID()); for (Property p : s.properties.values()) { properties.put(p.getID(), new MutableProperty(p)); @@ -117,32 +128,38 @@ public class Section { */ @SuppressWarnings("unchecked") public Section(final byte[] src, final int offset) throws UnsupportedEncodingException { - int o1 = offset; - /* * Read the format ID. */ - formatID = new ClassID(src, o1); - o1 += ClassID.LENGTH; + formatID = new ClassID(src, offset); /* * Read the offset from the stream's start and positions to * the section header. */ - this.offset = LittleEndian.getUInt(src, o1); - o1 = (int) this.offset; + int offFix = (int)LittleEndian.getUInt(src, offset + ClassID.LENGTH); + + // some input files have a invalid (padded?) offset, which need to be fixed + // search for beginning of size field + if (src[offFix] == 0) { + for (int i=0; i<3 && src[offFix] == 0; i++,offFix++); + // cross check with propertyCount field and the property list field + for (int i=0; i<3 && (src[offFix+3] != 0 || src[offFix+7] != 0 || src[offFix+11] != 0); i++,offFix--); + } + + this._offset = offFix; + + LittleEndianByteArrayInputStream leis = new LittleEndianByteArrayInputStream(src, offFix); /* * Read the section length. */ - size = (int) LittleEndian.getUInt(src, o1); - o1 += LittleEndian.INT_SIZE; + size = (int)leis.readUInt(); /* * Read the number of properties. */ - final int propertyCount = (int) LittleEndian.getUInt(src, o1); - o1 += LittleEndian.INT_SIZE; + final int propertyCount = (int)leis.readUInt(); /* * Read the properties. The offset is positioned at the first @@ -169,64 +186,65 @@ public class Section { * seconds pass reads the other properties. */ /* Pass 1: Read the property list. */ - int pass1Offset = o1; - long cpOffset = -1; final TreeBidiMap<Long,Long> offset2Id = new TreeBidiMap<Long,Long>(); for (int i = 0; i < propertyCount; i++) { /* Read the property ID. */ - long id = LittleEndian.getUInt(src, pass1Offset); - pass1Offset += LittleEndian.INT_SIZE; + long id = (int)leis.readUInt(); /* Offset from the section's start. */ - long off = LittleEndian.getUInt(src, pass1Offset); - pass1Offset += LittleEndian.INT_SIZE; + long off = (int)leis.readUInt(); offset2Id.put(off, id); - - if (id == PropertyIDMap.PID_CODEPAGE) { - cpOffset = off; - } } + Long cpOffset = offset2Id.getKey((long)PropertyIDMap.PID_CODEPAGE); + /* Look for the codepage. */ int codepage = -1; - if (cpOffset != -1) { + if (cpOffset != null) { /* Read the property's value type. It must be VT_I2. */ - long o = this.offset + cpOffset; - final long type = LittleEndian.getUInt(src, (int)o); - o += LittleEndian.INT_SIZE; + leis.setReadIndex((int)(this._offset + cpOffset)); + final long type = leis.readUInt(); if (type != Variant.VT_I2) { throw new HPSFRuntimeException - ("Value type of property ID 1 is not VT_I2 but " + - type + "."); + ("Value type of property ID 1 is not VT_I2 but " + type + "."); } /* Read the codepage number. */ - codepage = LittleEndian.getUShort(src, (int)o); + codepage = leis.readUShort(); } - + /* Pass 2: Read all properties - including the codepage property, * if available. */ for (Map.Entry<Long,Long> me : offset2Id.entrySet()) { long off = me.getKey(); long id = me.getValue(); - Property p; - if (id == PropertyIDMap.PID_CODEPAGE) { - p = new Property(PropertyIDMap.PID_CODEPAGE, Variant.VT_I2, codepage); + + int pLen = propLen(offset2Id, off, size); + leis.setReadIndex((int)(this._offset + off)); + + if (id == PropertyIDMap.PID_DICTIONARY) { + leis.mark(100000); + if (!readDictionary(leis, pLen, codepage)) { + // there was an error reading the dictionary, maybe because the pid (0) was used wrong + // try reading a property instead + leis.reset(); + try { + // fix id + id = Math.max(PropertyIDMap.PID_MAX, offset2Id.inverseBidiMap().lastKey())+1; + setProperty(new MutableProperty(id, leis, pLen, codepage)); + } catch (RuntimeException e) { + LOG.log(POILogger.INFO, "Dictionary fallback failed - ignoring property"); + } + }; + } else if (id == PropertyIDMap.PID_CODEPAGE) { + setCodepage(codepage); } else { - int pLen = propLen(offset2Id, off, size); - long o = this.offset + off; - p = new Property(id, src, o, pLen, codepage); + setProperty(new MutableProperty(id, leis, pLen, codepage)); } - properties.put(id, p); } - - /* - * Extract the dictionary (if available). - */ - dictionary = (Map<Long,String>) getProperty(0); } /** @@ -291,7 +309,7 @@ public class Section { * @return The offset of the section in the stream. */ public long getOffset() { - return offset; + return _offset; } /** @@ -344,11 +362,10 @@ public class Section { * Sets the string value of the property with the specified ID. * * @param id The property's ID - * @param value The property's value. It will be written as a Unicode - * string. + * @param value The property's value. */ public void setProperty(final int id, final String value) { - setProperty(id, Variant.VT_LPWSTR, value); + setProperty(id, Variant.VT_LPSTR, value); } /** @@ -411,8 +428,9 @@ public class Section { * @see #getProperty * @see Variant */ + @SuppressWarnings("deprecation") public void setProperty(final int id, final long variantType, final Object value) { - setProperty(new Property(id, variantType, value)); + setProperty(new MutableProperty(id, variantType, value)); } @@ -591,11 +609,12 @@ public class Section { */ public String getPIDString(final long pid) { String s = null; - if (dictionary != null) { - s = dictionary.get(Long.valueOf(pid)); + Map<Long,String> dic = getDictionary(); + if (dic != null) { + s = dic.get(pid); } if (s == null) { - s = SectionIDMap.getPIDString(getFormatID().getBytes(), pid); + s = SectionIDMap.getPIDString(getFormatID(), pid); } return s; } @@ -615,19 +634,6 @@ public class Section { } /** - * Sets the codepage. - * - * @param codepage the codepage - */ - public void setCodepage(final int codepage) - { - setProperty(PropertyIDMap.PID_CODEPAGE, Variant.VT_I2, - Integer.valueOf(codepage)); - } - - - - /** * Checks whether this section is equal to another object. The result is * {@code false} if one of the the following conditions holds: * @@ -651,8 +657,9 @@ public class Section { * @return {@code true} if the objects are equal, {@code false} if * not */ + @Override public boolean equals(final Object o) { - if (o == null || !(o instanceof Section)) { + if (!(o instanceof Section)) { return false; } final Section s = (Section) o; @@ -660,59 +667,26 @@ public class Section { return false; } - /* Compare all properties except 0 and 1 as they must be handled - * specially. */ - Property[] pa1 = new Property[getProperties().length]; - Property[] pa2 = new Property[s.getProperties().length]; - System.arraycopy(getProperties(), 0, pa1, 0, pa1.length); - System.arraycopy(s.getProperties(), 0, pa2, 0, pa2.length); - - /* Extract properties 0 and 1 and remove them from the copy of the - * arrays. */ - Property p10 = null; - Property p20 = null; - for (int i = 0; i < pa1.length; i++) { - final long id = pa1[i].getID(); - if (id == 0) { - p10 = pa1[i]; - pa1 = remove(pa1, i); - i--; - } - if (id == 1) { - pa1 = remove(pa1, i); - i--; + /* Compare all properties except the dictionary (id 0) and + * the codepage (id 1 / ignored) as they must be handled specially. */ + Set<Long> propIds = new HashSet<Long>(properties.keySet()); + propIds.addAll(s.properties.keySet()); + propIds.remove(0L); + propIds.remove(1L); + + for (Long id : propIds) { + Property p1 = properties.get(id); + Property p2 = s.properties.get(id); + if (p1 == null || p2 == null || !p1.equals(p2)) { + return false; } } - for (int i = 0; i < pa2.length; i++) { - final long id = pa2[i].getID(); - if (id == 0) { - p20 = pa2[i]; - pa2 = remove(pa2, i); - i--; - } - if (id == 1) { - pa2 = remove(pa2, i); - i--; - } - } - - /* If the number of properties (not counting property 1) is unequal the - * sections are unequal. */ - if (pa1.length != pa2.length) { - return false; - } /* If the dictionaries are unequal the sections are unequal. */ - boolean dictionaryEqual = true; - if (p10 != null && p20 != null) { - dictionaryEqual = p10.getValue().equals(p20.getValue()); - } else if (p10 != null || p20 != null) { - dictionaryEqual = false; - } - if (dictionaryEqual) { - return Util.equals(pa1, pa2); - } - return false; + Map<Long,String> d1 = getDictionary(); + Map<Long,String> d2 = s.getDictionary(); + + return (d1 == null && d2 == null) || (d1 != null && d2 != null && d1.equals(d2)); } /** @@ -725,22 +699,6 @@ public class Section { } /** - * Removes a field from a property array. The resulting array is - * compactified and returned. - * - * @param pa The property array. - * @param i The index of the field to be removed. - * @return the compactified array. - */ - private Property[] remove(final Property[] pa, final int i) { - final Property[] h = new Property[pa.length - 1]; - if (i > 0) { - System.arraycopy(pa, 0, h, 0, i); - } - System.arraycopy(pa, i + 1, h, i, h.length - i); - return h; - } - /** * Writes this section into an output stream.<p> * * Internally this is done by writing into three byte array output @@ -763,6 +721,17 @@ public class Section { return sectionBytes.length; } + /* Writing the section's dictionary it tricky. If there is a dictionary + * (property 0) the codepage property (property 1) must be set, too. */ + int codepage = getCodepage(); + if (codepage == -1) { + String msg = + "The codepage property is not set although a dictionary is present. "+ + "Defaulting to ISO-8859-1."; + LOG.log(POILogger.WARN, msg); + codepage = Property.DEFAULT_CODEPAGE; + } + /* The properties are written to this stream. */ final ByteArrayOutputStream propertyStream = new ByteArrayOutputStream(); @@ -777,79 +746,120 @@ public class Section { /* Increase the position variable by the size of the property list so * that it points behind the property list and to the beginning of the * properties themselves. */ - position += 2 * LittleEndian.INT_SIZE + getPropertyCount() * 2 * LittleEndian.INT_SIZE; + position += 2 * LittleEndianConsts.INT_SIZE + getPropertyCount() * 2 * LittleEndianConsts.INT_SIZE; - /* Writing the section's dictionary it tricky. If there is a dictionary - * (property 0) the codepage property (property 1) must be set, too. */ - int codepage = -1; - if (getProperty(PropertyIDMap.PID_DICTIONARY) != null) { - final Object p1 = getProperty(PropertyIDMap.PID_CODEPAGE); - if (p1 != null) { - if (!(p1 instanceof Integer)) { - throw new IllegalPropertySetDataException - ("The codepage property (ID = 1) must be an " + - "Integer object."); - } - } else { - /* Warning: The codepage property is not set although a - * dictionary is present. In order to cope with this problem we - * add the codepage property and set it to Unicode. */ - setProperty(PropertyIDMap.PID_CODEPAGE, Variant.VT_I2, - Integer.valueOf(CodePageUtil.CP_UNICODE)); - } - codepage = getCodepage(); - } - /* Write the properties and the property list into their respective * streams: */ for (Property p : properties.values()) { final long id = p.getID(); /* Write the property list entry. */ - TypeWriter.writeUIntToStream(propertyListStream, p.getID()); - TypeWriter.writeUIntToStream(propertyListStream, position); + LittleEndian.putUInt(id, propertyListStream); + LittleEndian.putUInt(position, propertyListStream); /* If the property ID is not equal 0 we write the property and all * is fine. However, if it equals 0 we have to write the section's * dictionary which has an implicit type only and an explicit * value. */ - if (id != 0) + if (id != 0) { /* Write the property and update the position to the next * property. */ - position += p.write(propertyStream, getCodepage()); - else - { - if (codepage == -1) - throw new IllegalPropertySetDataException - ("Codepage (property 1) is undefined."); - position += writeDictionary(propertyStream, dictionary, - codepage); + position += p.write(propertyStream, codepage); + } else { + if (codepage == -1) { + throw new IllegalPropertySetDataException("Codepage (property 1) is undefined."); + } + position += writeDictionary(propertyStream, codepage); } } - propertyStream.close(); - propertyListStream.close(); /* Write the section: */ - byte[] pb1 = propertyListStream.toByteArray(); - byte[] pb2 = propertyStream.toByteArray(); + int streamLength = LittleEndianConsts.INT_SIZE * 2 + propertyListStream.size() + propertyStream.size(); /* Write the section's length: */ - TypeWriter.writeToStream(out, LittleEndian.INT_SIZE * 2 + - pb1.length + pb2.length); + LittleEndian.putInt(streamLength, out); /* Write the section's number of properties: */ - TypeWriter.writeToStream(out, getPropertyCount()); + LittleEndian.putInt(getPropertyCount(), out); /* Write the property list: */ - out.write(pb1); + propertyListStream.writeTo(out); /* Write the properties: */ - out.write(pb2); + propertyStream.writeTo(out); - int streamLength = LittleEndian.INT_SIZE * 2 + pb1.length + pb2.length; return streamLength; } + /** + * Reads a dictionary. + * + * @param leis The byte stream containing the bytes making out the dictionary. + * @param length The dictionary contains at most this many bytes. + * @param codepage The codepage of the string values. + * + * @return {@code true} if dictionary was read successful, {@code false} otherwise + * + * @throws UnsupportedEncodingException if the dictionary's codepage is not + * (yet) supported. + */ + private boolean readDictionary(LittleEndianByteArrayInputStream leis, final int length, final int codepage) + throws UnsupportedEncodingException { + Map<Long,String> dic = new HashMap<Long,String>(); + + /* + * Read the number of dictionary entries. + */ + final long nrEntries = leis.readUInt(); + + long id = -1; + boolean isCorrupted = false; + for (int i = 0; i < nrEntries; i++) { + String errMsg = + "The property set's dictionary contains bogus data. " + + "All dictionary entries starting with the one with ID " + + id + " will be ignored."; + + /* The key. */ + id = leis.readUInt(); + + /* The value (a string). The length is the either the + * number of (two-byte) characters if the character set is Unicode + * or the number of bytes if the character set is not Unicode. + * The length includes terminating 0x00 bytes which we have to strip + * off to create a Java string. */ + long sLength = leis.readUInt(); + + /* Read the string - Strip 0x00 characters from the end of the string. */ + int cp = (codepage == -1) ? Property.DEFAULT_CODEPAGE : codepage; + int nrBytes = (int)((sLength-1) * (cp == CodePageUtil.CP_UNICODE ? 2 : 1)); + if (nrBytes > 0xFFFFFF) { + LOG.log(POILogger.WARN, errMsg); + isCorrupted = true; + break; + } + + try { + byte buf[] = new byte[nrBytes]; + leis.readFully(buf, 0, nrBytes); + final String str = CodePageUtil.getStringFromCodePage(buf, 0, nrBytes, cp); + + int pad = 1; + if (cp == CodePageUtil.CP_UNICODE) { + pad = 2+((4 - ((nrBytes+2) & 0x3)) & 0x3); + } + leis.skip(pad); + + dic.put(id, str); + } catch (RuntimeException ex) { + LOG.log(POILogger.WARN, errMsg, ex); + isCorrupted = true; + break; + } + } + setDictionary(dic); + return !isCorrupted; + } /** @@ -861,48 +871,37 @@ public class Section { * @return The number of bytes written * @exception IOException if an I/O exception occurs. */ - private static int writeDictionary(final OutputStream out, final Map<Long,String> dictionary, final int codepage) + private int writeDictionary(final OutputStream out, final int codepage) throws IOException { - int length = TypeWriter.writeUIntToStream(out, dictionary.size()); - for (Map.Entry<Long,String> ls : dictionary.entrySet()) { - final Long key = ls.getKey(); - final String value = ls.getValue(); + final byte padding[] = new byte[4]; + Map<Long,String> dic = getDictionary(); + + LittleEndian.putUInt(dic.size(), out); + int length = LittleEndianConsts.INT_SIZE; + for (Map.Entry<Long,String> ls : dic.entrySet()) { + + LittleEndian.putUInt(ls.getKey(), out); + length += LittleEndianConsts.INT_SIZE; + + String value = ls.getValue()+"\0"; + LittleEndian.putUInt( value.length(), out ); + length += LittleEndianConsts.INT_SIZE; + + byte bytes[] = CodePageUtil.getBytesInCodePage(value, codepage); + out.write(bytes); + length += bytes.length; if (codepage == CodePageUtil.CP_UNICODE) { - /* Write the dictionary item in Unicode. */ - int sLength = value.length() + 1; - if ((sLength & 1) == 1) { - sLength++; - } - length += TypeWriter.writeUIntToStream(out, key.longValue()); - length += TypeWriter.writeUIntToStream(out, sLength); - final byte[] ca = CodePageUtil.getBytesInCodePage(value, codepage); - for (int j = 2; j < ca.length; j += 2) { - out.write(ca[j+1]); - out.write(ca[j]); - length += 2; - } - sLength -= value.length(); - while (sLength > 0) { - out.write(0x00); - out.write(0x00); - length += 2; - sLength--; - } - } else { - /* Write the dictionary item in another codepage than - * Unicode. */ - length += TypeWriter.writeUIntToStream(out, key.longValue()); - length += TypeWriter.writeUIntToStream(out, value.length() + 1L); - final byte[] ba = CodePageUtil.getBytesInCodePage(value, codepage); - for (int j = 0; j < ba.length; j++) { - out.write(ba[j]); - length++; - } - out.write(0x00); - length++; + int pad = (4 - (length & 0x3)) & 0x3; + out.write(padding, 0, pad); + length += pad; } } + + int pad = (4 - (length & 0x3)) & 0x3; + out.write(padding, 0, pad); + length += pad; + return length; } @@ -924,25 +923,27 @@ public class Section { */ public void setDictionary(final Map<Long,String> dictionary) throws IllegalPropertySetDataException { if (dictionary != null) { - this.dictionary = dictionary; + if (this.dictionary == null) { + this.dictionary = new TreeMap<Long,String>(); + } + this.dictionary.putAll(dictionary); + + /* If the codepage property (ID 1) for the strings (keys and values) + * used in the dictionary is not yet defined, set it to ISO-8859-1. */ + int cp = getCodepage(); + if (cp == -1) { + setCodepage(Property.DEFAULT_CODEPAGE); + } /* Set the dictionary property (ID 0). Please note that the second * parameter in the method call below is unused because dictionaries * don't have a type. */ setProperty(PropertyIDMap.PID_DICTIONARY, -1, dictionary); - - /* If the codepage property (ID 1) for the strings (keys and - * values) used in the dictionary is not yet defined, set it to - * Unicode. */ - final Integer codepage = (Integer) getProperty(PropertyIDMap.PID_CODEPAGE); - if (codepage == null) { - setProperty(PropertyIDMap.PID_CODEPAGE, Variant.VT_I2, - Integer.valueOf(CodePageUtil.CP_UNICODE)); - } } else { /* Setting the dictionary to null means to remove property 0. * However, it does not mean to remove property 1 (codepage). */ removeProperty(PropertyIDMap.PID_DICTIONARY); + this.dictionary = null; } } @@ -951,6 +952,7 @@ public class Section { /** * @see Object#hashCode() */ + @Override public int hashCode() { long hashCode = 0; hashCode += getFormatID().hashCode(); @@ -967,9 +969,11 @@ public class Section { /** * @see Object#toString() */ + @Override public String toString() { final StringBuffer b = new StringBuffer(); final Property[] pa = getProperties(); + b.append("\n\n\n"); b.append(getClass().getName()); b.append('['); b.append("formatID: "); @@ -981,8 +985,12 @@ public class Section { b.append(", size: "); b.append(getSize()); b.append(", properties: [\n"); - for (int i = 0; i < pa.length; i++) { - b.append(pa[i]); + int codepage = getCodepage(); + if (codepage == -1) { + codepage = Property.DEFAULT_CODEPAGE; + } + for (Property p : pa) { + b.append(p.toString(codepage)); b.append(",\n"); } b.append(']'); @@ -1002,7 +1010,12 @@ public class Section { * @return the dictionary or {@code null} if the section does not have * a dictionary. */ + @SuppressWarnings("unchecked") public Map<Long,String> getDictionary() { + if (dictionary == null) { + dictionary = (Map<Long,String>) getProperty(PropertyIDMap.PID_DICTIONARY); + } + return dictionary; } @@ -1013,13 +1026,17 @@ public class Section { * * @return The section's codepage if one is defined, else -1. */ - public int getCodepage() - { + public int getCodepage() { final Integer codepage = (Integer) getProperty(PropertyIDMap.PID_CODEPAGE); - if (codepage == null) { - return -1; - } - int cp = codepage.intValue(); - return cp; + return (codepage == null) ? -1 : codepage.intValue(); + } + + /** + * Sets the codepage. + * + * @param codepage the codepage + */ + public void setCodepage(final int codepage) { + setProperty(PropertyIDMap.PID_CODEPAGE, Variant.VT_I2, codepage); } } diff --git a/test-data/hpsf/TestBug52372.doc b/test-data/hpsf/TestBug52372.doc Binary files differnew file mode 100644 index 0000000000..ddaaf5ee9e --- /dev/null +++ b/test-data/hpsf/TestBug52372.doc |