Browse Source

#52372 - OutOfMemoryError parsing a word file

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1793602 13f79535-47bb-0310-9956-ffa450edef68
tags/REL_3_17_BETA1
Andreas Beeker 7 years ago
parent
commit
24f2dcf6cb
2 changed files with 245 additions and 228 deletions
  1. 245
    228
      src/java/org/apache/poi/hpsf/Section.java
  2. BIN
      test-data/hpsf/TestBug52372.doc

+ 245
- 228
src/java/org/apache/poi/hpsf/Section.java View File

@@ -22,7 +22,11 @@ import java.io.IOException;
import java.io.OutputStream;
import java.io.UnsupportedEncodingException;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;

import org.apache.commons.collections4.bidimap.TreeBidiMap;
@@ -30,11 +34,16 @@ import org.apache.poi.hpsf.wellknown.PropertyIDMap;
import org.apache.poi.hpsf.wellknown.SectionIDMap;
import org.apache.poi.util.CodePageUtil;
import org.apache.poi.util.LittleEndian;
import org.apache.poi.util.LittleEndianByteArrayInputStream;
import org.apache.poi.util.LittleEndianConsts;
import org.apache.poi.util.POILogFactory;
import org.apache.poi.util.POILogger;

/**
* Represents a section in a {@link PropertySet}.
*/
public class Section {
private static final POILogger LOG = POILogFactory.getLogger(Section.class);

/**
* Maps property IDs to section-private PID strings. These
@@ -62,7 +71,7 @@ public class Section {
/**
* The offset of the section in the stream.
*/
private long offset = -1;
private final long _offset;

/**
* The section's size in bytes.
@@ -72,7 +81,7 @@ public class Section {
/**
* This section's properties.
*/
private final Map<Long,Property> properties = new TreeMap<Long,Property>();
private final Map<Long,Property> properties = new LinkedHashMap<Long,Property>();

/**
* This member is {@code true} if the last call to {@link
@@ -85,6 +94,7 @@ public class Section {
* Creates an empty {@link Section}.
*/
public Section() {
this._offset = -1;
}

/**
@@ -96,6 +106,7 @@ public class Section {
* @param s The section set to copy
*/
public Section(final Section s) {
this._offset = -1;
setFormatID(s.getFormatID());
for (Property p : s.properties.values()) {
properties.put(p.getID(), new MutableProperty(p));
@@ -117,32 +128,38 @@ public class Section {
*/
@SuppressWarnings("unchecked")
public Section(final byte[] src, final int offset) throws UnsupportedEncodingException {
int o1 = offset;

/*
* Read the format ID.
*/
formatID = new ClassID(src, o1);
o1 += ClassID.LENGTH;
formatID = new ClassID(src, offset);

/*
* Read the offset from the stream's start and positions to
* the section header.
*/
this.offset = LittleEndian.getUInt(src, o1);
o1 = (int) this.offset;
int offFix = (int)LittleEndian.getUInt(src, offset + ClassID.LENGTH);

// some input files have a invalid (padded?) offset, which need to be fixed
// search for beginning of size field
if (src[offFix] == 0) {
for (int i=0; i<3 && src[offFix] == 0; i++,offFix++);
// cross check with propertyCount field and the property list field
for (int i=0; i<3 && (src[offFix+3] != 0 || src[offFix+7] != 0 || src[offFix+11] != 0); i++,offFix--);
}

this._offset = offFix;

LittleEndianByteArrayInputStream leis = new LittleEndianByteArrayInputStream(src, offFix);

/*
* Read the section length.
*/
size = (int) LittleEndian.getUInt(src, o1);
o1 += LittleEndian.INT_SIZE;
size = (int)leis.readUInt();

/*
* Read the number of properties.
*/
final int propertyCount = (int) LittleEndian.getUInt(src, o1);
o1 += LittleEndian.INT_SIZE;
final int propertyCount = (int)leis.readUInt();

/*
* Read the properties. The offset is positioned at the first
@@ -169,64 +186,65 @@ public class Section {
* seconds pass reads the other properties.
*/
/* Pass 1: Read the property list. */
int pass1Offset = o1;
long cpOffset = -1;
final TreeBidiMap<Long,Long> offset2Id = new TreeBidiMap<Long,Long>();
for (int i = 0; i < propertyCount; i++) {
/* Read the property ID. */
long id = LittleEndian.getUInt(src, pass1Offset);
pass1Offset += LittleEndian.INT_SIZE;
long id = (int)leis.readUInt();

/* Offset from the section's start. */
long off = LittleEndian.getUInt(src, pass1Offset);
pass1Offset += LittleEndian.INT_SIZE;
long off = (int)leis.readUInt();

offset2Id.put(off, id);
if (id == PropertyIDMap.PID_CODEPAGE) {
cpOffset = off;
}
}

Long cpOffset = offset2Id.getKey((long)PropertyIDMap.PID_CODEPAGE);

/* Look for the codepage. */
int codepage = -1;
if (cpOffset != -1) {
if (cpOffset != null) {
/* Read the property's value type. It must be VT_I2. */
long o = this.offset + cpOffset;
final long type = LittleEndian.getUInt(src, (int)o);
o += LittleEndian.INT_SIZE;
leis.setReadIndex((int)(this._offset + cpOffset));
final long type = leis.readUInt();

if (type != Variant.VT_I2) {
throw new HPSFRuntimeException
("Value type of property ID 1 is not VT_I2 but " +
type + ".");
("Value type of property ID 1 is not VT_I2 but " + type + ".");
}

/* Read the codepage number. */
codepage = LittleEndian.getUShort(src, (int)o);
codepage = leis.readUShort();
}

/* Pass 2: Read all properties - including the codepage property,
* if available. */
for (Map.Entry<Long,Long> me : offset2Id.entrySet()) {
long off = me.getKey();
long id = me.getValue();
Property p;
if (id == PropertyIDMap.PID_CODEPAGE) {
p = new Property(PropertyIDMap.PID_CODEPAGE, Variant.VT_I2, codepage);

int pLen = propLen(offset2Id, off, size);
leis.setReadIndex((int)(this._offset + off));

if (id == PropertyIDMap.PID_DICTIONARY) {
leis.mark(100000);
if (!readDictionary(leis, pLen, codepage)) {
// there was an error reading the dictionary, maybe because the pid (0) was used wrong
// try reading a property instead
leis.reset();
try {
// fix id
id = Math.max(PropertyIDMap.PID_MAX, offset2Id.inverseBidiMap().lastKey())+1;
setProperty(new MutableProperty(id, leis, pLen, codepage));
} catch (RuntimeException e) {
LOG.log(POILogger.INFO, "Dictionary fallback failed - ignoring property");
}
};
} else if (id == PropertyIDMap.PID_CODEPAGE) {
setCodepage(codepage);
} else {
int pLen = propLen(offset2Id, off, size);
long o = this.offset + off;
p = new Property(id, src, o, pLen, codepage);
setProperty(new MutableProperty(id, leis, pLen, codepage));
}
properties.put(id, p);
}

/*
* Extract the dictionary (if available).
*/
dictionary = (Map<Long,String>) getProperty(0);
}

/**
@@ -291,7 +309,7 @@ public class Section {
* @return The offset of the section in the stream.
*/
public long getOffset() {
return offset;
return _offset;
}

/**
@@ -344,11 +362,10 @@ public class Section {
* Sets the string value of the property with the specified ID.
*
* @param id The property's ID
* @param value The property's value. It will be written as a Unicode
* string.
* @param value The property's value.
*/
public void setProperty(final int id, final String value) {
setProperty(id, Variant.VT_LPWSTR, value);
setProperty(id, Variant.VT_LPSTR, value);
}

/**
@@ -411,8 +428,9 @@ public class Section {
* @see #getProperty
* @see Variant
*/
@SuppressWarnings("deprecation")
public void setProperty(final int id, final long variantType, final Object value) {
setProperty(new Property(id, variantType, value));
setProperty(new MutableProperty(id, variantType, value));
}


@@ -591,11 +609,12 @@ public class Section {
*/
public String getPIDString(final long pid) {
String s = null;
if (dictionary != null) {
s = dictionary.get(Long.valueOf(pid));
Map<Long,String> dic = getDictionary();
if (dic != null) {
s = dic.get(pid);
}
if (s == null) {
s = SectionIDMap.getPIDString(getFormatID().getBytes(), pid);
s = SectionIDMap.getPIDString(getFormatID(), pid);
}
return s;
}
@@ -614,19 +633,6 @@ public class Section {
}
}

/**
* Sets the codepage.
*
* @param codepage the codepage
*/
public void setCodepage(final int codepage)
{
setProperty(PropertyIDMap.PID_CODEPAGE, Variant.VT_I2,
Integer.valueOf(codepage));
}



/**
* Checks whether this section is equal to another object. The result is
* {@code false} if one of the the following conditions holds:
@@ -651,8 +657,9 @@ public class Section {
* @return {@code true} if the objects are equal, {@code false} if
* not
*/
@Override
public boolean equals(final Object o) {
if (o == null || !(o instanceof Section)) {
if (!(o instanceof Section)) {
return false;
}
final Section s = (Section) o;
@@ -660,59 +667,26 @@ public class Section {
return false;
}

/* Compare all properties except 0 and 1 as they must be handled
* specially. */
Property[] pa1 = new Property[getProperties().length];
Property[] pa2 = new Property[s.getProperties().length];
System.arraycopy(getProperties(), 0, pa1, 0, pa1.length);
System.arraycopy(s.getProperties(), 0, pa2, 0, pa2.length);

/* Extract properties 0 and 1 and remove them from the copy of the
* arrays. */
Property p10 = null;
Property p20 = null;
for (int i = 0; i < pa1.length; i++) {
final long id = pa1[i].getID();
if (id == 0) {
p10 = pa1[i];
pa1 = remove(pa1, i);
i--;
}
if (id == 1) {
pa1 = remove(pa1, i);
i--;
/* Compare all properties except the dictionary (id 0) and
* the codepage (id 1 / ignored) as they must be handled specially. */
Set<Long> propIds = new HashSet<Long>(properties.keySet());
propIds.addAll(s.properties.keySet());
propIds.remove(0L);
propIds.remove(1L);

for (Long id : propIds) {
Property p1 = properties.get(id);
Property p2 = s.properties.get(id);
if (p1 == null || p2 == null || !p1.equals(p2)) {
return false;
}
}
for (int i = 0; i < pa2.length; i++) {
final long id = pa2[i].getID();
if (id == 0) {
p20 = pa2[i];
pa2 = remove(pa2, i);
i--;
}
if (id == 1) {
pa2 = remove(pa2, i);
i--;
}
}

/* If the number of properties (not counting property 1) is unequal the
* sections are unequal. */
if (pa1.length != pa2.length) {
return false;
}

/* If the dictionaries are unequal the sections are unequal. */
boolean dictionaryEqual = true;
if (p10 != null && p20 != null) {
dictionaryEqual = p10.getValue().equals(p20.getValue());
} else if (p10 != null || p20 != null) {
dictionaryEqual = false;
}
if (dictionaryEqual) {
return Util.equals(pa1, pa2);
}
return false;
Map<Long,String> d1 = getDictionary();
Map<Long,String> d2 = s.getDictionary();

return (d1 == null && d2 == null) || (d1 != null && d2 != null && d1.equals(d2));
}

/**
@@ -724,22 +698,6 @@ public class Section {
dirty |= (properties.remove(id) != null);
}

/**
* Removes a field from a property array. The resulting array is
* compactified and returned.
*
* @param pa The property array.
* @param i The index of the field to be removed.
* @return the compactified array.
*/
private Property[] remove(final Property[] pa, final int i) {
final Property[] h = new Property[pa.length - 1];
if (i > 0) {
System.arraycopy(pa, 0, h, 0, i);
}
System.arraycopy(pa, i + 1, h, i, h.length - i);
return h;
}
/**
* Writes this section into an output stream.<p>
*
@@ -763,6 +721,17 @@ public class Section {
return sectionBytes.length;
}

/* Writing the section's dictionary it tricky. If there is a dictionary
* (property 0) the codepage property (property 1) must be set, too. */
int codepage = getCodepage();
if (codepage == -1) {
String msg =
"The codepage property is not set although a dictionary is present. "+
"Defaulting to ISO-8859-1.";
LOG.log(POILogger.WARN, msg);
codepage = Property.DEFAULT_CODEPAGE;
}

/* The properties are written to this stream. */
final ByteArrayOutputStream propertyStream = new ByteArrayOutputStream();

@@ -777,79 +746,120 @@ public class Section {
/* Increase the position variable by the size of the property list so
* that it points behind the property list and to the beginning of the
* properties themselves. */
position += 2 * LittleEndian.INT_SIZE + getPropertyCount() * 2 * LittleEndian.INT_SIZE;
position += 2 * LittleEndianConsts.INT_SIZE + getPropertyCount() * 2 * LittleEndianConsts.INT_SIZE;

/* Writing the section's dictionary it tricky. If there is a dictionary
* (property 0) the codepage property (property 1) must be set, too. */
int codepage = -1;
if (getProperty(PropertyIDMap.PID_DICTIONARY) != null) {
final Object p1 = getProperty(PropertyIDMap.PID_CODEPAGE);
if (p1 != null) {
if (!(p1 instanceof Integer)) {
throw new IllegalPropertySetDataException
("The codepage property (ID = 1) must be an " +
"Integer object.");
}
} else {
/* Warning: The codepage property is not set although a
* dictionary is present. In order to cope with this problem we
* add the codepage property and set it to Unicode. */
setProperty(PropertyIDMap.PID_CODEPAGE, Variant.VT_I2,
Integer.valueOf(CodePageUtil.CP_UNICODE));
}
codepage = getCodepage();
}
/* Write the properties and the property list into their respective
* streams: */
for (Property p : properties.values()) {
final long id = p.getID();

/* Write the property list entry. */
TypeWriter.writeUIntToStream(propertyListStream, p.getID());
TypeWriter.writeUIntToStream(propertyListStream, position);
LittleEndian.putUInt(id, propertyListStream);
LittleEndian.putUInt(position, propertyListStream);

/* If the property ID is not equal 0 we write the property and all
* is fine. However, if it equals 0 we have to write the section's
* dictionary which has an implicit type only and an explicit
* value. */
if (id != 0)
if (id != 0) {
/* Write the property and update the position to the next
* property. */
position += p.write(propertyStream, getCodepage());
else
{
if (codepage == -1)
throw new IllegalPropertySetDataException
("Codepage (property 1) is undefined.");
position += writeDictionary(propertyStream, dictionary,
codepage);
position += p.write(propertyStream, codepage);
} else {
if (codepage == -1) {
throw new IllegalPropertySetDataException("Codepage (property 1) is undefined.");
}
position += writeDictionary(propertyStream, codepage);
}
}
propertyStream.close();
propertyListStream.close();

/* Write the section: */
byte[] pb1 = propertyListStream.toByteArray();
byte[] pb2 = propertyStream.toByteArray();
int streamLength = LittleEndianConsts.INT_SIZE * 2 + propertyListStream.size() + propertyStream.size();

/* Write the section's length: */
TypeWriter.writeToStream(out, LittleEndian.INT_SIZE * 2 +
pb1.length + pb2.length);
LittleEndian.putInt(streamLength, out);

/* Write the section's number of properties: */
TypeWriter.writeToStream(out, getPropertyCount());
LittleEndian.putInt(getPropertyCount(), out);

/* Write the property list: */
out.write(pb1);
propertyListStream.writeTo(out);

/* Write the properties: */
out.write(pb2);
propertyStream.writeTo(out);

int streamLength = LittleEndian.INT_SIZE * 2 + pb1.length + pb2.length;
return streamLength;
}

/**
* Reads a dictionary.
*
* @param leis The byte stream containing the bytes making out the dictionary.
* @param length The dictionary contains at most this many bytes.
* @param codepage The codepage of the string values.
*
* @return {@code true} if dictionary was read successful, {@code false} otherwise
*
* @throws UnsupportedEncodingException if the dictionary's codepage is not
* (yet) supported.
*/
private boolean readDictionary(LittleEndianByteArrayInputStream leis, final int length, final int codepage)
throws UnsupportedEncodingException {
Map<Long,String> dic = new HashMap<Long,String>();

/*
* Read the number of dictionary entries.
*/
final long nrEntries = leis.readUInt();

long id = -1;
boolean isCorrupted = false;
for (int i = 0; i < nrEntries; i++) {
String errMsg =
"The property set's dictionary contains bogus data. "
+ "All dictionary entries starting with the one with ID "
+ id + " will be ignored.";

/* The key. */
id = leis.readUInt();

/* The value (a string). The length is the either the
* number of (two-byte) characters if the character set is Unicode
* or the number of bytes if the character set is not Unicode.
* The length includes terminating 0x00 bytes which we have to strip
* off to create a Java string. */
long sLength = leis.readUInt();

/* Read the string - Strip 0x00 characters from the end of the string. */
int cp = (codepage == -1) ? Property.DEFAULT_CODEPAGE : codepage;
int nrBytes = (int)((sLength-1) * (cp == CodePageUtil.CP_UNICODE ? 2 : 1));
if (nrBytes > 0xFFFFFF) {
LOG.log(POILogger.WARN, errMsg);
isCorrupted = true;
break;
}

try {
byte buf[] = new byte[nrBytes];
leis.readFully(buf, 0, nrBytes);
final String str = CodePageUtil.getStringFromCodePage(buf, 0, nrBytes, cp);

int pad = 1;
if (cp == CodePageUtil.CP_UNICODE) {
pad = 2+((4 - ((nrBytes+2) & 0x3)) & 0x3);
}
leis.skip(pad);

dic.put(id, str);
} catch (RuntimeException ex) {
LOG.log(POILogger.WARN, errMsg, ex);
isCorrupted = true;
break;
}
}
setDictionary(dic);
return !isCorrupted;
}


/**
@@ -861,48 +871,37 @@ public class Section {
* @return The number of bytes written
* @exception IOException if an I/O exception occurs.
*/
private static int writeDictionary(final OutputStream out, final Map<Long,String> dictionary, final int codepage)
private int writeDictionary(final OutputStream out, final int codepage)
throws IOException {
int length = TypeWriter.writeUIntToStream(out, dictionary.size());
for (Map.Entry<Long,String> ls : dictionary.entrySet()) {
final Long key = ls.getKey();
final String value = ls.getValue();
final byte padding[] = new byte[4];
Map<Long,String> dic = getDictionary();

LittleEndian.putUInt(dic.size(), out);
int length = LittleEndianConsts.INT_SIZE;
for (Map.Entry<Long,String> ls : dic.entrySet()) {

LittleEndian.putUInt(ls.getKey(), out);
length += LittleEndianConsts.INT_SIZE;

String value = ls.getValue()+"\0";
LittleEndian.putUInt( value.length(), out );
length += LittleEndianConsts.INT_SIZE;

byte bytes[] = CodePageUtil.getBytesInCodePage(value, codepage);
out.write(bytes);
length += bytes.length;

if (codepage == CodePageUtil.CP_UNICODE) {
/* Write the dictionary item in Unicode. */
int sLength = value.length() + 1;
if ((sLength & 1) == 1) {
sLength++;
}
length += TypeWriter.writeUIntToStream(out, key.longValue());
length += TypeWriter.writeUIntToStream(out, sLength);
final byte[] ca = CodePageUtil.getBytesInCodePage(value, codepage);
for (int j = 2; j < ca.length; j += 2) {
out.write(ca[j+1]);
out.write(ca[j]);
length += 2;
}
sLength -= value.length();
while (sLength > 0) {
out.write(0x00);
out.write(0x00);
length += 2;
sLength--;
}
} else {
/* Write the dictionary item in another codepage than
* Unicode. */
length += TypeWriter.writeUIntToStream(out, key.longValue());
length += TypeWriter.writeUIntToStream(out, value.length() + 1L);
final byte[] ba = CodePageUtil.getBytesInCodePage(value, codepage);
for (int j = 0; j < ba.length; j++) {
out.write(ba[j]);
length++;
}
out.write(0x00);
length++;
int pad = (4 - (length & 0x3)) & 0x3;
out.write(padding, 0, pad);
length += pad;
}
}

int pad = (4 - (length & 0x3)) & 0x3;
out.write(padding, 0, pad);
length += pad;

return length;
}

@@ -924,25 +923,27 @@ public class Section {
*/
public void setDictionary(final Map<Long,String> dictionary) throws IllegalPropertySetDataException {
if (dictionary != null) {
this.dictionary = dictionary;
if (this.dictionary == null) {
this.dictionary = new TreeMap<Long,String>();
}
this.dictionary.putAll(dictionary);

/* If the codepage property (ID 1) for the strings (keys and values)
* used in the dictionary is not yet defined, set it to ISO-8859-1. */
int cp = getCodepage();
if (cp == -1) {
setCodepage(Property.DEFAULT_CODEPAGE);
}

/* Set the dictionary property (ID 0). Please note that the second
* parameter in the method call below is unused because dictionaries
* don't have a type. */
setProperty(PropertyIDMap.PID_DICTIONARY, -1, dictionary);

/* If the codepage property (ID 1) for the strings (keys and
* values) used in the dictionary is not yet defined, set it to
* Unicode. */
final Integer codepage = (Integer) getProperty(PropertyIDMap.PID_CODEPAGE);
if (codepage == null) {
setProperty(PropertyIDMap.PID_CODEPAGE, Variant.VT_I2,
Integer.valueOf(CodePageUtil.CP_UNICODE));
}
} else {
/* Setting the dictionary to null means to remove property 0.
* However, it does not mean to remove property 1 (codepage). */
removeProperty(PropertyIDMap.PID_DICTIONARY);
this.dictionary = null;
}
}

@@ -951,6 +952,7 @@ public class Section {
/**
* @see Object#hashCode()
*/
@Override
public int hashCode() {
long hashCode = 0;
hashCode += getFormatID().hashCode();
@@ -967,9 +969,11 @@ public class Section {
/**
* @see Object#toString()
*/
@Override
public String toString() {
final StringBuffer b = new StringBuffer();
final Property[] pa = getProperties();
b.append("\n\n\n");
b.append(getClass().getName());
b.append('[');
b.append("formatID: ");
@@ -981,8 +985,12 @@ public class Section {
b.append(", size: ");
b.append(getSize());
b.append(", properties: [\n");
for (int i = 0; i < pa.length; i++) {
b.append(pa[i]);
int codepage = getCodepage();
if (codepage == -1) {
codepage = Property.DEFAULT_CODEPAGE;
}
for (Property p : pa) {
b.append(p.toString(codepage));
b.append(",\n");
}
b.append(']');
@@ -1002,7 +1010,12 @@ public class Section {
* @return the dictionary or {@code null} if the section does not have
* a dictionary.
*/
@SuppressWarnings("unchecked")
public Map<Long,String> getDictionary() {
if (dictionary == null) {
dictionary = (Map<Long,String>) getProperty(PropertyIDMap.PID_DICTIONARY);
}

return dictionary;
}

@@ -1013,13 +1026,17 @@ public class Section {
*
* @return The section's codepage if one is defined, else -1.
*/
public int getCodepage()
{
public int getCodepage() {
final Integer codepage = (Integer) getProperty(PropertyIDMap.PID_CODEPAGE);
if (codepage == null) {
return -1;
}
int cp = codepage.intValue();
return cp;
return (codepage == null) ? -1 : codepage.intValue();
}

/**
* Sets the codepage.
*
* @param codepage the codepage
*/
public void setCodepage(final int codepage) {
setProperty(PropertyIDMap.PID_CODEPAGE, Variant.VT_I2, codepage);
}
}

BIN
test-data/hpsf/TestBug52372.doc View File


Loading…
Cancel
Save