diff options
author | Marius Volkhart <mariusvolkhart@apache.org> | 2021-02-28 18:49:42 +0000 |
---|---|---|
committer | Marius Volkhart <mariusvolkhart@apache.org> | 2021-02-28 18:49:42 +0000 |
commit | 9c88bb220136cdeb1355c6f98d788271d701f718 (patch) | |
tree | bbffb733b721b56c9a7347c57c583f9867b9b4a8 /src/scratchpad | |
parent | fab0ec3e088fbc113fe91bcd36877d966688f2d9 (diff) | |
download | poi-9c88bb220136cdeb1355c6f98d788271d701f718.tar.gz poi-9c88bb220136cdeb1355c6f98d788271d701f718.zip |
Rework EscherRecordHolder parsing
Modify the parsing done by EscherRecordHolder to be more deterministic. The format of the OfficeArtContent structure, which the EscherRecordHolder represents, is well defined in the MS-DOC spec. A clear class structure makes it easier to reason about the availability of data.
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1887008 13f79535-47bb-0310-9956-ffa450edef68
Diffstat (limited to 'src/scratchpad')
-rw-r--r-- | src/scratchpad/src/org/apache/poi/hwpf/model/EscherRecordHolder.java | 192 |
1 file changed, 86 insertions, 106 deletions
diff --git a/src/scratchpad/src/org/apache/poi/hwpf/model/EscherRecordHolder.java b/src/scratchpad/src/org/apache/poi/hwpf/model/EscherRecordHolder.java index ccc0a7e42c..8d5c392860 100644 --- a/src/scratchpad/src/org/apache/poi/hwpf/model/EscherRecordHolder.java +++ b/src/scratchpad/src/org/apache/poi/hwpf/model/EscherRecordHolder.java @@ -18,15 +18,18 @@ package org.apache.poi.hwpf.model; import java.util.ArrayList; -import java.util.Iterator; import java.util.List; +import org.apache.logging.log4j.LogManager; import org.apache.poi.ddf.DefaultEscherRecordFactory; import org.apache.poi.ddf.EscherContainerRecord; import org.apache.poi.ddf.EscherRecord; import org.apache.poi.ddf.EscherRecordFactory; +import org.apache.poi.ddf.EscherRecordTypes; import org.apache.poi.util.Internal; +import static org.apache.logging.log4j.util.Unbox.box; + /** * Based on AbstractEscherRecordHolder from HSSF. * @@ -34,7 +37,27 @@ import org.apache.poi.util.Internal; */ @Internal public final class EscherRecordHolder { - private final ArrayList<EscherRecord> escherRecords = new ArrayList<>(); + + /** + * {@link EscherRecordTypes#DGG_CONTAINER} containing drawing group information for the document. + */ + private final EscherContainerRecord drawingGroupData = new EscherContainerRecord(); + + /** + * {@link EscherRecordTypes#DG_CONTAINER} for drawings in the Main Document. + * <p> + * {@code null} to indicate that the document does not have a {@link EscherRecordTypes#DG_CONTAINER} for the Main + * Document. + */ + private EscherContainerRecord mainDocumentDgContainer; + + /** + * {@link EscherRecordTypes#DG_CONTAINER} for drawings in the Header Document. + * <p> + * {@code null} to indicate that the document does not have a {@link EscherRecordTypes#DG_CONTAINER} for the Header + * Document. 
+ */ + private EscherContainerRecord headerDocumentDgContainer; public EscherRecordHolder(byte[] data, int offset, int size) { fillEscherRecords(data, offset, size); @@ -47,129 +70,77 @@ public final class EscherRecordHolder { * * @see FileInformationBlock#getLcbDggInfo() */ - private void fillEscherRecords(byte[] data, int offset, int size) - { + private void fillEscherRecords(byte[] data, int offset, int size) { if (size == 0) return; EscherRecordFactory recordFactory = new DefaultEscherRecordFactory(); int pos = offset; - while ( pos < offset + size) - { - EscherRecord r = recordFactory.createRecord(data, pos); - escherRecords.add(r); - int bytesRead = r.fillFields(data, pos, recordFactory); - pos += bytesRead + 1; // There is an empty byte between each top-level record in a Word doc - } - } - - public List<EscherRecord> getEscherRecords() { - return escherRecords; - } - - @Override - public String toString() { - StringBuilder buffer = new StringBuilder(); - - if (escherRecords.size() == 0) { - buffer.append("No Escher Records Decoded").append("\n"); - } - Iterator<EscherRecord> iterator = escherRecords.iterator(); - while (iterator.hasNext()) { - EscherRecord r = iterator.next(); - buffer.append(r); - } - return buffer.toString(); - } - - /** - * If we have a EscherContainerRecord as one of our - * children (and most top level escher holders do), - * then return that. - */ - public EscherContainerRecord getEscherContainer() { - for(Iterator<EscherRecord> it = escherRecords.iterator(); it.hasNext();) { - Object er = it.next(); - if(er instanceof EscherContainerRecord) { - return (EscherContainerRecord)er; + pos += drawingGroupData.fillFields(data, pos, recordFactory); + assert drawingGroupData.getRecordId() == EscherRecordTypes.DGG_CONTAINER.typeID; + + /* + * After the drawingGroupData there is an array (2 slots max) that has data about drawings. According to the + * spec, the first slot is for the Main Document, the second for the Header Document. 
Additionally, the + * OfficeArtWordDrawing structure has a byte (dgglbl) that indicates whether the structure is for the Main or + * Header Document. In practice we've seen documents such as 61911.doc where the order of array entries does not + * match the dgglbl byte. As the byte is more likely to be reliable, we base the parsing off of that rather than + * array order. + */ + + // This should loop at most twice + while (pos < offset + size) { + + // Named this way to match section 2.9.172 of [MS-DOC] - v20191119. + byte dgglbl = data[pos]; + assert dgglbl == 0x00 || dgglbl == 0x01; + pos++; + + EscherContainerRecord dgContainer = new EscherContainerRecord(); + pos+= dgContainer.fillFields(data, pos, recordFactory); + assert dgContainer.getRecordId() == EscherRecordTypes.DG_CONTAINER.typeID; + + switch (dgglbl) { + case 0x00: + mainDocumentDgContainer = dgContainer; + break; + case 0x01: + headerDocumentDgContainer = dgContainer; + break; + default: + LogManager.getLogger(EscherRecordHolder.class).atWarn() + .log("dgglbl {} for OfficeArtWordDrawing is out of bounds [0, 1]", box(dgglbl)); } } - return null; - } - /** - * Descends into all our children, returning the - * first EscherRecord with the given id, or null - * if none found - */ - public EscherRecord findFirstWithId(short id) { - return findFirstWithId(id, getEscherRecords()); + assert pos == offset + size; } - private static EscherRecord findFirstWithId(short id, List<EscherRecord> records) { - // Check at our level - for(Iterator<EscherRecord> it = records.iterator(); it.hasNext();) { - EscherRecord r = it.next(); - if(r.getRecordId() == id) { - return r; - } - } - // Then check our children in turn - for(Iterator<EscherRecord> it = records.iterator(); it.hasNext();) { - EscherRecord r = it.next(); - if(r.isContainerRecord()) { - EscherRecord found = findFirstWithId(id, r.getChildRecords()); - if(found != null) { - return found; - } - } - } - - // Not found in this lot - return null; + public 
List<EscherRecord> getEscherRecords() { + return drawingGroupData.getChildRecords(); } - public List<? extends EscherContainerRecord> getDgContainers() - { - List<EscherContainerRecord> dgContainers = new ArrayList<>( - 1); - for ( EscherRecord escherRecord : getEscherRecords() ) - { - if ( escherRecord.getRecordId() == (short) 0xF002 ) - { - dgContainers.add( (EscherContainerRecord) escherRecord ); - } + public List<? extends EscherContainerRecord> getDgContainers() { + List<EscherContainerRecord> dgContainers = new ArrayList<>(2); + if (mainDocumentDgContainer != null) { + dgContainers.add(mainDocumentDgContainer); } - return dgContainers; - } - - public List<? extends EscherContainerRecord> getDggContainers() - { - List<EscherContainerRecord> dggContainers = new ArrayList<>( - 1); - for ( EscherRecord escherRecord : getEscherRecords() ) - { - if ( escherRecord.getRecordId() == (short) 0xF000 ) - { - dggContainers.add( (EscherContainerRecord) escherRecord ); - } + if (headerDocumentDgContainer != null) { + dgContainers.add(headerDocumentDgContainer); } - return dggContainers; + return dgContainers; } public List<? 
extends EscherContainerRecord> getBStoreContainers() { List<EscherContainerRecord> bStoreContainers = new ArrayList<>( 1); - for ( EscherContainerRecord dggContainer : getDggContainers() ) - { - for ( EscherRecord escherRecord : dggContainer.getChildRecords() ) - { - if ( escherRecord.getRecordId() == (short) 0xF001 ) - { - bStoreContainers.add( (EscherContainerRecord) escherRecord ); - } - } - } + for ( EscherRecord escherRecord : drawingGroupData.getChildRecords() ) + { + if ( escherRecord.getRecordId() == (short) 0xF001 ) + { + bStoreContainers.add( (EscherContainerRecord) escherRecord ); + } + } return bStoreContainers; } @@ -206,4 +177,13 @@ public final class EscherRecordHolder { } return spContainers; } + + @Override + public String toString() { + return "EscherRecordHolder{" + + "drawingGroupData=" + drawingGroupData + + ", mainDocumentDgContainer=" + mainDocumentDgContainer + + ", headerDocumentDgContainer=" + headerDocumentDgContainer + + '}'; + } } |