From: Sergey Vladimirov Date: Thu, 28 Jul 2011 15:08:06 +0000 (+0000) Subject: add simpliest "escher" pictures support in Word-to-HTML and Word-to-FO converters X-Git-Tag: REL_3_8_BETA4~51 X-Git-Url: https://source.dussan.org/?a=commitdiff_plain;h=984ad2c749f4889fb504e31e0b9d583b22d1bb90;p=poi.git add simpliest "escher" pictures support in Word-to-HTML and Word-to-FO converters git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1151888 13f79535-47bb-0310-9956-ffa450edef68 --- diff --git a/src/scratchpad/src/org/apache/poi/hwpf/HWPFDocument.java b/src/scratchpad/src/org/apache/poi/hwpf/HWPFDocument.java index b2d4660459..fa8a47649c 100644 --- a/src/scratchpad/src/org/apache/poi/hwpf/HWPFDocument.java +++ b/src/scratchpad/src/org/apache/poi/hwpf/HWPFDocument.java @@ -310,8 +310,8 @@ public final class HWPFDocument extends HWPFDocumentCore _officeArts = new ShapesTable(_tableStream, _fib); // And escher pictures - _officeDrawingsHeaders = new OfficeDrawingsImpl( _fspaHeaders, _escherRecordHolder ); - _officeDrawingsMain = new OfficeDrawingsImpl( _fspaMain , _escherRecordHolder); + _officeDrawingsHeaders = new OfficeDrawingsImpl( _fspaHeaders, _escherRecordHolder, _mainStream ); + _officeDrawingsMain = new OfficeDrawingsImpl( _fspaMain , _escherRecordHolder, _mainStream); _st = new SectionTable(_mainStream, _tableStream, _fib.getFcPlcfsed(), _fib.getLcbPlcfsed(), fcMin, _tpt, _cpSplit); _ss = new StyleSheet(_tableStream, _fib.getFcStshf()); diff --git a/src/scratchpad/src/org/apache/poi/hwpf/converter/AbstractWordConverter.java b/src/scratchpad/src/org/apache/poi/hwpf/converter/AbstractWordConverter.java index b65e8cf713..afb5a5f701 100644 --- a/src/scratchpad/src/org/apache/poi/hwpf/converter/AbstractWordConverter.java +++ b/src/scratchpad/src/org/apache/poi/hwpf/converter/AbstractWordConverter.java @@ -41,6 +41,7 @@ import org.apache.poi.hwpf.usermodel.Notes; import org.apache.poi.hwpf.usermodel.OfficeDrawing; import org.apache.poi.hwpf.usermodel.Paragraph; import org.apache.poi.hwpf.usermodel.Picture; +import org.apache.poi.hwpf.usermodel.PictureType; import org.apache.poi.hwpf.usermodel.Range; import org.apache.poi.hwpf.usermodel.Section; import org.apache.poi.hwpf.usermodel.Table; @@ -578,7 +579,10 @@ public abstract class AbstractWordConverter protected void processDrawnObject( HWPFDocument doc, CharacterRun characterRun, Element block ) { - // main? + if ( getPicturesManager() == null ) + return; + + // TODO: support headers OfficeDrawing officeDrawing = doc.getOfficeDrawingsMain() .getOfficeDrawingAt( characterRun.getStartOffset() ); if ( officeDrawing == null ) @@ -588,10 +592,22 @@ public abstract class AbstractWordConverter return; } - // TODO: do something :) + byte[] pictureData = officeDrawing.getPictureData(); + if ( pictureData == null ) + // usual shape? + return; + final PictureType type = PictureType.findMatchingType( pictureData ); + String path = getPicturesManager().savePicture( pictureData, type, + "s" + characterRun.getStartOffset() + "." + type ); + + processDrawnObject( doc, characterRun, officeDrawing, path, block ); } + protected abstract void processDrawnObject( HWPFDocument doc, + CharacterRun characterRun, OfficeDrawing officeDrawing, + String path, Element block ); + protected abstract void processEndnoteAutonumbered( HWPFDocument doc, int noteIndex, Element block, Range endnoteTextRange ); diff --git a/src/scratchpad/src/org/apache/poi/hwpf/converter/HtmlDocumentFacade.java b/src/scratchpad/src/org/apache/poi/hwpf/converter/HtmlDocumentFacade.java index a6e38073ce..a02b2942c5 100644 --- a/src/scratchpad/src/org/apache/poi/hwpf/converter/HtmlDocumentFacade.java +++ b/src/scratchpad/src/org/apache/poi/hwpf/converter/HtmlDocumentFacade.java @@ -113,6 +113,13 @@ public class HtmlDocumentFacade return basicLink; } + public Element createImage( String src ) + { + Element result = document.createElement( "img" ); + result.setAttribute( "src", src ); + return result; + } + public Element createLineBreak() { return document.createElement( "br" ); diff --git a/src/scratchpad/src/org/apache/poi/hwpf/converter/PicturesManager.java b/src/scratchpad/src/org/apache/poi/hwpf/converter/PicturesManager.java index ac408139cb..dbbca57003 100644 --- a/src/scratchpad/src/org/apache/poi/hwpf/converter/PicturesManager.java +++ b/src/scratchpad/src/org/apache/poi/hwpf/converter/PicturesManager.java @@ -16,7 +16,6 @@ ==================================================================== */ package org.apache.poi.hwpf.converter; -import org.apache.poi.hwpf.usermodel.Picture; import org.apache.poi.hwpf.usermodel.PictureType; /** @@ -35,11 +34,11 @@ public interface PicturesManager * {@link PictureType#WMF}. FO (Apache FOP) supports at least PNG and SVG * types. * - * @param picture - * Word picture + * @param content + * picture content * @return path to file that can be used as reference in HTML (img's src) of * XLS FO (fo:external-graphic's src) or null if image were * not saved and should not be referenced from result HTML / FO. */ - String savePicture( Picture picture ); + String savePicture( byte[] content, PictureType pictureType, String suggestedName ); } diff --git a/src/scratchpad/src/org/apache/poi/hwpf/converter/WordToFoConverter.java b/src/scratchpad/src/org/apache/poi/hwpf/converter/WordToFoConverter.java index 8121e2cd9c..653b44cec9 100644 --- a/src/scratchpad/src/org/apache/poi/hwpf/converter/WordToFoConverter.java +++ b/src/scratchpad/src/org/apache/poi/hwpf/converter/WordToFoConverter.java @@ -37,6 +37,7 @@ import org.apache.poi.hwpf.HWPFDocumentCore; import org.apache.poi.hwpf.converter.FontReplacer.Triplet; import org.apache.poi.hwpf.usermodel.Bookmark; import org.apache.poi.hwpf.usermodel.CharacterRun; +import org.apache.poi.hwpf.usermodel.OfficeDrawing; import org.apache.poi.hwpf.usermodel.Paragraph; import org.apache.poi.hwpf.usermodel.Picture; import org.apache.poi.hwpf.usermodel.Range; @@ -262,6 +263,16 @@ public class WordToFoConverter extends AbstractWordConverter foDocumentFacade.setDescription( summaryInformation.getComments() ); } + @Override + protected void processDrawnObject( HWPFDocument doc, + CharacterRun characterRun, OfficeDrawing officeDrawing, + String path, Element block ) + { + final Element externalGraphic = foDocumentFacade + .createExternalGraphic( path ); + block.appendChild( externalGraphic ); + } + @Override protected void processEndnoteAutonumbered( HWPFDocument doc, int noteIndex, Element block, Range endnoteTextRange ) @@ -364,7 +375,10 @@ public class WordToFoConverter extends AbstractWordConverter PicturesManager fileManager = getPicturesManager(); if ( fileManager != null ) { - String url = fileManager.savePicture( picture ); + String url = fileManager + .savePicture( picture.getContent(), + picture.suggestPictureType(), + picture.suggestFullFileName() ); if ( WordToFoUtils.isNotEmpty( url ) ) { diff --git a/src/scratchpad/src/org/apache/poi/hwpf/converter/WordToHtmlConverter.java b/src/scratchpad/src/org/apache/poi/hwpf/converter/WordToHtmlConverter.java index f17a76c130..beba077c79 100644 --- a/src/scratchpad/src/org/apache/poi/hwpf/converter/WordToHtmlConverter.java +++ b/src/scratchpad/src/org/apache/poi/hwpf/converter/WordToHtmlConverter.java @@ -34,6 +34,7 @@ import org.apache.poi.hwpf.HWPFDocumentCore; import org.apache.poi.hwpf.converter.FontReplacer.Triplet; import org.apache.poi.hwpf.usermodel.Bookmark; import org.apache.poi.hwpf.usermodel.CharacterRun; +import org.apache.poi.hwpf.usermodel.OfficeDrawing; import org.apache.poi.hwpf.usermodel.Paragraph; import org.apache.poi.hwpf.usermodel.Picture; import org.apache.poi.hwpf.usermodel.Range; @@ -61,6 +62,22 @@ import static org.apache.poi.hwpf.converter.AbstractWordUtils.TWIPS_PER_INCH; public class WordToHtmlConverter extends AbstractWordConverter { + /** + * Holds properties values, applied to current p element. Those + * properties shall not be doubled in children span elements. + */ + private static class BlockProperies + { + final String pFontName; + final int pFontSize; + + public BlockProperies( String pFontName, int pFontSize ) + { + this.pFontName = pFontName; + this.pFontSize = pFontSize; + } + } + private static final POILogger logger = POILogFactory .getLogger( WordToHtmlConverter.class ); @@ -253,6 +270,15 @@ public class WordToHtmlConverter extends AbstractWordConverter .addDescription( summaryInformation.getComments() ); } + @Override + protected void processDrawnObject( HWPFDocument doc, + CharacterRun characterRun, OfficeDrawing officeDrawing, + String path, Element block ) + { + Element img = htmlDocumentFacade.createImage( path ); + block.appendChild( img ); + } + @Override protected void processEndnoteAutonumbered( HWPFDocument doc, int noteIndex, Element block, Range endnoteTextRange ) @@ -302,7 +328,10 @@ public class WordToHtmlConverter extends AbstractWordConverter PicturesManager fileManager = getPicturesManager(); if ( fileManager != null ) { - String url = fileManager.savePicture( picture ); + String url = fileManager + .savePicture( picture.getContent(), + picture.suggestPictureType(), + picture.suggestFullFileName() ); if ( WordToHtmlUtils.isNotEmpty( url ) ) { @@ -388,8 +417,7 @@ public class WordToHtmlConverter extends AbstractWordConverter + "in;overflow:hidden;" ) ); root.appendChild( inner ); - Element image = htmlDocumentFacade.document.createElement( "img" ); - image.setAttribute( "src", imageSourcePath ); + Element image = htmlDocumentFacade.createImage( imageSourcePath ); image.setAttribute( "class", htmlDocumentFacade .getOrCreateCssClass( image.getTagName(), "i", "position:absolute;left:-" + cropLeft + ";top:-" @@ -401,8 +429,7 @@ public class WordToHtmlConverter extends AbstractWordConverter } else { - root = htmlDocumentFacade.document.createElement( "img" ); - root.setAttribute( "src", imageSourcePath ); + root = htmlDocumentFacade.createImage( imageSourcePath ); root.setAttribute( "style", "width:" + imageWidth + "in;height:" + imageHeight + "in;vertical-align:text-bottom;" ); } @@ -691,20 +718,4 @@ public class WordToHtmlConverter extends AbstractWordConverter } } - /** - * Holds properties values, applied to current p element. Those - * properties shall not be doubled in children span elements. - */ - private static class BlockProperies - { - final String pFontName; - final int pFontSize; - - public BlockProperies( String pFontName, int pFontSize ) - { - this.pFontName = pFontName; - this.pFontSize = pFontSize; - } - } - } diff --git a/src/scratchpad/src/org/apache/poi/hwpf/model/EscherRecordHolder.java b/src/scratchpad/src/org/apache/poi/hwpf/model/EscherRecordHolder.java index 310dbc52c3..329707832f 100644 --- a/src/scratchpad/src/org/apache/poi/hwpf/model/EscherRecordHolder.java +++ b/src/scratchpad/src/org/apache/poi/hwpf/model/EscherRecordHolder.java @@ -20,6 +20,7 @@ package org.apache.poi.hwpf.model; import java.util.ArrayList; import java.util.Iterator; import java.util.List; + import org.apache.poi.ddf.DefaultEscherRecordFactory; import org.apache.poi.ddf.EscherContainerRecord; import org.apache.poi.ddf.EscherRecord; @@ -119,4 +120,83 @@ public final class EscherRecordHolder { // Not found in this lot return null; } + + public List getDgContainers() + { + List dgContainers = new ArrayList( + 1 ); + for ( EscherRecord escherRecord : getEscherRecords() ) + { + if ( escherRecord.getRecordId() == (short) 0xF002 ) + { + dgContainers.add( (EscherContainerRecord) escherRecord ); + } + } + return dgContainers; + } + + public List getDggContainers() + { + List dggContainers = new ArrayList( + 1 ); + for ( EscherRecord escherRecord : getEscherRecords() ) + { + if ( escherRecord.getRecordId() == (short) 0xF000 ) + { + dggContainers.add( (EscherContainerRecord) escherRecord ); + } + } + return dggContainers; + } + + public List getBStoreContainers() + { + List bStoreContainers = new ArrayList( + 1 ); + for ( EscherContainerRecord dggContainer : getDggContainers() ) + { + for ( EscherRecord escherRecord : dggContainer.getChildRecords() ) + { + if ( escherRecord.getRecordId() == (short) 0xF001 ) + { + bStoreContainers.add( (EscherContainerRecord) escherRecord ); + } + } + } + return bStoreContainers; + } + + public List getSpgrContainers() + { + List spgrContainers = new ArrayList( + 1 ); + for ( EscherContainerRecord dgContainer : getDgContainers() ) + { + for ( EscherRecord escherRecord : dgContainer.getChildRecords() ) + { + if ( escherRecord.getRecordId() == (short) 0xF003 ) + { + spgrContainers.add( (EscherContainerRecord) escherRecord ); + } + } + } + return spgrContainers; + } + + public List getSpContainers() + { + List spContainers = new ArrayList( + 1 ); + for ( EscherContainerRecord spgrContainer : getSpgrContainers() ) + { + for ( EscherRecord escherRecord : spgrContainer.getChildRecords() ) + { + if ( escherRecord.getRecordId() == (short) 0xF004 ) + { + spContainers.add( (EscherContainerRecord) escherRecord ); + } + } + } + return spContainers; + } } diff --git a/src/scratchpad/src/org/apache/poi/hwpf/usermodel/OfficeDrawing.java b/src/scratchpad/src/org/apache/poi/hwpf/usermodel/OfficeDrawing.java index 20e3cc5222..002fa30b83 100644 --- a/src/scratchpad/src/org/apache/poi/hwpf/usermodel/OfficeDrawing.java +++ b/src/scratchpad/src/org/apache/poi/hwpf/usermodel/OfficeDrawing.java @@ -1,21 +1,43 @@ +/* ==================================================================== + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +==================================================================== */ package org.apache.poi.hwpf.usermodel; +/** + * User-friendly interface to office drawing objects + * + * @author Sergey Vladimirov (vlsergey {at} gmail {dot} com) + */ public interface OfficeDrawing { /** - * Shape Identifier + * Returns picture data if this shape has (single?) associated picture data */ - int getShapeId(); + byte[] getPictureData(); /** - * Left of rectangle enclosing shape relative to the origin of the shape + * Bottom of the rectangle enclosing shape relative to the origin of the + * shape */ - int getRectangleLeft(); + int getRectangleBottom(); /** - * Top of rectangle enclosing shape relative to the origin of the shape + * Left of rectangle enclosing shape relative to the origin of the shape */ - int getRectangleTop(); + int getRectangleLeft(); /** * Right of rectangle enclosing shape relative to the origin of the shape @@ -23,9 +45,13 @@ public interface OfficeDrawing int getRectangleRight(); /** - * Bottom of the rectangle enclosing shape relative to the origin of the - * shape + * Top of rectangle enclosing shape relative to the origin of the shape */ - int getRectangleBottom(); + int getRectangleTop(); + + /** + * Shape Identifier + */ + int getShapeId(); } diff --git a/src/scratchpad/src/org/apache/poi/hwpf/usermodel/OfficeDrawings.java b/src/scratchpad/src/org/apache/poi/hwpf/usermodel/OfficeDrawings.java index 672d5e193e..58661aa8dd 100644 --- a/src/scratchpad/src/org/apache/poi/hwpf/usermodel/OfficeDrawings.java +++ b/src/scratchpad/src/org/apache/poi/hwpf/usermodel/OfficeDrawings.java @@ -1,7 +1,28 @@ +/* ==================================================================== + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +==================================================================== */ package org.apache.poi.hwpf.usermodel; import java.util.Collection; +/** + * User-friendly interface to access document part's office drawings + * + * @author Sergey Vladimirov (vlsergey {at} gmail {dot} com) + */ public interface OfficeDrawings { OfficeDrawing getOfficeDrawingAt( int characterPosition ); diff --git a/src/scratchpad/src/org/apache/poi/hwpf/usermodel/OfficeDrawingsImpl.java b/src/scratchpad/src/org/apache/poi/hwpf/usermodel/OfficeDrawingsImpl.java index 933821c147..370cacd707 100644 --- a/src/scratchpad/src/org/apache/poi/hwpf/usermodel/OfficeDrawingsImpl.java +++ b/src/scratchpad/src/org/apache/poi/hwpf/usermodel/OfficeDrawingsImpl.java @@ -1,3 +1,19 @@ +/* ==================================================================== + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +==================================================================== */ package org.apache.poi.hwpf.usermodel; import java.util.ArrayList; @@ -5,6 +21,16 @@ import java.util.Collection; import java.util.Collections; import java.util.List; +import org.apache.poi.ddf.DefaultEscherRecordFactory; +import org.apache.poi.ddf.EscherBSERecord; +import org.apache.poi.ddf.EscherBlipRecord; +import org.apache.poi.ddf.EscherContainerRecord; +import org.apache.poi.ddf.EscherOptRecord; +import org.apache.poi.ddf.EscherProperties; +import org.apache.poi.ddf.EscherRecord; +import org.apache.poi.ddf.EscherRecordFactory; +import org.apache.poi.ddf.EscherSimpleProperty; +import org.apache.poi.ddf.EscherSpRecord; import org.apache.poi.hwpf.model.EscherRecordHolder; import org.apache.poi.hwpf.model.FSPA; import org.apache.poi.hwpf.model.FSPATable; @@ -13,12 +39,82 @@ public class OfficeDrawingsImpl implements OfficeDrawings { private final EscherRecordHolder _escherRecordHolder; private final FSPATable _fspaTable; + private final byte[] _mainStream; public OfficeDrawingsImpl( FSPATable fspaTable, - EscherRecordHolder escherRecordHolder ) + EscherRecordHolder escherRecordHolder, byte[] mainStream ) { this._fspaTable = fspaTable; this._escherRecordHolder = escherRecordHolder; + this._mainStream = mainStream; + } + + private EscherContainerRecord getEscherShapeRecordContainer( + final int shapeId ) + { + for ( EscherContainerRecord spContainer : _escherRecordHolder + .getSpContainers() ) + { + EscherSpRecord escherSpRecord = spContainer + .getChildById( (short) 0xF00A ); + if ( escherSpRecord != null + && escherSpRecord.getShapeId() == shapeId ) + return spContainer; + } + + return null; + } + + private EscherBlipRecord getBitmapRecord( int bitmapIndex ) + { + List bContainers = _escherRecordHolder + .getBStoreContainers(); + if ( bContainers == null || bContainers.size() != 1 ) + return null; + + EscherContainerRecord bContainer = bContainers.get( 0 ); + final List bitmapRecords = bContainer.getChildRecords(); + + if ( bitmapRecords.size() < bitmapIndex ) + return null; + + EscherRecord imageRecord = bitmapRecords.get( bitmapIndex - 1 ); + + if ( imageRecord instanceof EscherBlipRecord ) + { + return (EscherBlipRecord) imageRecord; + } + + if ( imageRecord instanceof EscherBSERecord ) + { + EscherBSERecord bseRecord = (EscherBSERecord) imageRecord; + + EscherBlipRecord blip = bseRecord.getBlipRecord(); + if ( blip != null ) + { + return blip; + } + + if ( bseRecord.getOffset() > 0 ) + { + /* + * Blip stored in delay stream, which in a word doc, is the main + * stream + */ + EscherRecordFactory recordFactory = new DefaultEscherRecordFactory(); + EscherRecord record = recordFactory.createRecord( _mainStream, + bseRecord.getOffset() ); + + if ( record instanceof EscherBlipRecord ) + { + record.fillFields( _mainStream, bseRecord.getOffset(), + recordFactory ); + return (EscherBlipRecord) record; + } + } + } + + return null; } private OfficeDrawing getOfficeDrawing( final FSPA fspa ) @@ -50,6 +146,30 @@ public class OfficeDrawingsImpl implements OfficeDrawings return fspa.getSpid(); } + public byte[] getPictureData() + { + EscherContainerRecord shapeDescription = getEscherShapeRecordContainer( getShapeId() ); + if ( shapeDescription == null ) + return null; + + EscherOptRecord escherOptRecord = shapeDescription + .getChildById( (short) 0xF00B ); + if ( escherOptRecord == null ) + return null; + + EscherSimpleProperty escherProperty = escherOptRecord + .lookup( EscherProperties.BLIP__BLIPTODISPLAY ); + if ( escherProperty == null ) + return null; + + int bitmapIndex = escherProperty.getPropertyValue(); + EscherBlipRecord escherBlipRecord = getBitmapRecord( bitmapIndex ); + if ( escherBlipRecord == null ) + return null; + + return escherBlipRecord.getPicturedata(); + } + @Override public String toString() { diff --git a/src/scratchpad/src/org/apache/poi/hwpf/usermodel/Picture.java b/src/scratchpad/src/org/apache/poi/hwpf/usermodel/Picture.java index b42ad00637..2781b76519 100644 --- a/src/scratchpad/src/org/apache/poi/hwpf/usermodel/Picture.java +++ b/src/scratchpad/src/org/apache/poi/hwpf/usermodel/Picture.java @@ -30,19 +30,21 @@ import org.apache.poi.util.POILogger; /** * Represents embedded picture extracted from Word Document + * * @author Dmitry Romanov */ public final class Picture extends PictureDescriptor { - private static final POILogger log = POILogFactory.getLogger(Picture.class); + private static final POILogger log = POILogFactory + .getLogger( Picture.class ); -// public static final int FILENAME_OFFSET = 0x7C; -// public static final int FILENAME_SIZE_OFFSET = 0x6C; - static final int PICF_OFFSET = 0x0; - static final int PICT_HEADER_OFFSET = 0x4; - static final int MFPMM_OFFSET = 0x6; - static final int PICF_SHAPE_OFFSET = 0xE; - static final int UNKNOWN_HEADER_SIZE = 0x49; + // public static final int FILENAME_OFFSET = 0x7C; + // public static final int FILENAME_SIZE_OFFSET = 0x6C; + static final int PICF_OFFSET = 0x0; + static final int PICT_HEADER_OFFSET = 0x4; + static final int MFPMM_OFFSET = 0x6; + static final int PICF_SHAPE_OFFSET = 0xE; + static final int UNKNOWN_HEADER_SIZE = 0x49; @Deprecated public static final byte[] GIF = PictureType.GIF.getSignatures()[0]; @@ -66,52 +68,57 @@ public final class Picture extends PictureDescriptor public static final byte[] WMF2 = PictureType.WMF.getSignatures()[1]; // TODO: DIB, PICT - public static final byte[] IHDR = new byte[]{'I', 'H', 'D', 'R'}; + public static final byte[] IHDR = new byte[] { 'I', 'H', 'D', 'R' }; - public static final byte[] COMPRESSED1 = { (byte)0xFE, 0x78, (byte)0xDA }; - public static final byte[] COMPRESSED2 = { (byte)0xFE, 0x78, (byte)0x9C }; + public static final byte[] COMPRESSED1 = { (byte) 0xFE, 0x78, (byte) 0xDA }; + public static final byte[] COMPRESSED2 = { (byte) 0xFE, 0x78, (byte) 0x9C }; - private int dataBlockStartOfsset; - private int pictureBytesStartOffset; - private int dataBlockSize; - private int size; -// private String fileName; - private byte[] rawContent; - private byte[] content; - private byte[] _dataStream; - private int height = -1; - private int width = -1; + private int dataBlockStartOfsset; + private int pictureBytesStartOffset; + private int dataBlockSize; + private int size; + // private String fileName; + private byte[] rawContent; + private byte[] content; + private byte[] _dataStream; + private int height = -1; + private int width = -1; - public Picture(int dataBlockStartOfsset, byte[] _dataStream, boolean fillBytes) - { - super (_dataStream, dataBlockStartOfsset); - - this._dataStream = _dataStream; - this.dataBlockStartOfsset = dataBlockStartOfsset; - this.dataBlockSize = LittleEndian.getInt(_dataStream, dataBlockStartOfsset); - this.pictureBytesStartOffset = getPictureBytesStartOffset(dataBlockStartOfsset, _dataStream, dataBlockSize); - this.size = dataBlockSize - (pictureBytesStartOffset - dataBlockStartOfsset); + public Picture( int dataBlockStartOfsset, byte[] _dataStream, + boolean fillBytes ) + { + super( _dataStream, dataBlockStartOfsset ); + + this._dataStream = _dataStream; + this.dataBlockStartOfsset = dataBlockStartOfsset; + this.dataBlockSize = LittleEndian.getInt( _dataStream, + dataBlockStartOfsset ); + this.pictureBytesStartOffset = getPictureBytesStartOffset( + dataBlockStartOfsset, _dataStream, dataBlockSize ); + this.size = dataBlockSize + - ( pictureBytesStartOffset - dataBlockStartOfsset ); + + if ( size < 0 ) + { - if (size<0) { + } + if ( fillBytes ) + { + fillImageContent(); + } } - if (fillBytes) + public Picture( byte[] _dataStream ) { - fillImageContent(); - } - } - - public Picture(byte[] _dataStream) - { super(); - this._dataStream = _dataStream; - this.dataBlockStartOfsset = 0; - this.dataBlockSize = _dataStream.length; - this.pictureBytesStartOffset = 0; - this.size = _dataStream.length; - } + this._dataStream = _dataStream; + this.dataBlockStartOfsset = 0; + this.dataBlockSize = _dataStream.length; + this.pictureBytesStartOffset = 0; + this.size = _dataStream.length; + } private void fillWidthHeight() { @@ -131,42 +138,50 @@ public final class Picture extends PictureDescriptor } } - /** - * Tries to suggest a filename: hex representation of picture structure offset in "Data" stream plus extension that - * is tried to determine from first byte of picture's content. - * - * @return suggested file name - */ - public String suggestFullFileName() - { - String fileExt = suggestFileExtension(); - return Integer.toHexString(dataBlockStartOfsset) + (fileExt.length()>0 ? "."+fileExt : ""); - } - - /** - * Writes Picture's content bytes to specified OutputStream. - * Is useful when there is need to write picture bytes directly to stream, omitting its representation in - * memory as distinct byte array. - * - * @param out a stream to write to - * @throws IOException if some exception is occured while writing to specified out - */ - public void writeImageContent(OutputStream out) throws IOException - { - if (rawContent!=null && rawContent.length>0) { - out.write(rawContent, 0, size); - } else { - out.write(_dataStream, pictureBytesStartOffset, size); - } - } - - /** - * @return The offset of this picture in the picture bytes, used - * when matching up with {@link CharacterRun#getPicOffset()} - */ - public int getStartOffset() { - return dataBlockStartOfsset; - } + /** + * Tries to suggest a filename: hex representation of picture structure + * offset in "Data" stream plus extension that is tried to determine from + * first byte of picture's content. + * + * @return suggested file name + */ + public String suggestFullFileName() + { + String fileExt = suggestFileExtension(); + return Integer.toHexString( dataBlockStartOfsset ) + + ( fileExt.length() > 0 ? "." + fileExt : "" ); + } + + /** + * Writes Picture's content bytes to specified OutputStream. Is useful when + * there is need to write picture bytes directly to stream, omitting its + * representation in memory as distinct byte array. + * + * @param out + * a stream to write to + * @throws IOException + * if some exception is occured while writing to specified out + */ + public void writeImageContent( OutputStream out ) throws IOException + { + if ( rawContent != null && rawContent.length > 0 ) + { + out.write( rawContent, 0, size ); + } + else + { + out.write( _dataStream, pictureBytesStartOffset, size ); + } + } + + /** + * @return The offset of this picture in the picture bytes, used when + * matching up with {@link CharacterRun#getPicOffset()} + */ + public int getStartOffset() + { + return dataBlockStartOfsset; + } /** * @return picture's content as byte array @@ -189,14 +204,14 @@ public final class Picture extends PictureDescriptor return rawContent; } - /** - * - * @return size in bytes of the picture - */ - public int getSize() - { - return size; - } + /** + * + * @return size in bytes of the picture + */ + public int getSize() + { + return size; + } /** * @return the horizontal aspect ratio for picture provided by user @@ -236,48 +251,56 @@ public final class Picture extends PictureDescriptor } /** - * Gets the initial width of the picture, in twips, prior to cropping or scaling. - * + * Gets the initial width of the picture, in twips, prior to cropping or + * scaling. + * * @return the initial width of the picture in twips */ - public int getDxaGoal() { + public int getDxaGoal() + { return dxaGoal; } /** - * Gets the initial height of the picture, in twips, prior to cropping or scaling. - * + * Gets the initial height of the picture, in twips, prior to cropping or + * scaling. + * * @return the initial width of the picture in twips */ - public int getDyaGoal() { + public int getDyaGoal() + { return dyaGoal; } /** * @return The amount the picture has been cropped on the left in twips */ - public int getDxaCropLeft() { + public int getDxaCropLeft() + { return dxaCropLeft; } /** * @return The amount the picture has been cropped on the top in twips */ - public int getDyaCropTop() { + public int getDyaCropTop() + { return dyaCropTop; } /** * @return The amount the picture has been cropped on the right in twips */ - public int getDxaCropRight() { + public int getDxaCropRight() + { return dxaCropRight; } /** * @return The amount the picture has been cropped on the bottom in twips */ - public int getDyaCropBottom() { + public int getDyaCropBottom() + { return dyaCropBottom; } @@ -304,225 +327,255 @@ public final class Picture extends PictureDescriptor public PictureType suggestPictureType() { - final byte[] imageContent = getContent(); - for ( PictureType pictureType : PictureType.values() ) - for ( byte[] signature : pictureType.getSignatures() ) - if ( matchSignature( imageContent, signature, 0 ) ) - return pictureType; - - // TODO: DIB, PICT - return PictureType.UNKNOWN; - } - - private static boolean matchSignature(byte[] dataStream, byte[] signature, int pictureBytesOffset) - { - boolean matched = pictureBytesOffset < dataStream.length; - for (int i = 0; (i+pictureBytesOffset) < dataStream.length && i < signature.length; i++) - { - if (dataStream[i+pictureBytesOffset] != signature[i]) - { - matched = false; - break; - } - } - return matched; - } - -// public String getFileName() -// { -// return fileName; -// } - -// private static String extractFileName(int blockStartIndex, byte[] dataStream) { -// int fileNameStartOffset = blockStartIndex + 0x7C; -// int fileNameSizeOffset = blockStartIndex + FILENAME_SIZE_OFFSET; -// int fileNameSize = LittleEndian.getShort(dataStream, fileNameSizeOffset); -// -// int fileNameIndex = fileNameStartOffset; -// char[] fileNameChars = new char[(fileNameSize-1)/2]; -// int charIndex = 0; -// while(charIndex 0 ) return; - this.rawContent = new byte[size]; - System.arraycopy(_dataStream, pictureBytesStartOffset, rawContent, 0, size); - } + this.rawContent = new byte[size]; + System.arraycopy( _dataStream, pictureBytesStartOffset, rawContent, 0, + size ); + } - private void fillImageContent() - { + private void fillImageContent() + { if ( content != null && content.length > 0 ) return; - byte[] rawContent = getRawContent(); - - // HACK: Detect compressed images. In reality there should be some way to determine - // this from the first 32 bytes, but I can't see any similarity between all the - // samples I have obtained, nor any similarity in the data block contents. - if (matchSignature(rawContent, COMPRESSED1, 32) || matchSignature(rawContent, COMPRESSED2, 32)) - { - try - { - InflaterInputStream in = new InflaterInputStream( - new ByteArrayInputStream(rawContent, 33, rawContent.length - 33)); - ByteArrayOutputStream out = new ByteArrayOutputStream(); - byte[] buf = new byte[4096]; - int readBytes; - while ((readBytes = in.read(buf)) > 0) + byte[] rawContent = getRawContent(); + + // HACK: Detect compressed images. In reality there should be some way + // to determine + // this from the first 32 bytes, but I can't see any similarity between + // all the + // samples I have obtained, nor any similarity in the data block + // contents. + if ( matchSignature( rawContent, COMPRESSED1, 32 ) + || matchSignature( rawContent, COMPRESSED2, 32 ) ) + { + try + { + InflaterInputStream in = new InflaterInputStream( + new ByteArrayInputStream( rawContent, 33, + rawContent.length - 33 ) ); + ByteArrayOutputStream out = new ByteArrayOutputStream(); + byte[] buf = new byte[4096]; + int readBytes; + while ( ( readBytes = in.read( buf ) ) > 0 ) + { + out.write( buf, 0, readBytes ); + } + content = out.toByteArray(); + } + catch ( IOException e ) + { + // Problems reading from the actual ByteArrayInputStream should + // never happen + // so this will only ever be a ZipException. + log.log( POILogger.INFO, + "Possibly corrupt compression or non-compressed data", + e ); + } + } + else + { + // Raw data is not compressed. + content = rawContent; + } + } + + private static boolean matchSignature( byte[] pictureData, + byte[] signature, int offset ) + { + boolean matched = offset < pictureData.length; + for ( int i = 0; ( i + offset ) < pictureData.length + && i < signature.length; i++ ) + { + if ( pictureData[i + offset] != signature[i] ) + { + matched = false; + break; + } + } + return matched; + } + + private static int getPictureBytesStartOffset( int dataBlockStartOffset, + byte[] _dataStream, int dataBlockSize ) + { + int realPicoffset = dataBlockStartOffset; + final int dataBlockEndOffset = dataBlockSize + dataBlockStartOffset; + + // Skip over the PICT block + int PICTFBlockSize = LittleEndian.getShort( _dataStream, + dataBlockStartOffset + PICT_HEADER_OFFSET ); // Should be 68 + // bytes + + // Now the PICTF1 + int PICTF1BlockOffset = PICTFBlockSize + PICT_HEADER_OFFSET; + short MM_TYPE = LittleEndian.getShort( _dataStream, + dataBlockStartOffset + PICT_HEADER_OFFSET + 2 ); + if ( MM_TYPE == 0x66 ) + { + // Skip the stPicName + int cchPicName = LittleEndian.getUnsignedByte( _dataStream, + PICTF1BlockOffset ); + PICTF1BlockOffset += 1 + cchPicName; + } + int PICTF1BlockSize = LittleEndian.getShort( _dataStream, + dataBlockStartOffset + PICTF1BlockOffset ); + + int unknownHeaderOffset = ( PICTF1BlockSize + PICTF1BlockOffset ) < dataBlockEndOffset ? ( PICTF1BlockSize + PICTF1BlockOffset ) + : PICTF1BlockOffset; + realPicoffset += ( unknownHeaderOffset + UNKNOWN_HEADER_SIZE ); + if ( realPicoffset >= dataBlockEndOffset ) { - out.write(buf, 0, readBytes); + realPicoffset -= UNKNOWN_HEADER_SIZE; } - content = out.toByteArray(); - } - catch (IOException e) - { - // Problems reading from the actual ByteArrayInputStream should never happen - // so this will only ever be a ZipException. - log.log(POILogger.INFO, "Possibly corrupt compression or non-compressed data", e); - } - } else { - // Raw data is not compressed. - content = rawContent; - } - } - - private static int getPictureBytesStartOffset(int dataBlockStartOffset, byte[] _dataStream, int dataBlockSize) - { - int realPicoffset = dataBlockStartOffset; - final int dataBlockEndOffset = dataBlockSize + dataBlockStartOffset; - - // Skip over the PICT block - int PICTFBlockSize = LittleEndian.getShort(_dataStream, dataBlockStartOffset +PICT_HEADER_OFFSET); // Should be 68 bytes - - // Now the PICTF1 - int PICTF1BlockOffset = PICTFBlockSize + PICT_HEADER_OFFSET; - short MM_TYPE = LittleEndian.getShort(_dataStream, dataBlockStartOffset + PICT_HEADER_OFFSET + 2); - if(MM_TYPE == 0x66) { - // Skip the stPicName - int cchPicName = LittleEndian.getUnsignedByte(_dataStream, PICTF1BlockOffset); - PICTF1BlockOffset += 1 + cchPicName; - } - int PICTF1BlockSize = LittleEndian.getShort(_dataStream, dataBlockStartOffset +PICTF1BlockOffset); - - int unknownHeaderOffset = (PICTF1BlockSize + PICTF1BlockOffset) < dataBlockEndOffset ? (PICTF1BlockSize + PICTF1BlockOffset) : PICTF1BlockOffset; - realPicoffset += (unknownHeaderOffset + UNKNOWN_HEADER_SIZE); - if (realPicoffset>=dataBlockEndOffset) { - realPicoffset -= UNKNOWN_HEADER_SIZE; - } - return realPicoffset; - } - - private void fillJPGWidthHeight() { - /* - http://www.codecomments.com/archive281-2004-3-158083.html - - Algorhitm proposed by Patrick TJ McPhee: - - read 2 bytes - make sure they are 'ffd8'x - repeatedly: - read 2 bytes - make sure the first one is 'ff'x - if the second one is 'd9'x stop - else if the second one is c0 or c2 (or possibly other values ...) - skip 2 bytes - read one byte into depth - read two bytes into height - read two bytes into width - else - read two bytes into length - skip forward length-2 bytes - - Also used Ruby code snippet from: http://www.bigbold.com/snippets/posts/show/805 for reference - */ - int pointer = pictureBytesStartOffset+2; - int firstByte = _dataStream[pointer]; - int secondByte = _dataStream[pointer+1]; - - int endOfPicture = pictureBytesStartOffset + size; - while(pointer" ); } - public void testPicture() throws Exception - { - String result = getHtmlText( "picture.doc", true ); - - // picture - assertContains( result, "src=\"picture.bin\"" ); - // visible size - assertContains( result, "width:3.1305554in;height:1.7250001in;" ); - // shift due to crop - assertContains( result, "left:-0.09375;top:-0.25694445;" ); - // size without crop - assertContains( result, "width:3.4125in;height:2.325in;" ); - } - public void testHyperlink() throws Exception { String result = getHtmlText( "hyperlink.doc" ); @@ -201,14 +191,6 @@ public class TestWordToHtmlConverter extends TestCase getHtmlText( "innertable.doc" ); } - public void testTableMerges() throws Exception - { - String result = getHtmlText( "table-merges.doc" ); - - assertContains( result, "" ); - assertContains( result, "" ); - } - public void testO_kurs_doc() throws Exception { getHtmlText( "o_kurs.doc" ); @@ -222,4 +204,33 @@ public class TestWordToHtmlConverter extends TestCase assertContains( result, "" ); assertContains( result, "1" ); } + + public void testPicture() throws Exception + { + String result = getHtmlText( "picture.doc", true ); + + // picture + assertContains( result, "src=\"0.emf\"" ); + // visible size + assertContains( result, "width:3.1305554in;height:1.7250001in;" ); + // shift due to crop + assertContains( result, "left:-0.09375;top:-0.25694445;" ); + // size without crop + assertContains( result, "width:3.4125in;height:2.325in;" ); + } + + public void testPicturesEscher() throws Exception + { + String result = getHtmlText( "pictures_escher.doc", true ); + assertContains( result, "" ); + assertContains( result, "" ); + } + + public void testTableMerges() throws Exception + { + String result = getHtmlText( "table-merges.doc" ); + + assertContains( result, "" ); + assertContains( result, "" ); + } } diff --git a/test-data/document/pictures_escher.doc b/test-data/document/pictures_escher.doc new file mode 100644 index 0000000000..4870bc7ab2 Binary files /dev/null and b/test-data/document/pictures_escher.doc differ