--- /dev/null
+/* ====================================================================
+ Copyright 2002-2006 Apache Software Foundation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+==================================================================== */
+
+
+package org.apache.poi.hwpf.model;
+
+import org.apache.poi.util.LittleEndian;
+import org.apache.poi.hwpf.usermodel.CharacterRun;
+import org.apache.poi.hwpf.usermodel.Picture;
+
+import java.util.List;
+import java.util.ArrayList;
+
+
+/**
+ * Holds information about all pictures embedded in Word Document either via "Insert -> Picture -> From File" or via
+ * clipboard. Responsible for image extraction and for determining whether some piece of the document contains an embedded image.
+ * Analyzes the raw data bytestream "Data" (where Word stores all embedded objects) provided by HWPFDocument.
+ *
+ * Word stores images as is within so called "Data stream" - the stream within a Word docfile containing various data
+ * that hang off of characters in the main stream. For example, binary data describing in-line pictures and/or
+ * formfields and also embedded objects' native data. Word picture structures are concatenated one after the other in
+ * the data stream if the document contains pictures.
+ * Data stream is easily reachable via HWPFDocument._dataStream property.
+ * A picture is represented in the document text stream as a special character, an Unicode \u0001 whose
+ * CharacterRun.isSpecial() returns true. The file location of the picture in the Word binary file is accessed
+ * via CharacterRun.getPicOffset(). The CharacterRun.getPicOffset() is a byte offset into the data stream.
+ * A header data structure is stored beginning at the position recorded in picOffset.
+ *
+ * @author Dmitry Romanov
+ */
+public class PicturesTable
+{
+  // Block type codes, compared against the 16-bit value read at BLOCK_TYPE_OFFSET of a block.
+  static final int TYPE_IMAGE = 0x08;
+  static final int TYPE_IMAGE_WORD2000 = 0x00;
+  static final int TYPE_IMAGE_PASTED_FROM_CLIPBOARD = 0xA;
+  static final int TYPE_IMAGE_PASTED_FROM_CLIPBOARD_WORD2000 = 0x2;
+  static final int TYPE_HORIZONTAL_LINE = 0xE;
+  // Byte offsets, relative to the start of a block, of the two 16-bit fields inspected here.
+  static final int BLOCK_TYPE_OFFSET = 0xE;
+  static final int MM_MODE_TYPE_OFFSET = 0x6;
+
+  /**
+   * Mapping mode value required for the Word 2000 block types and for horizontal lines.
+   * NOTE(review): 0x64 looks like MM_SHAPE from the [MS-DOC] PICF description - confirm against the spec.
+   */
+  private static final int MM_MODE_SHAPE = 0x64;
+
+  /**
+   * Number of bytes that must be available at a block offset before its header fields can be read:
+   * the block type (2 bytes at BLOCK_TYPE_OFFSET) is the furthest field this class touches.
+   */
+  private static final int MIN_HEADER_SIZE = BLOCK_TYPE_OFFSET + 2;
+
+  private byte[] _dataStream;
+
+  /** @link dependency
+   * @stereotype instantiate*/
+  /*# Picture lnkPicture; */
+
+  /**
+   * Creates a table working on the given raw "Data" stream bytes.
+   *
+   * @param _dataStream contents of the document's data stream; the array is used as is, not copied
+   */
+  public PicturesTable(byte[] _dataStream)
+  {
+    this._dataStream = _dataStream;
+  }
+
+  /**
+   * Determines whether the specified CharacterRun contains a reference to a picture.
+   *
+   * @param run the run to inspect
+   * @return true if the run is a special \u0001 character (and not an object, OLE2 or data run) whose
+   *         picture offset points at a readable block of a recognized image type, false otherwise
+   */
+  public boolean hasPicture(CharacterRun run) {
+    if (run.isSpecialCharacter() && !run.isObj() && !run.isOle2() && !run.isData() && "\u0001".equals(run.text())) {
+      return isBlockContainsImage(run.getPicOffset());
+    }
+    return false;
+  }
+
+  /**
+   * Determines whether the specified CharacterRun contains a reference to a horizontal line.
+   *
+   * @param run the run to inspect
+   * @return true if the run is a special \u0001 character whose picture offset points at a readable
+   *         block of type TYPE_HORIZONTAL_LINE, false otherwise
+   */
+  public boolean hasHorizontalLine(CharacterRun run) {
+    if (run.isSpecialCharacter() && "\u0001".equals(run.text())) {
+      return isBlockContainsHorizontalLine(run.getPicOffset());
+    }
+    return false;
+  }
+
+  /**
+   * Decides whether a block type / mapping mode pair denotes an image this class can extract.
+   * The Word 2000 type codes are only accepted together with the MM_MODE_SHAPE mapping mode.
+   */
+  private boolean isPictureRecognized(short blockType, short mappingModeOfMETAFILEPICT) {
+    if (blockType == TYPE_IMAGE || blockType == TYPE_IMAGE_PASTED_FROM_CLIPBOARD) {
+      return true;
+    }
+    boolean word2000Type = blockType == TYPE_IMAGE_WORD2000 || blockType == TYPE_IMAGE_PASTED_FROM_CLIPBOARD_WORD2000;
+    return word2000Type && mappingModeOfMETAFILEPICT == MM_MODE_SHAPE;
+  }
+
+  /** Reads the 16-bit block type of the block starting at pictOffset. */
+  private static short getBlockType(byte[] dataStream, int pictOffset) {
+    return LittleEndian.getShort(dataStream, pictOffset + BLOCK_TYPE_OFFSET);
+  }
+
+  /** Reads the 16-bit mapping mode of the block starting at pictOffset. */
+  private static short getMmMode(byte[] dataStream, int pictOffset) {
+    return LittleEndian.getShort(dataStream, pictOffset + MM_MODE_TYPE_OFFSET);
+  }
+
+  /**
+   * Tells whether the header fields of a block starting at the given offset lie completely inside
+   * the data stream. Written as a subtraction so that a huge offset cannot overflow the check.
+   */
+  private boolean isHeaderInBounds(int offset) {
+    return _dataStream != null && offset >= 0 && offset <= _dataStream.length - MIN_HEADER_SIZE;
+  }
+
+  /**
+   * Returns picture object tied to specified CharacterRun
+   * @param run
+   * @param fillBytes if true, Picture will be returned with filled byte array that represent picture's contents. If you don't want
+   * to have that byte array in memory but only write picture's contents to stream, pass false and then use Picture.writeImageContent
+   * @see Picture#writeImageContent(java.io.OutputStream)
+   * @return a Picture object if picture exists for specified CharacterRun, null otherwise. PicturesTable.hasPicture is used to determine this.
+   * @see #hasPicture(org.apache.poi.hwpf.usermodel.CharacterRun)
+   */
+  public Picture extractPicture(CharacterRun run, boolean fillBytes) {
+    if (hasPicture(run)) {
+      return new Picture(run.getPicOffset(), _dataStream, fillBytes);
+    }
+    return null;
+  }
+
+  /**
+   * Walks the data stream block by block, using the 32-bit size stored at the start of each block
+   * to reach the next one, and collects every block recognized as an image.
+   * The walk stops at the first block whose header does not fit into the stream or whose size
+   * field is not positive, so a truncated or corrupt stream yields the pictures found so far
+   * instead of an exception.
+   *
+   * @return a list of Picture objects found in current document (created without filled content)
+   */
+  public List getAllPictures() {
+    List pictures = new ArrayList();
+
+    int pos = 0;
+    while (isHeaderInBounds(pos)) {
+      if (isBlockContainsImage(pos)) {
+        pictures.add(new Picture(pos, _dataStream, false));
+      }
+
+      int skipOn = LittleEndian.getInt(_dataStream, pos);
+      if (skipOn <= 0) {
+        break;
+      }
+      // an overflowing sum turns negative and is rejected by isHeaderInBounds
+      pos += skipOn;
+    }
+
+    return pictures;
+  }
+
+  /** Returns true if a readable block of a recognized image type starts at offset i. */
+  private boolean isBlockContainsImage(int i)
+  {
+    return isHeaderInBounds(i)
+        && isPictureRecognized(getBlockType(_dataStream, i), getMmMode(_dataStream, i));
+  }
+
+  /** Returns true if a readable horizontal-line block starts at offset i. */
+  private boolean isBlockContainsHorizontalLine(int i)
+  {
+    return isHeaderInBounds(i)
+        && getBlockType(_dataStream, i) == TYPE_HORIZONTAL_LINE
+        && getMmMode(_dataStream, i) == MM_MODE_SHAPE;
+  }
+
+}
--- /dev/null
+/* ====================================================================
+ Copyright 2002-2006 Apache Software Foundation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+==================================================================== */
+
+
+package org.apache.poi.hwpf.usermodel;
+
+import org.apache.poi.util.LittleEndian;
+
+import java.io.OutputStream;
+import java.io.IOException;
+
+/**
+ * Represents embedded picture extracted from Word Document
+ * @author Dmitry Romanov
+ */
+public class Picture
+{
+// public static final int FILENAME_OFFSET = 0x7C;
+// public static final int FILENAME_SIZE_OFFSET = 0x6C;
+ static final int BLOCK_TYPE_OFFSET = 0xE;
+ static final int PICT_HEADER_OFFSET = 0x4;
+ static final int UNKNOWN_HEADER_SIZE = 0x49;
+
+ public static final byte[] GIF = new byte[]{'G', 'I', 'F'};
+ public static final byte[] PNG = new byte[]{ (byte)0x89, 0x50, 0x4E, 0x47,0x0D,0x0A,0x1A,0x0A};
+ public static final byte[] JPG = new byte[]{(byte)0xFF, (byte)0xD8};
+ public static final byte[] BMP = new byte[]{'B', 'M'};
+ public static final byte[] TIFF = new byte[]{0x49, 0x49, 0x2A, 0x00};
+ public static final byte[] TIFF1 = new byte[]{0x4D, 0x4D, 0x00, 0x2A};
+
+ public static final byte[] IHDR = new byte[]{'I', 'H', 'D', 'R'};
+
+ private int dataBlockStartOfsset;
+ private int pictureBytesStartOffset;
+ private int dataBlockSize;
+ private int size;
+// private String fileName;
+ private byte[] content;
+ private byte[] _dataStream;
+ private int aspectRatioX;
+ private int aspectRatioY;
+ private int height = -1;
+ private int width = -1;
+
+
+ public Picture(int dataBlockStartOfsset, byte[] _dataStream, boolean fillBytes)
+ {
+ this._dataStream = _dataStream;
+ this.dataBlockStartOfsset = dataBlockStartOfsset;
+ this.dataBlockSize = LittleEndian.getInt(_dataStream, dataBlockStartOfsset);
+ this.pictureBytesStartOffset = getPictureBytesStartOffset(dataBlockStartOfsset, _dataStream, dataBlockSize);
+ this.size = dataBlockSize - (pictureBytesStartOffset - dataBlockStartOfsset);
+
+ if (size<0) {
+
+ }
+
+ this.aspectRatioX = extractAspectRatioX(_dataStream, dataBlockStartOfsset);
+ this.aspectRatioY = extractAspectRatioY(_dataStream, dataBlockStartOfsset);
+// this.fileName = extractFileName(dataBlockStartOfsset, _dataStream);
+// if (fileName==null || fileName.length()==0) {
+// fileName = "clipboard";
+// }
+
+ if (fillBytes)
+ {
+ fillImageContent(_dataStream);
+ }
+
+ String ext = suggestFileExtension();
+ // trying to extract width and height from pictures content:
+ if ("jpg".equalsIgnoreCase(ext)) {
+ fillJPGWidthHeight();
+ } else if ("png".equalsIgnoreCase(ext)) {
+ fillPNGWidthHeight();
+ }
+ }
+
+ private static int extractAspectRatioX(byte[] _dataStream, int dataBlockStartOffset)
+ {
+ return LittleEndian.getShort(_dataStream, dataBlockStartOffset+0x20)/10;
+ }
+
+ private static int extractAspectRatioY(byte[] _dataStream, int dataBlockStartOffset)
+ {
+ return LittleEndian.getShort(_dataStream, dataBlockStartOffset+0x22)/10;
+ }
+
+ /**
+ * Tries to suggest a filename: hex representation of picture structure offset in "Data" stream plus extension that
+ * is tried to determine from first byte of picture's content.
+ *
+ * @return suggested file name
+ */
+ public String suggestFullFileName()
+ {
+ String fileExt = suggestFileExtension();
+ return Integer.toHexString(dataBlockStartOfsset) + (fileExt.length()>0 ? "."+fileExt : "");
+ }
+
+ /**
+ * Writes Picture's content bytes to specified OutputStream.
+ * Is useful when there is need to write picture bytes directly to stream, omitting its representation in
+ * memory as distinct byte array.
+ *
+ * @param out a stream to write to
+ * @throws IOException if some exception is occured while writing to specified out
+ */
+ public void writeImageContent(OutputStream out) throws IOException
+ {
+ if (content!=null && content.length>0) {
+ out.write(content, 0, size);
+ } else {
+ out.write(_dataStream, pictureBytesStartOffset, size);
+ }
+ }
+
+ /**
+ * @return picture's content as byte array
+ */
+ public byte[] getContent()
+ {
+ if (content == null || content.length<=0)
+ {
+ fillImageContent(this._dataStream);
+ }
+ return content;
+ }
+
+ /**
+ *
+ * @return size in bytes of the picture
+ */
+ public int getSize()
+ {
+ return size;
+ }
+
+ /**
+ * returns horizontal aspect ratio for picture provided by user
+ */
+ public int getAspectRatioX()
+ {
+ return aspectRatioX;
+ }
+ /**
+ * returns vertical aspect ratio for picture provided by user
+ */
+ public int getAspectRatioY()
+ {
+ return aspectRatioY;
+ }
+
+ /**
+ * tries to suggest extension for picture's file by matching signatures of popular image formats to first bytes
+ * of picture's contents
+ * @return suggested file extension
+ */
+ public String suggestFileExtension()
+ {
+ if (content!=null && content.length>0) {
+ return suggestFileExtension(content, 0);
+ }
+ return suggestFileExtension(_dataStream, pictureBytesStartOffset);
+ }
+
+
+ private String suggestFileExtension(byte[] _dataStream, int pictureBytesStartOffset)
+ {
+ if (matchSignature(_dataStream, JPG, pictureBytesStartOffset)) {
+ return "jpg";
+ } else if (matchSignature(_dataStream, PNG, pictureBytesStartOffset)) {
+ return "png";
+ } else if (matchSignature(_dataStream, GIF, pictureBytesStartOffset)) {
+ return "gif";
+ } else if (matchSignature(_dataStream, BMP, pictureBytesStartOffset)) {
+ return "bmp";
+ } else if (matchSignature(_dataStream, TIFF, pictureBytesStartOffset)) {
+ return "tiff";
+ } else if (matchSignature(_dataStream, TIFF1, pictureBytesStartOffset)) {
+ return "tiff";
+ }
+ return "";
+ }
+
+ private static boolean matchSignature(byte[] dataStream, byte[] signature, int pictureBytesOffset)
+ {
+ boolean matched = pictureBytesOffset < dataStream.length;
+ for (int i = 0; (i+pictureBytesOffset) < dataStream.length && i < signature.length; i++)
+ {
+ if (dataStream[i+pictureBytesOffset] != signature[i])
+ {
+ matched = false;
+ break;
+ }
+ }
+ return matched;
+ }
+
+// public String getFileName()
+// {
+// return fileName;
+// }
+
+// private static String extractFileName(int blockStartIndex, byte[] dataStream) {
+// int fileNameStartOffset = blockStartIndex + 0x7C;
+// int fileNameSizeOffset = blockStartIndex + FILENAME_SIZE_OFFSET;
+// int fileNameSize = LittleEndian.getShort(dataStream, fileNameSizeOffset);
+//
+// int fileNameIndex = fileNameStartOffset;
+// char[] fileNameChars = new char[(fileNameSize-1)/2];
+// int charIndex = 0;
+// while(charIndex<fileNameChars.length) {
+// short aChar = LittleEndian.getShort(dataStream, fileNameIndex);
+// fileNameChars[charIndex] = (char)aChar;
+// charIndex++;
+// fileNameIndex += 2;
+// }
+// String fileName = new String(fileNameChars);
+// return fileName.trim();
+// }
+
+ private void fillImageContent(byte[] dataStream)
+ {
+ this.content = new byte[size];
+ System.arraycopy(dataStream, pictureBytesStartOffset, content, 0, size);
+ }
+
+ private static int getPictureBytesStartOffset(int dataBlockStartOffset, byte[] _dataStream, int dataBlockSize)
+ {
+ final int dataBlockEndOffset = dataBlockSize + dataBlockStartOffset;
+ int realPicoffset = dataBlockStartOffset;
+
+ int PICTFBlockSize = LittleEndian.getShort(_dataStream, dataBlockStartOffset +PICT_HEADER_OFFSET);
+ int PICTF1BlockOffset = PICTFBlockSize + PICT_HEADER_OFFSET;
+ int PICTF1BlockSize = LittleEndian.getShort(_dataStream, dataBlockStartOffset +PICTF1BlockOffset);
+
+ int unknownHeaderOffset = (PICTF1BlockSize + PICTF1BlockOffset) < dataBlockEndOffset ? (PICTF1BlockSize + PICTF1BlockOffset) : PICTF1BlockOffset;
+ realPicoffset += (unknownHeaderOffset + UNKNOWN_HEADER_SIZE);
+ if (realPicoffset>=dataBlockEndOffset) {
+ realPicoffset -= UNKNOWN_HEADER_SIZE;
+ }
+ return realPicoffset;
+ }
+
+ private void fillJPGWidthHeight() {
+ /*
+ http://www.codecomments.com/archive281-2004-3-158083.html
+
+ Algorhitm proposed by Patrick TJ McPhee:
+
+ read 2 bytes
+ make sure they are 'ffd8'x
+ repeatedly:
+ read 2 bytes
+ make sure the first one is 'ff'x
+ if the second one is 'd9'x stop
+ else if the second one is c0 or c2 (or possibly other values ...)
+ skip 2 bytes
+ read one byte into depth
+ read two bytes into height
+ read two bytes into width
+ else
+ read two bytes into length
+ skip forward length-2 bytes
+
+ Also used Ruby code snippet from: http://www.bigbold.com/snippets/posts/show/805 for reference
+ */
+ int pointer = pictureBytesStartOffset+2;
+ int firstByte = _dataStream[pointer];
+ int secondByte = _dataStream[pointer+1];
+
+ int endOfPicture = pictureBytesStartOffset + size;
+ while(pointer<endOfPicture-1) {
+ do {
+ firstByte = _dataStream[pointer];
+ secondByte = _dataStream[pointer+1];
+ } while (!(firstByte==(byte)0xFF) && pointer<endOfPicture-1);
+
+ if (firstByte==((byte)0xFF) && pointer<endOfPicture-1) {
+ if (secondByte==(byte)0xD9 || secondByte==(byte)0xDA) {
+ break;
+ } else if ( (secondByte & 0xF0) == 0xC0 && secondByte!=(byte)0xC4 && secondByte!=(byte)0xC8 && secondByte!=(byte)0xCC) {
+ pointer += 5;
+ this.height = getBigEndianShort(_dataStream, pointer);
+ this.width = getBigEndianShort(_dataStream, pointer+2);
+ break;
+ } else {
+ pointer++;
+ pointer++;
+ int length = getBigEndianShort(_dataStream, pointer);
+ pointer+=length;
+ }
+ } else {
+ pointer++;
+ }
+ }
+ }
+
+ private void fillPNGWidthHeight()
+ {
+ /*
+ Used PNG file format description from http://www.wotsit.org/download.asp?f=png
+ */
+ int HEADER_START = pictureBytesStartOffset + PNG.length + 4;
+ if (matchSignature(_dataStream, IHDR, HEADER_START)) {
+ int IHDR_CHUNK_WIDTH = HEADER_START + 4;
+ this.width = getBigEndianInt(_dataStream, IHDR_CHUNK_WIDTH);
+ this.height = getBigEndianInt(_dataStream, IHDR_CHUNK_WIDTH + 4);
+ }
+ }
+ /**
+ * returns pixel width of the picture or -1 if dimensions determining was failed
+ */
+ public int getWidth()
+ {
+ return width;
+ }
+ /**
+ * returns pixel height of the picture or -1 if dimensions determining was failed
+ */
+ public int getHeight()
+ {
+ return height;
+ }
+
+ private static int getBigEndianInt(byte[] data, int offset)
+ {
+ return (((data[offset] & 0xFF)<< 24) + ((data[offset +1] & 0xFF) << 16) + ((data[offset + 2] & 0xFF) << 8) + (data[offset +3] & 0xFF));
+ }
+
+ private static int getBigEndianShort(byte[] data, int offset)
+ {
+ return (((data[offset] & 0xFF)<< 8) + (data[offset +1] & 0xFF));
+ }
+
+}
--- /dev/null
+
+/* ====================================================================
+ Copyright 2002-2004 Apache Software Foundation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+==================================================================== */
+
+package org.apache.poi.hwpf;
+
+import java.io.ByteArrayOutputStream;
+import java.io.FileInputStream;
+import java.util.List;
+
+import org.apache.poi.hwpf.model.PicturesTable;
+import org.apache.poi.hwpf.usermodel.Picture;
+
+import junit.framework.TestCase;
+
+/**
+ * Test picture support in HWPF
+ * @author nick
+ */
+public class TestHWPFPictures extends TestCase {
+  private HWPFDocument docA;
+  private HWPFDocument docB;
+  private String docAFile;
+  private String docBFile;
+
+  private String imgAFile;
+  private String imgBFile;
+
+  /**
+   * Resolves the test documents and reference images below the directory named by the
+   * HWPF.testdata.path system property.
+   */
+  protected void setUp() throws Exception {
+    String dirname = System.getProperty("HWPF.testdata.path");
+    // fail with a clear message instead of silently building paths that start with "null/"
+    assertNotNull("System property HWPF.testdata.path must be set", dirname);
+
+    docAFile = dirname + "/testPictures.doc";
+    docBFile = dirname + "/two_images.doc";
+
+    imgAFile = dirname + "/simple_image.jpg";
+    imgBFile = dirname + "/simple_image.png";
+  }
+
+  /**
+   * Test just opening the files
+   */
+  public void testOpen() throws Exception {
+    docA = openDocument(docAFile);
+    docB = openDocument(docBFile);
+  }
+
+  /**
+   * Test that we have the right numbers of images in each file
+   */
+  public void testImageCount() throws Exception {
+    docA = openDocument(docAFile);
+    docB = openDocument(docBFile);
+
+    assertNotNull(docA.getPicturesTable());
+    assertNotNull(docB.getPicturesTable());
+
+    PicturesTable picA = docA.getPicturesTable();
+    PicturesTable picB = docB.getPicturesTable();
+
+    List picturesA = picA.getAllPictures();
+    List picturesB = picB.getAllPictures();
+
+    assertEquals(7, picturesA.size());
+    assertEquals(2, picturesB.size());
+  }
+
+  /**
+   * Test that we have the right images in at least one file
+   */
+  public void testImageData() throws Exception {
+    docB = openDocument(docBFile);
+    PicturesTable picB = docB.getPicturesTable();
+    List picturesB = picB.getAllPictures();
+
+    assertEquals(2, picturesB.size());
+
+    Picture pic1 = (Picture)picturesB.get(0);
+    Picture pic2 = (Picture)picturesB.get(1);
+
+    assertNotNull(pic1);
+    assertNotNull(pic2);
+
+    // Check the same
+    byte[] pic1B = readFile(imgAFile);
+    byte[] pic2B = readFile(imgBFile);
+
+    assertEquals(pic1B.length, pic1.getContent().length);
+    assertEquals(pic2B.length, pic2.getContent().length);
+
+    assertBytesSame(pic1B, pic1.getContent());
+    assertBytesSame(pic2B, pic2.getContent());
+  }
+
+
+  /**
+   * Opens a Word document and closes the underlying file stream again.
+   * NOTE(review): assumes HWPFDocument reads the whole stream inside its constructor, so that
+   * closing the stream afterwards is safe - confirm against HWPFDocument.
+   */
+  private HWPFDocument openDocument(String file) throws Exception {
+    FileInputStream fis = new FileInputStream(file);
+    try {
+      return new HWPFDocument(fis);
+    } finally {
+      fis.close();
+    }
+  }
+
+  /** Asserts that both arrays have the same length and the same byte at every index. */
+  private void assertBytesSame(byte[] a, byte[] b) {
+    assertEquals(a.length, b.length);
+    for(int i=0; i<a.length; i++) {
+      assertEquals("byte at index " + i, a[i], b[i]);
+    }
+  }
+
+  /** Reads the complete file into a byte array; the file stream is closed even if reading fails. */
+  private byte[] readFile(String file) throws Exception {
+    ByteArrayOutputStream baos = new ByteArrayOutputStream();
+    FileInputStream fis = new FileInputStream(file);
+    try {
+      byte[] buffer = new byte[1024];
+      int read;
+      while((read = fis.read(buffer)) > -1) {
+        baos.write(buffer,0,read);
+      }
+    } finally {
+      fis.close();
+    }
+
+    return baos.toByteArray();
+  }
+}