You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

PPTXMLDump.java 10KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277
  1. /* ====================================================================
  2. Licensed to the Apache Software Foundation (ASF) under one or more
  3. contributor license agreements. See the NOTICE file distributed with
  4. this work for additional information regarding copyright ownership.
  5. The ASF licenses this file to You under the Apache License, Version 2.0
  6. (the "License"); you may not use this file except in compliance with
  7. the License. You may obtain a copy of the License at
  8. http://www.apache.org/licenses/LICENSE-2.0
  9. Unless required by applicable law or agreed to in writing, software
  10. distributed under the License is distributed on an "AS IS" BASIS,
  11. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. See the License for the specific language governing permissions and
  13. limitations under the License.
  14. ==================================================================== */
  15. package org.apache.poi.hslf.dev;
  16. import java.io.ByteArrayOutputStream;
  17. import java.io.File;
  18. import java.io.FileOutputStream;
  19. import java.io.IOException;
  20. import java.io.InputStream;
  21. import java.io.OutputStreamWriter;
  22. import java.io.StringWriter;
  23. import java.io.Writer;
  24. import java.nio.charset.StandardCharsets;
  25. import java.util.Arrays;
  26. import org.apache.poi.hslf.record.RecordTypes;
  27. import org.apache.poi.hslf.usermodel.HSLFSlideShow;
  28. import org.apache.poi.poifs.filesystem.DirectoryNode;
  29. import org.apache.poi.poifs.filesystem.POIFSFileSystem;
  30. import org.apache.poi.util.IOUtils;
  31. import org.apache.poi.util.LittleEndian;
  32. import org.apache.poi.util.LittleEndianConsts;
  33. /**
  34. * Utility class which dumps raw contents of a ppt file into XML format
  35. */
  36. public final class PPTXMLDump {
  37. //arbitrarily selected; may need to increase
  38. private static final int MAX_RECORD_LENGTH = 1_000_000;
  39. private static final int HEADER_SIZE = 8; //size of the record header
  40. private static final int PICT_HEADER_SIZE = 25; //size of the picture header
  41. private static final String PICTURES_ENTRY = "Pictures";
  42. private static final String CR = System.getProperty("line.separator");
  43. private Writer out;
  44. private byte[] docstream;
  45. private byte[] pictstream;
  46. private boolean hexHeader = true;
  47. public PPTXMLDump(File ppt) throws IOException {
  48. try (POIFSFileSystem fs = new POIFSFileSystem(ppt, true)) {
  49. docstream = readEntry(fs, HSLFSlideShow.POWERPOINT_DOCUMENT);
  50. pictstream = readEntry(fs, PICTURES_ENTRY);
  51. }
  52. }
  53. private static byte[] readEntry(POIFSFileSystem fs, String entry)
  54. throws IOException {
  55. DirectoryNode dn = fs.getRoot();
  56. if (!dn.hasEntry(entry)) {
  57. return null;
  58. }
  59. try (InputStream is = dn.createDocumentInputStream(entry)) {
  60. ByteArrayOutputStream bos = new ByteArrayOutputStream();
  61. IOUtils.copy(is, bos);
  62. return bos.toByteArray();
  63. }
  64. }
  65. /**
  66. * Dump the structure of the supplied PPT file into XML
  67. * @param outWriter <code>Writer</code> to write out
  68. * @throws java.io.IOException If writing to the writer fails
  69. */
  70. public void dump(Writer outWriter) throws IOException {
  71. this.out = outWriter;
  72. int padding = 0;
  73. write(out, "<Presentation>" + CR, padding);
  74. padding++;
  75. if (pictstream != null){
  76. write(out, "<Pictures>" + CR, padding);
  77. dumpPictures(pictstream, padding);
  78. write(out, "</Pictures>" + CR, padding);
  79. }
  80. //dump the structure of the powerpoint document
  81. write(out, "<PowerPointDocument>" + CR, padding);
  82. padding++;
  83. if(docstream != null) {
  84. dump(docstream, 0, docstream.length, padding);
  85. }
  86. padding--;
  87. write(out, "</PowerPointDocument>" + CR, padding);
  88. padding--;
  89. write(out, "</Presentation>", padding);
  90. }
  91. /**
  92. * Dump a part of the document stream into XML
  93. * @param data PPT binary data
  94. * @param offset offset from the beginning of the document
  95. * @param length of the document
  96. * @param padding used for formatting results
  97. * @throws java.io.IOException If writing out information fails
  98. */
  99. public void dump(byte[] data, int offset, int length, int padding) throws IOException {
  100. int pos = offset;
  101. while (pos <= (offset + length - HEADER_SIZE)){
  102. if (pos < 0) {
  103. break;
  104. }
  105. //read record header
  106. int info = LittleEndian.getUShort(data, pos);
  107. pos += LittleEndianConsts.SHORT_SIZE;
  108. int type = LittleEndian.getUShort(data, pos);
  109. pos += LittleEndianConsts.SHORT_SIZE;
  110. int size = (int)LittleEndian.getUInt(data, pos);
  111. pos += LittleEndianConsts.INT_SIZE;
  112. //get name of the record by type
  113. String recname = RecordTypes.forTypeID(type).name();
  114. write(out, "<"+recname + " info=\""+info+"\" type=\""+type+"\" size=\""+size+"\" offset=\""+(pos-8)+"\"", padding);
  115. if (hexHeader){
  116. out.write(" header=\"");
  117. dump(out, data, pos-8, 8, 0, false);
  118. out.write("\"");
  119. }
  120. out.write(">" + CR);
  121. padding++;
  122. //this check works both for Escher and PowerPoint records
  123. boolean isContainer = (info & 0x000F) == 0x000F;
  124. if (isContainer) {
  125. //continue to dump child records
  126. dump(data, pos, size, padding);
  127. } else {
  128. //dump first 100 bytes of the atom data
  129. dump(out, data, pos, Math.min(size, data.length-pos), padding, true);
  130. }
  131. padding--;
  132. write(out, "</"+recname + ">" + CR, padding);
  133. pos += size;
  134. }
  135. }
  136. /**
  137. * Dumps the Pictures OLE stream into XML.
  138. *
  139. * @param data from the Pictures OLE data stream
  140. * @param padding How many leading blanks to add in the output
  141. * @throws java.io.IOException If writing out information fails
  142. */
  143. public void dumpPictures(byte[] data, int padding) throws IOException {
  144. int pos = 0;
  145. while (pos < data.length) {
  146. if(data.length - pos < PICT_HEADER_SIZE) {
  147. // corrupt file, cannot read header
  148. return;
  149. }
  150. byte[] header = Arrays.copyOfRange(data, pos, pos + PICT_HEADER_SIZE);
  151. int size = LittleEndian.getInt(header, 4) - 17;
  152. if(size < 0) {
  153. // corrupt file, negative image size
  154. return;
  155. }
  156. pos += PICT_HEADER_SIZE + size;
  157. padding++;
  158. write(out, "<picture size=\""+size+"\" type=\""+getPictureType(header)+"\">" + CR, padding);
  159. padding++;
  160. write(out, "<header>" + CR, padding);
  161. dump(out, header, 0, header.length, padding, true);
  162. write(out, "</header>" + CR, padding);
  163. write(out, "<imgdata>" + CR, padding);
  164. dump(out, data, 0, Math.min(size, 100), padding, true);
  165. write(out, "</imgdata>" + CR, padding);
  166. padding--;
  167. write(out, "</picture>" + CR, padding);
  168. padding--;
  169. }
  170. }
  171. public static void main(String[] args) throws Exception {
  172. if (args.length == 0){
  173. System.out.println(
  174. "Usage: PPTXMLDump (options) pptfile\n" +
  175. "Where options include:\n" +
  176. " -f write output to <pptfile>.xml file in the current directory"
  177. );
  178. return;
  179. }
  180. boolean outFile = false;
  181. for (String arg : args) {
  182. if (arg.startsWith("-")) {
  183. if ("-f".equals(arg)) {
  184. //write ouput to a file
  185. outFile = true;
  186. }
  187. } else {
  188. File ppt = new File(arg);
  189. PPTXMLDump dump = new PPTXMLDump(ppt);
  190. System.out.println("Dumping " + arg);
  191. if (outFile) {
  192. FileOutputStream fos = new FileOutputStream(ppt.getName() + ".xml");
  193. OutputStreamWriter out = new OutputStreamWriter(fos, StandardCharsets.UTF_8);
  194. dump.dump(out);
  195. out.close();
  196. } else {
  197. StringWriter out = new StringWriter();
  198. dump.dump(out);
  199. System.out.println(out);
  200. }
  201. }
  202. }
  203. }
  204. /**
  205. * write a string to <code>out</code> with the specified padding
  206. */
  207. private static void write(Writer out, String str, int padding) throws IOException {
  208. for (int i = 0; i < padding; i++) out.write(" ");
  209. out.write(str);
  210. }
  211. private String getPictureType(byte[] header){
  212. String type;
  213. int meta = LittleEndian.getUShort(header, 0);
  214. switch(meta){
  215. case 0x46A0: type = "jpeg"; break;
  216. case 0x2160: type = "wmf"; break;
  217. case 0x6E00: type = "png"; break;
  218. default: type = "unknown"; break;
  219. }
  220. return type;
  221. }
  222. /**
  223. * dump binary data to <code>out</code> with the specified padding
  224. */
  225. private static void dump(Writer out, byte[] data, int offset, int length, int padding, boolean nl) throws IOException {
  226. int linesize = 25;
  227. for (int i = 0; i < padding; i++) out.write(" ");
  228. int i;
  229. for (i = offset; i < (offset + length); i++) {
  230. int c = data[i];
  231. out.write((char) hexval[(c & 0xF0) >> 4]);
  232. out.write((char) hexval[(c & 0x0F) >> 0]);
  233. out.write(' ');
  234. if((i+1-offset) % linesize == 0 && i != (offset + length-1)) {
  235. out.write(CR);
  236. for (int j = 0; j < padding; j++) out.write(" ");
  237. }
  238. }
  239. if(nl && length > 0)out.write(CR);
  240. }
  241. private static final byte[] hexval =
  242. {(byte) '0', (byte) '1', (byte) '2', (byte) '3',
  243. (byte) '4', (byte) '5', (byte) '6', (byte) '7',
  244. (byte) '8', (byte) '9', (byte) 'A', (byte) 'B',
  245. (byte) 'C', (byte) 'D', (byte) 'E', (byte) 'F'};
  246. }