You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

HWPFDocument.java 37KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036
  1. /* ====================================================================
  2. Licensed to the Apache Software Foundation (ASF) under one or more
  3. contributor license agreements. See the NOTICE file distributed with
  4. this work for additional information regarding copyright ownership.
  5. The ASF licenses this file to You under the Apache License, Version 2.0
  6. (the "License"); you may not use this file except in compliance with
  7. the License. You may obtain a copy of the License at
  8. http://www.apache.org/licenses/LICENSE-2.0
  9. Unless required by applicable law or agreed to in writing, software
  10. distributed under the License is distributed on an "AS IS" BASIS,
  11. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. See the License for the specific language governing permissions and
  13. limitations under the License.
  14. ==================================================================== */
  15. package org.apache.poi.hwpf;
  16. import java.io.ByteArrayInputStream;
  17. import java.io.ByteArrayOutputStream;
  18. import java.io.File;
  19. import java.io.IOException;
  20. import java.io.InputStream;
  21. import java.io.OutputStream;
  22. import java.security.GeneralSecurityException;
  23. import org.apache.poi.hpsf.DocumentSummaryInformation;
  24. import org.apache.poi.hpsf.SummaryInformation;
  25. import org.apache.poi.hwpf.model.BookmarksTables;
  26. import org.apache.poi.hwpf.model.CHPBinTable;
  27. import org.apache.poi.hwpf.model.ComplexFileTable;
  28. import org.apache.poi.hwpf.model.DocumentProperties;
  29. import org.apache.poi.hwpf.model.FSPADocumentPart;
  30. import org.apache.poi.hwpf.model.FSPATable;
  31. import org.apache.poi.hwpf.model.FieldsTables;
  32. import org.apache.poi.hwpf.model.FontTable;
  33. import org.apache.poi.hwpf.model.ListTables;
  34. import org.apache.poi.hwpf.model.NoteType;
  35. import org.apache.poi.hwpf.model.NotesTables;
  36. import org.apache.poi.hwpf.model.OfficeArtContent;
  37. import org.apache.poi.hwpf.model.PAPBinTable;
  38. import org.apache.poi.hwpf.model.PicturesTable;
  39. import org.apache.poi.hwpf.model.RevisionMarkAuthorTable;
  40. import org.apache.poi.hwpf.model.SavedByTable;
  41. import org.apache.poi.hwpf.model.SectionTable;
  42. import org.apache.poi.hwpf.model.SinglentonTextPiece;
  43. import org.apache.poi.hwpf.model.StyleSheet;
  44. import org.apache.poi.hwpf.model.SubdocumentType;
  45. import org.apache.poi.hwpf.model.TextPiece;
  46. import org.apache.poi.hwpf.model.TextPieceTable;
  47. import org.apache.poi.hwpf.model.io.HWPFFileSystem;
  48. import org.apache.poi.hwpf.usermodel.Bookmarks;
  49. import org.apache.poi.hwpf.usermodel.BookmarksImpl;
  50. import org.apache.poi.hwpf.usermodel.Field;
  51. import org.apache.poi.hwpf.usermodel.Fields;
  52. import org.apache.poi.hwpf.usermodel.FieldsImpl;
  53. import org.apache.poi.hwpf.usermodel.HWPFList;
  54. import org.apache.poi.hwpf.usermodel.Notes;
  55. import org.apache.poi.hwpf.usermodel.NotesImpl;
  56. import org.apache.poi.hwpf.usermodel.OfficeDrawings;
  57. import org.apache.poi.hwpf.usermodel.OfficeDrawingsImpl;
  58. import org.apache.poi.hwpf.usermodel.Range;
  59. import org.apache.poi.poifs.common.POIFSConstants;
  60. import org.apache.poi.poifs.crypt.ChunkedCipherOutputStream;
  61. import org.apache.poi.poifs.crypt.EncryptionInfo;
  62. import org.apache.poi.poifs.crypt.EncryptionMode;
  63. import org.apache.poi.poifs.crypt.Encryptor;
  64. import org.apache.poi.poifs.crypt.standard.EncryptionRecord;
  65. import org.apache.poi.poifs.filesystem.DirectoryNode;
  66. import org.apache.poi.poifs.filesystem.Entry;
  67. import org.apache.poi.poifs.filesystem.EntryUtils;
  68. import org.apache.poi.poifs.filesystem.POIFSFileSystem;
  69. import org.apache.poi.util.IOUtils;
  70. import org.apache.poi.util.Internal;
  71. import org.apache.poi.util.LittleEndianByteArrayOutputStream;
  72. /**
  73. * This class acts as the bucket that we throw all of the Word data structures
  74. * into.
  75. */
  76. public final class HWPFDocument extends HWPFDocumentCore {
  77. /*package*/ static final String PROPERTY_PRESERVE_BIN_TABLES = "org.apache.poi.hwpf.preserveBinTables";
  78. private static final String PROPERTY_PRESERVE_TEXT_TABLE = "org.apache.poi.hwpf.preserveTextTable";
  79. //arbitrarily selected; may need to increase
  80. private static final int DEFAULT_MAX_RECORD_LENGTH = 100_000;
  81. private static int MAX_RECORD_LENGTH = DEFAULT_MAX_RECORD_LENGTH;
  82. private static final String STREAM_DATA = "Data";
  83. /**
  84. * table stream buffer
  85. */
  86. private byte[] _tableStream;
  87. /**
  88. * data stream buffer
  89. */
  90. private byte[] _dataStream;
  91. /**
  92. * Document wide Properties
  93. */
  94. private DocumentProperties _dop;
  /**
   * Contains the text of the document wrapped in an obfuscated Word data
   * structure
   */
  99. private ComplexFileTable _cft;
  100. /**
  101. * Contains text buffer linked directly to single-piece document text piece
  102. */
  103. private StringBuilder _text;
  104. /**
  105. * Holds the save history for this document.
  106. */
  107. private SavedByTable _sbt;
  108. /**
  109. * Holds the revision mark authors for this document.
  110. */
  111. private RevisionMarkAuthorTable _rmat;
  /**
   * Holds FSPA (shape) information for the header part of the document
   */
  115. private FSPATable _fspaHeaders;
  /**
   * Holds FSPA (shape) information for the main part of the document
   */
  119. private FSPATable _fspaMain;
  120. /**
  121. * Office Art (Escher records) information
  122. */
  123. private final OfficeArtContent officeArtContent;
  124. /**
  125. * Holds pictures table
  126. */
  127. private PicturesTable _pictures;
  128. /**
  129. * Holds Office Art objects
  130. */
  131. private OfficeDrawingsImpl _officeDrawingsHeaders;
  132. /**
  133. * Holds Office Art objects
  134. */
  135. private OfficeDrawingsImpl _officeDrawingsMain;
  136. /**
  137. * Holds the bookmarks tables
  138. */
  139. private BookmarksTables _bookmarksTables;
  140. /**
  141. * Holds the bookmarks
  142. */
  143. private Bookmarks _bookmarks;
  /**
   * Holds the endnotes tables
   */
  147. private NotesTables _endnotesTables = new NotesTables(NoteType.ENDNOTE);
  /**
   * Holds the endnotes
   */
  151. private Notes _endnotes = new NotesImpl(_endnotesTables);
  152. /**
  153. * Holds the footnotes tables
  154. */
  155. private NotesTables _footnotesTables = new NotesTables(NoteType.FOOTNOTE);
  156. /**
  157. * Holds the footnotes
  158. */
  159. private Notes _footnotes = new NotesImpl(_footnotesTables);
  160. /**
  161. * Holds the fields PLCFs
  162. */
  163. private FieldsTables _fieldsTables;
  164. /**
  165. * Holds the fields
  166. */
  167. private Fields _fields;
  168. /**
  169. * @param length the max record length allowed for HWPFDocument
  170. */
  171. public static void setMaxRecordLength(int length) {
  172. MAX_RECORD_LENGTH = length;
  173. }
  174. /**
  175. * @return the max record length allowed for HWPFDocument
  176. */
  177. public static int getMaxRecordLength() {
  178. return MAX_RECORD_LENGTH;
  179. }
  /**
   * Loads a Word document from an {@link InputStream}.
   * <p>
   * The stream is first turned into a POIFS file system by
   * {@code verifyAndBuildPOIFS}; construction then continues through the
   * {@link POIFSFileSystem} constructor.
   *
   * @param istream The InputStream that contains the Word document.
   * @throws IOException If there is an unexpected IOException from the passed
   *         in InputStream.
   * @throws org.apache.poi.EmptyFileException If the given stream is empty
   * @throws RuntimeException a number of other runtime exceptions can be
   *         thrown, especially if there are problems with the input format
   */
  public HWPFDocument(InputStream istream) throws IOException {
    // OLE2 container handling happens inside verifyAndBuildPOIFS.
    this(verifyAndBuildPOIFS(istream));
  }
  /**
   * Loads a Word document from the root directory of a
   * {@link POIFSFileSystem}.
   *
   * @param pfilesystem The POIFSFileSystem that contains the Word document.
   * @throws IOException If there is an unexpected IOException from the passed
   *         in POIFSFileSystem.
   * @throws RuntimeException a number of runtime exceptions can be thrown,
   *         especially if there are problems with the input format
   */
  public HWPFDocument(POIFSFileSystem pfilesystem) throws IOException {
    // Delegate to the DirectoryNode constructor using the file system root.
    this(pfilesystem.getRoot());
  }
  /**
   * Loads a Word document from a specific point in a POIFSFileSystem,
   * probably not the default. Used typically to open embedded documents.
   * <p>
   * After the superclass has been initialised from the directory, the table
   * stream and the optional data stream are read and the document structures
   * (properties, text, character/paragraph bin tables, shapes, pictures,
   * sections, styles, fonts, lists, save history, revision authors,
   * bookmarks, notes and fields) are built from them.
   * <p>
   * Two system properties alter loading: when
   * {@code org.apache.poi.hwpf.preserveBinTables} parses as {@code true} the
   * CHPX/PAPX bin tables are not rebuilt, and when
   * {@code org.apache.poi.hwpf.preserveTextTable} parses as {@code true} the
   * text piece table read from the file is kept instead of being replaced by
   * a single text piece.
   *
   * @param directory The DirectoryNode that contains the Word document.
   * @throws IOException If there is an unexpected IOException from the passed
   *         in POIFSFileSystem.
   * @throws OldWordFileFormatException if the FIB reports an nFib below 106
   * @throws IllegalStateException if the table stream named by the FIB is
   *         missing from the directory
   * @throws RuntimeException a number of runtime exceptions can be thrown,
   *         especially if there are problems with the input format
   */
  public HWPFDocument(DirectoryNode directory) throws IOException {
    // Superclass loads the main stream and FIB, and also handles HPSF bits.
    super(directory);

    // Is this document too old for us?
    final int nFib = _fib.getFibBase().getNFib();
    if (nFib < 106) {
      throw new OldWordFileFormatException("The document is too old - Word 95 or older. Try HWPFOldDocument instead?");
    }

    // The FIB tells us which of the two table streams is the live one.
    final String tableStreamName;
    if (_fib.getFibBase().isFWhichTblStm()) {
      tableStreamName = STREAM_TABLE_1;
    } else {
      tableStreamName = STREAM_TABLE_0;
    }
    if (!directory.hasEntry(tableStreamName)) {
      throw new IllegalStateException("Table Stream '" + tableStreamName + "' wasn't found - Either the document is corrupt, or is Word95 (or earlier)");
    }

    // Read the table stream, then let the FIB pick up its variable-length part.
    _tableStream = getDocumentEntryBytes(tableStreamName, _fib.getFibBase().getLKey(), Integer.MAX_VALUE);
    _fib.fillVariableFields(_mainStream, _tableStream);

    // The data stream is optional; fall back to an empty buffer.
    if (directory.hasEntry(STREAM_DATA)) {
      _dataStream = getDocumentEntryBytes(STREAM_DATA, 0, Integer.MAX_VALUE);
    } else {
      _dataStream = new byte[0];
    }

    // Start of text in the main stream; the latest spec doc says this is
    // always zero, so the FIB value is not consulted.
    final int fcMin = 0;

    // Start to load up our standard structures.
    _dop = new DocumentProperties(_tableStream, _fib.getFcDop(), _fib.getLcbDop());
    _cft = new ComplexFileTable(_mainStream, _tableStream, _fib.getFcClx(), fcMin);
    TextPieceTable pieceTable = _cft.getTextPieceTable();

    // These need the piece table to be adjusted for where text really begins.
    _cbt = new CHPBinTable(_mainStream, _tableStream,
        _fib.getFcPlcfbteChpx(), _fib.getLcbPlcfbteChpx(), pieceTable);
    _pbt = new PAPBinTable(_mainStream, _tableStream, _dataStream,
        _fib.getFcPlcfbtePapx(), _fib.getLcbPlcfbtePapx(), pieceTable);
    _text = pieceTable.getText();

    // In preserve mode the PAPX/CHPX structure from the file is kept as is,
    // so text may be missing from output and text order may be corrupted.
    if (!isSystemPropertyEnabled(PROPERTY_PRESERVE_BIN_TABLES)) {
      _cbt.rebuild(_cft);
      _pbt.rebuild(_text, _cft);
    }

    // Unless text rebuilding is disabled, swap in a fresh table holding one
    // text piece. With rebuilding disabled, changing the text will lead to
    // unpredictable behavior.
    if (!isSystemPropertyEnabled(PROPERTY_PRESERVE_TEXT_TABLE)) {
      _cft = new ComplexFileTable();
      pieceTable = _cft.getTextPieceTable();
      final TextPiece singlePiece = new SinglentonTextPiece(_text);
      pieceTable.add(singlePiece);
      _text = singlePiece.getStringBuilder();
    }

    // Shape (FSPA) tables for headers and main text, plus Office Art content.
    _fspaHeaders = new FSPATable(_tableStream, _fib, FSPADocumentPart.HEADER);
    _fspaMain = new FSPATable(_tableStream, _fib, FSPADocumentPart.MAIN);
    officeArtContent = new OfficeArtContent(_tableStream, _fib.getFcDggInfo(), _fib.getLcbDggInfo());

    // Pictures, then the Escher drawings for both document parts.
    _pictures = new PicturesTable(this, _dataStream, _mainStream, _fspaMain, officeArtContent);
    _officeDrawingsHeaders = new OfficeDrawingsImpl(_fspaHeaders, officeArtContent, _mainStream);
    _officeDrawingsMain = new OfficeDrawingsImpl(_fspaMain, officeArtContent, _mainStream);

    // Sections, styles and fonts.
    _st = new SectionTable(_mainStream, _tableStream,
        _fib.getFcPlcfsed(), _fib.getLcbPlcfsed(), fcMin, pieceTable,
        _fib.getSubdocumentTextStreamLength(SubdocumentType.MAIN));
    _ss = new StyleSheet(_tableStream, _fib.getFcStshf());
    _ft = new FontTable(_tableStream, _fib.getFcSttbfffn(), _fib.getLcbSttbfffn());

    // List tables are only built when the FIB gives a non-zero offset and length.
    final int listOffset = _fib.getFcPlfLst();
    if (listOffset != 0 && _fib.getLcbPlfLst() != 0) {
      _lt = new ListTables(_tableStream, listOffset, _fib.getFcPlfLfo(), _fib.getLcbPlfLfo());
    }

    // Save history, same non-zero offset/length rule.
    final int savedByOffset = _fib.getFcSttbSavedBy();
    final int savedByLength = _fib.getLcbSttbSavedBy();
    if (savedByOffset != 0 && savedByLength != 0) {
      _sbt = new SavedByTable(_tableStream, savedByOffset, savedByLength);
    }

    // Revision mark authors, same rule again.
    final int revisionAuthorsOffset = _fib.getFcSttbfRMark();
    final int revisionAuthorsLength = _fib.getLcbSttbfRMark();
    if (revisionAuthorsOffset != 0 && revisionAuthorsLength != 0) {
      _rmat = new RevisionMarkAuthorTable(_tableStream, revisionAuthorsOffset, revisionAuthorsLength);
    }

    // Bookmarks, notes and fields, each with its user-facing wrapper.
    _bookmarksTables = new BookmarksTables(_tableStream, _fib);
    _bookmarks = new BookmarksImpl(_bookmarksTables);

    _endnotesTables = new NotesTables(NoteType.ENDNOTE, _tableStream, _fib);
    _endnotes = new NotesImpl(_endnotesTables);
    _footnotesTables = new NotesTables(NoteType.FOOTNOTE, _tableStream, _fib);
    _footnotes = new NotesImpl(_footnotesTables);

    _fieldsTables = new FieldsTables(_tableStream, _fib);
    _fields = new FieldsImpl(_fieldsTables);
  }

  /**
   * Reads a system property and parses it as a boolean. Any exception raised
   * while reading the property is deliberately swallowed and treated as
   * {@code false}.
   *
   * @param propertyName name of the system property to read
   * @return the parsed value, or {@code false} if reading the property failed
   */
  private static boolean isSystemPropertyEnabled(String propertyName) {
    try {
      return Boolean.parseBoolean(System.getProperty(propertyName));
    } catch (Exception ignored) {
      // best effort: a property that cannot be read counts as "not set"
      return false;
    }
  }
  320. @Override
  321. @Internal
  322. public TextPieceTable getTextTable() {
  323. return _cft.getTextPieceTable();
  324. }
  325. @Internal
  326. @Override
  327. public StringBuilder getText() {
  328. return _text;
  329. }
  330. public DocumentProperties getDocProperties() {
  331. return _dop;
  332. }
  333. @Override
  334. public Range getOverallRange() {
  335. return new Range(0, _text.length(), this);
  336. }
  337. /**
  338. * Returns the range which covers the whole of the document, but excludes
  339. * any headers and footers.
  340. */
  341. @Override
  342. public Range getRange() {
  343. // // First up, trigger a full-recalculate
  344. // // Needed in case of deletes etc
  345. // getOverallRange();
  346. //
  347. // if ( getFileInformationBlock().isFComplex() )
  348. // {
  349. // /*
  350. // * Page 31:
  351. // *
  352. // * main document must be found by examining the piece table entries
  353. // * from the 0th piece table entry from the piece table entry that
  354. // * describes cp=fib.ccpText.
  355. // */
  356. // // TODO: review
  357. // return new Range( _cpSplit.getMainDocumentStart(),
  358. // _cpSplit.getMainDocumentEnd(), this );
  359. // }
  360. //
  361. // /*
  362. // * Page 31:
  363. // *
  364. // * "In a non-complex file, this means text of the: main document
  365. // begins
  366. // * at fib.fcMin in the file and continues through
  367. // * fib.fcMin+fib.ccpText."
  368. // */
  369. // int bytesStart = getFileInformationBlock().getFcMin();
  370. //
  371. // int charsStart = getTextTable().getCharIndex( bytesStart );
  372. // int charsEnd = charsStart
  373. // + getFileInformationBlock().getSubdocumentTextStreamLength(
  374. // SubdocumentType.MAIN );
  375. // it seems much simpler -- sergey
  376. return getRange(SubdocumentType.MAIN);
  377. }
  378. private Range getRange(SubdocumentType subdocument) {
  379. int startCp = 0;
  380. for (SubdocumentType previos : SubdocumentType.ORDERED) {
  381. int length = getFileInformationBlock()
  382. .getSubdocumentTextStreamLength(previos);
  383. if (subdocument == previos) {
  384. return new Range(startCp, startCp + length, this);
  385. }
  386. startCp += length;
  387. }
  388. throw new UnsupportedOperationException(
  389. "Subdocument type not supported: " + subdocument);
  390. }
  391. /**
  392. * Returns the {@link Range} which covers all the Footnotes.
  393. *
  394. * @return the {@link Range} which covers all the Footnotes.
  395. */
  396. public Range getFootnoteRange() {
  397. return getRange(SubdocumentType.FOOTNOTE);
  398. }
  399. /**
  400. * Returns the {@link Range} which covers all endnotes.
  401. *
  402. * @return the {@link Range} which covers all endnotes.
  403. */
  404. public Range getEndnoteRange() {
  405. return getRange(SubdocumentType.ENDNOTE);
  406. }
  407. /**
  408. * Returns the {@link Range} which covers all annotations.
  409. *
  410. * @return the {@link Range} which covers all annotations.
  411. */
  412. public Range getCommentsRange() {
  413. return getRange(SubdocumentType.ANNOTATION);
  414. }
  415. /**
  416. * Returns the {@link Range} which covers all textboxes.
  417. *
  418. * @return the {@link Range} which covers all textboxes.
  419. */
  420. public Range getMainTextboxRange() {
  421. return getRange(SubdocumentType.TEXTBOX);
  422. }
  423. /**
  424. * Returns the range which covers all "Header Stories".
  425. * A header story contains a header, footer, end note
  426. * separators and footnote separators.
  427. */
  428. public Range getHeaderStoryRange() {
  429. return getRange(SubdocumentType.HEADER);
  430. }
  431. /**
  432. * Returns the character length of a document.
  433. *
  434. * @return the character length of a document
  435. */
  436. public int characterLength() {
  437. return _text.length();
  438. }
  439. /**
  440. * Gets a reference to the saved -by table, which holds the save history for the document.
  441. *
  442. * @return the saved-by table.
  443. */
  444. @Internal
  445. public SavedByTable getSavedByTable() {
  446. return _sbt;
  447. }
  448. /**
  449. * Gets a reference to the revision mark author table, which holds the revision mark authors for the document.
  450. *
  451. * @return the saved-by table.
  452. */
  453. @Internal
  454. public RevisionMarkAuthorTable getRevisionMarkAuthorTable() {
  455. return _rmat;
  456. }
  457. /**
  458. * @return PicturesTable object, that is able to extract images from this document
  459. */
  460. public PicturesTable getPicturesTable() {
  461. return _pictures;
  462. }
  463. @Internal
  464. public OfficeArtContent getOfficeArtContent() {
  465. return officeArtContent;
  466. }
  467. public OfficeDrawings getOfficeDrawingsHeaders() {
  468. return _officeDrawingsHeaders;
  469. }
  470. public OfficeDrawings getOfficeDrawingsMain() {
  471. return _officeDrawingsMain;
  472. }
  473. /**
  474. * @return user-friendly interface to access document bookmarks
  475. */
  476. public Bookmarks getBookmarks() {
  477. return _bookmarks;
  478. }
  479. /**
  480. * @return user-friendly interface to access document endnotes
  481. */
  482. public Notes getEndnotes() {
  483. return _endnotes;
  484. }
  485. /**
  486. * @return user-friendly interface to access document footnotes
  487. */
  488. public Notes getFootnotes() {
  489. return _footnotes;
  490. }
  491. /**
  492. * Returns user-friendly interface to access document {@link Field}s
  493. *
  494. * @return user-friendly interface to access document {@link Field}s
  495. */
  496. public Fields getFields() {
  497. return _fields;
  498. }
  499. /**
  500. * Write out the word file that is represented by this class, to the
  501. * currently open {@link File}, via the writeable {@link POIFSFileSystem}
  502. * it was opened as.
  503. *
  504. * <p>This will fail (with an {@link IllegalStateException} if the
  505. * Document was opened read-only, opened from an {@link InputStream}
  506. * instead of a File, or if this is not the root document. For those cases,
  507. * you must use {@link #write(OutputStream)} or {@link #write(File)} to
  508. * write to a brand new document.
  509. *
  510. * @since 3.15
  511. */
  512. @Override
  513. public void write() throws IOException {
  514. validateInPlaceWritePossible();
  515. // Update the Document+Properties streams in the file
  516. write(getDirectory().getFileSystem(), false);
  517. // Sync with the File on disk
  518. getDirectory().getFileSystem().writeFilesystem();
  519. }
  520. /**
  521. * Writes out the word file that is represented by an instance of this class.
  522. * <p>
  523. * If the {@link File} exists, it will be replaced, otherwise a new one
  524. * will be created
  525. *
  526. * @param newFile The File to write to.
  527. * @throws IOException If there is an unexpected IOException from writing
  528. * to the File.
  529. * @since 3.15 beta 3
  530. */
  531. @Override
  532. public void write(File newFile) throws IOException {
  533. POIFSFileSystem pfs = POIFSFileSystem.create(newFile);
  534. write(pfs, true);
  535. pfs.writeFilesystem();
  536. }
  537. /**
  538. * Writes out the word file that is represented by an instance of this class.
  539. * <p>
  540. * For better performance when writing to files, use {@link #write(File)}.
  541. * If {@code stream} has a high cost/latency associated with each written byte,
  542. * consider wrapping the OutputStream in a {@link java.io.BufferedOutputStream}
  543. * to improve write performance.
  544. *
  545. * @param out The OutputStream to write to.
  546. * @throws IOException If there is an unexpected IOException from the passed
  547. * in OutputStream.
  548. */
  549. @Override
  550. public void write(OutputStream out) throws IOException {
  551. POIFSFileSystem pfs = new POIFSFileSystem();
  552. write(pfs, true);
  553. pfs.writeFilesystem(out);
  554. }
  555. private void write(POIFSFileSystem pfs, boolean copyOtherEntries) throws IOException {
  556. // clear the offsets and sizes in our FileInformationBlock.
  557. _fib.clearOffsetsSizes();
  558. // determine the FileInformationBLock size
  559. int fibSize = _fib.getSize();
  560. fibSize += POIFSConstants.SMALLER_BIG_BLOCK_SIZE - (fibSize % POIFSConstants.SMALLER_BIG_BLOCK_SIZE);
  561. // initialize our streams for writing.
  562. HWPFFileSystem docSys = new HWPFFileSystem();
  563. ByteArrayOutputStream wordDocumentStream = docSys.getStream(STREAM_WORD_DOCUMENT);
  564. ByteArrayOutputStream tableStream = docSys.getStream(STREAM_TABLE_1);
  565. // preserve space for the FileInformationBlock because we will be writing
  566. // it after we write everything else.
  567. byte[] placeHolder = IOUtils.safelyAllocate(fibSize, MAX_RECORD_LENGTH);
  568. wordDocumentStream.write(placeHolder);
  569. int mainOffset = wordDocumentStream.size();
  570. int tableOffset = 0;
  571. // write out EncryptionInfo
  572. updateEncryptionInfo();
  573. EncryptionInfo ei = getEncryptionInfo();
  574. if (ei != null) {
  575. byte[] buf = new byte[1000];
  576. LittleEndianByteArrayOutputStream leos = new LittleEndianByteArrayOutputStream(buf, 0);
  577. leos.writeShort(ei.getVersionMajor());
  578. leos.writeShort(ei.getVersionMinor());
  579. if (ei.getEncryptionMode() == EncryptionMode.cryptoAPI) {
  580. leos.writeInt(ei.getEncryptionFlags());
  581. }
  582. ((EncryptionRecord) ei.getHeader()).write(leos);
  583. ((EncryptionRecord) ei.getVerifier()).write(leos);
  584. tableStream.write(buf, 0, leos.getWriteIndex());
  585. tableOffset += leos.getWriteIndex();
  586. _fib.getFibBase().setLKey(tableOffset);
  587. }
  588. // write out the StyleSheet.
  589. _fib.setFcStshf(tableOffset);
  590. _ss.writeTo(tableStream);
  591. _fib.setLcbStshf(tableStream.size() - tableOffset);
  592. tableOffset = tableStream.size();
  593. // get fcMin and fcMac because we will be writing the actual text with the
  594. // complex table.
  595. /*
  596. * clx (encoding of the sprm lists for a complex file and piece table
  597. * for a any file) Written immediately after the end of the previously
  598. * recorded structure. This is recorded in all Word documents
  599. *
  600. * Microsoft Office Word 97-2007 Binary File Format (.doc)
  601. * Specification; Page 23 of 210
  602. */
  603. // write out the Complex table, includes text.
  604. _fib.setFcClx(tableOffset);
  605. _cft.writeTo(wordDocumentStream, tableStream);
  606. _fib.setLcbClx(tableStream.size() - tableOffset);
  607. tableOffset = tableStream.size();
  608. int fcMac = wordDocumentStream.size();
  609. /*
  610. * dop (document properties record) Written immediately after the end of
  611. * the previously recorded structure. This is recorded in all Word
  612. * documents
  613. *
  614. * Microsoft Office Word 97-2007 Binary File Format (.doc)
  615. * Specification; Page 23 of 210
  616. */
  617. // write out the DocumentProperties.
  618. _fib.setFcDop(tableOffset);
  619. _dop.writeTo(tableStream);
  620. _fib.setLcbDop(tableStream.size() - tableOffset);
  621. tableOffset = tableStream.size();
  622. /*
  623. * plcfBkmkf (table recording beginning CPs of bookmarks) Written
  624. * immediately after the sttbfBkmk, if the document contains bookmarks.
  625. *
  626. * Microsoft Office Word 97-2007 Binary File Format (.doc)
  627. * Specification; Page 24 of 210
  628. */
  629. if (_bookmarksTables != null) {
  630. _bookmarksTables.writePlcfBkmkf(_fib, tableStream);
  631. tableOffset = tableStream.size();
  632. }
  633. /*
  634. * plcfBkmkl (table recording limit CPs of bookmarks) Written
  635. * immediately after the plcfBkmkf, if the document contains bookmarks.
  636. *
  637. * Microsoft Office Word 97-2007 Binary File Format (.doc)
  638. * Specification; Page 24 of 210
  639. */
  640. if (_bookmarksTables != null) {
  641. _bookmarksTables.writePlcfBkmkl(_fib, tableStream);
  642. tableOffset = tableStream.size();
  643. }
  644. /*
  645. * plcfbteChpx (bin table for CHP FKPs) Written immediately after the
  646. * previously recorded table. This is recorded in all Word documents.
  647. *
  648. * Microsoft Office Word 97-2007 Binary File Format (.doc)
  649. * Specification; Page 24 of 210
  650. */
  651. // write out the CHPBinTable.
  652. _fib.setFcPlcfbteChpx(tableOffset);
  653. _cbt.writeTo(wordDocumentStream, tableStream, mainOffset, _cft.getTextPieceTable());
  654. _fib.setLcbPlcfbteChpx(tableStream.size() - tableOffset);
  655. tableOffset = tableStream.size();
  656. /*
  657. * plcfbtePapx (bin table for PAP FKPs) Written immediately after the
  658. * plcfbteChpx. This is recorded in all Word documents.
  659. *
  660. * Microsoft Office Word 97-2007 Binary File Format (.doc)
  661. * Specification; Page 24 of 210
  662. */
  663. // write out the PAPBinTable.
  664. _fib.setFcPlcfbtePapx(tableOffset);
  665. _pbt.writeTo(wordDocumentStream, tableStream, _cft.getTextPieceTable());
  666. _fib.setLcbPlcfbtePapx(tableStream.size() - tableOffset);
  667. tableOffset = tableStream.size();
  668. /*
  669. * plcfendRef (endnote reference position table) Written immediately
  670. * after the previously recorded table if the document contains endnotes
  671. *
  672. * plcfendTxt (endnote text position table) Written immediately after
  673. * the plcfendRef if the document contains endnotes
  674. *
  675. * Microsoft Office Word 97-2007 Binary File Format (.doc)
  676. * Specification; Page 24 of 210
  677. */
  678. _endnotesTables.writeRef(_fib, tableStream);
  679. _endnotesTables.writeTxt(_fib, tableStream);
  680. tableOffset = tableStream.size();
  681. /*
  682. * plcffld*** (table of field positions and statuses for annotation
  683. * subdocument) Written immediately after the previously recorded table,
  684. * if the ******* subdocument contains fields.
  685. *
  686. * Microsoft Office Word 97-2007 Binary File Format (.doc)
  687. * Specification; Page 24 of 210
  688. */
  689. if (_fieldsTables != null) {
  690. _fieldsTables.write(_fib, tableStream);
  691. tableOffset = tableStream.size();
  692. }
  693. /*
  694. * plcffndRef (footnote reference position table) Written immediately
  695. * after the stsh if the document contains footnotes
  696. *
  697. * plcffndTxt (footnote text position table) Written immediately after
  698. * the plcffndRef if the document contains footnotes
  699. *
  700. * Microsoft Office Word 97-2007 Binary File Format (.doc)
  701. * Specification; Page 24 of 210
  702. */
  703. _footnotesTables.writeRef(_fib, tableStream);
  704. _footnotesTables.writeTxt(_fib, tableStream);
  705. tableOffset = tableStream.size();
  706. /*
  707. * plcfsed (section table) Written immediately after the previously
  708. * recorded table. Recorded in all Word documents
  709. *
  710. * Microsoft Office Word 97-2007 Binary File Format (.doc)
  711. * Specification; Page 25 of 210
  712. */
  713. // write out the SectionTable.
  714. _fib.setFcPlcfsed(tableOffset);
  715. _st.writeTo(wordDocumentStream, tableStream);
  716. _fib.setLcbPlcfsed(tableStream.size() - tableOffset);
  717. tableOffset = tableStream.size();
  718. // write out the list tables
  719. if (_lt != null) {
  720. /*
  721. * plcflst (list formats) Written immediately after the end of the
  722. * previously recorded, if there are any lists defined in the
  723. * document. This begins with a short count of LSTF structures
  724. * followed by those LSTF structures. This is immediately followed
  725. * by the allocated data hanging off the LSTFs. This data consists
  726. * of the array of LVLs for each LSTF. (Each LVL consists of an LVLF
  727. * followed by two grpprls and an XST.)
  728. *
  729. * Microsoft Office Word 97-2007 Binary File Format (.doc)
  730. * Specification; Page 25 of 210
  731. */
  732. _lt.writeListDataTo(_fib, tableStream);
  733. tableOffset = tableStream.size();
  734. /*
  735. * plflfo (more list formats) Written immediately after the end of
  736. * the plcflst and its accompanying data, if there are any lists
  737. * defined in the document. This consists first of a PL of LFO
  738. * records, followed by the allocated data (if any) hanging off the
  739. * LFOs. The allocated data consists of the array of LFOLVLFs for
  740. * each LFO (and each LFOLVLF is immediately followed by some LVLs).
  741. *
  742. * Microsoft Office Word 97-2007 Binary File Format (.doc)
  743. * Specification; Page 26 of 210
  744. */
  745. _lt.writeListOverridesTo(_fib, tableStream);
  746. tableOffset = tableStream.size();
  747. }
  748. /*
  749. * sttbfBkmk (table of bookmark name strings) Written immediately after
  750. * the previously recorded table, if the document contains bookmarks.
  751. *
  752. * Microsoft Office Word 97-2007 Binary File Format (.doc)
  753. * Specification; Page 27 of 210
  754. */
  755. if (_bookmarksTables != null) {
  756. _bookmarksTables.writeSttbfBkmk(_fib, tableStream);
  757. tableOffset = tableStream.size();
  758. }
  759. /*
  760. * sttbSavedBy (last saved by string table) Written immediately after
  761. * the previously recorded table.
  762. *
  763. * Microsoft Office Word 97-2007 Binary File Format (.doc)
  764. * Specification; Page 27 of 210
  765. */
  766. // write out the saved-by table.
  767. if (_sbt != null) {
  768. _fib.setFcSttbSavedBy(tableOffset);
  769. _sbt.writeTo(tableStream);
  770. _fib.setLcbSttbSavedBy(tableStream.size() - tableOffset);
  771. tableOffset = tableStream.size();
  772. }
  773. // write out the revision mark authors table.
  774. if (_rmat != null) {
  775. _fib.setFcSttbfRMark(tableOffset);
  776. _rmat.writeTo(tableStream);
  777. _fib.setLcbSttbfRMark(tableStream.size() - tableOffset);
  778. tableOffset = tableStream.size();
  779. }
  780. // write out the FontTable.
  781. _fib.setFcSttbfffn(tableOffset);
  782. _ft.writeTo(tableStream);
  783. _fib.setLcbSttbfffn(tableStream.size() - tableOffset);
  784. tableOffset = tableStream.size();
  785. // set some variables in the FileInformationBlock.
  786. _fib.getFibBase().setFcMin(mainOffset);
  787. _fib.getFibBase().setFcMac(fcMac);
  788. _fib.setCbMac(wordDocumentStream.size());
  789. // make sure that the table, doc and data streams use big blocks.
  790. byte[] mainBuf = fillUp4096(wordDocumentStream);
  791. // Table1 stream will be used
  792. _fib.getFibBase().setFWhichTblStm(true);
  793. // write out the FileInformationBlock.
  794. //_fib.serialize(mainBuf, 0);
  795. _fib.writeTo(mainBuf, tableStream);
  796. byte[] tableBuf = fillUp4096(tableStream);
  797. byte[] dataBuf = fillUp4096(_dataStream);
  798. // Create a new document - ignoring the order of the old entries
  799. if (ei == null) {
  800. write(pfs, mainBuf, STREAM_WORD_DOCUMENT);
  801. write(pfs, tableBuf, STREAM_TABLE_1);
  802. write(pfs, dataBuf, STREAM_DATA);
  803. } else {
  804. ByteArrayOutputStream bos = new ByteArrayOutputStream(100000);
  805. encryptBytes(mainBuf, FIB_BASE_LEN, bos);
  806. write(pfs, bos.toByteArray(), STREAM_WORD_DOCUMENT);
  807. bos.reset();
  808. encryptBytes(tableBuf, _fib.getFibBase().getLKey(), bos);
  809. write(pfs, bos.toByteArray(), STREAM_TABLE_1);
  810. bos.reset();
  811. encryptBytes(dataBuf, 0, bos);
  812. write(pfs, bos.toByteArray(), STREAM_DATA);
  813. bos.reset();
  814. }
  815. writeProperties(pfs);
  816. if (copyOtherEntries && ei == null) {
  817. // For encrypted files:
  818. // The ObjectPool storage MUST NOT be present and if the file contains OLE objects, the storage
  819. // objects for the OLE objects MUST be stored in the Data stream as specified in sprmCPicLocation.
  820. DirectoryNode newRoot = pfs.getRoot();
  821. _objectPool.writeTo(newRoot);
  822. for (Entry entry : getDirectory()) {
  823. String entryName = entry.getName();
  824. if (!(
  825. STREAM_WORD_DOCUMENT.equals(entryName) ||
  826. STREAM_TABLE_0.equals(entryName) ||
  827. STREAM_TABLE_1.equals(entryName) ||
  828. STREAM_DATA.equals(entryName) ||
  829. STREAM_OBJECT_POOL.equals(entryName) ||
  830. SummaryInformation.DEFAULT_STREAM_NAME.equals(entryName) ||
  831. DocumentSummaryInformation.DEFAULT_STREAM_NAME.equals(entryName)
  832. )) {
  833. EntryUtils.copyNodeRecursively(entry, newRoot);
  834. }
  835. }
  836. }
  837. /*
  838. * since we updated all references in FIB and etc, using new arrays to
  839. * access data
  840. */
  841. replaceDirectory(pfs.getRoot());
  842. this._tableStream = tableStream.toByteArray();
  843. this._dataStream = dataBuf;
  844. }
  845. private void encryptBytes(byte[] plain, int encryptOffset, OutputStream bos) throws IOException {
  846. try {
  847. EncryptionInfo ei = getEncryptionInfo();
  848. Encryptor enc = ei.getEncryptor();
  849. enc.setChunkSize(RC4_REKEYING_INTERVAL);
  850. ChunkedCipherOutputStream os = enc.getDataStream(bos, 0);
  851. if (encryptOffset > 0) {
  852. os.writePlain(plain, 0, encryptOffset);
  853. }
  854. os.write(plain, encryptOffset, plain.length - encryptOffset);
  855. os.close();
  856. } catch (GeneralSecurityException e) {
  857. throw new IOException(e);
  858. }
  859. }
  860. private static byte[] fillUp4096(byte[] buf) {
  861. if (buf == null) {
  862. return new byte[4096];
  863. } else if (buf.length < 4096) {
  864. ByteArrayOutputStream bos = new ByteArrayOutputStream(4096);
  865. bos.write(buf, 0, buf.length);
  866. return fillUp4096(bos);
  867. } else {
  868. return buf;
  869. }
  870. }
  871. private static byte[] fillUp4096(ByteArrayOutputStream bos) {
  872. int fillSize = 4096 - bos.size();
  873. if (fillSize > 0) {
  874. bos.write(new byte[fillSize], 0, fillSize);
  875. }
  876. return bos.toByteArray();
  877. }
  878. private static void write(POIFSFileSystem pfs, byte[] data, String name) throws IOException {
  879. pfs.createOrUpdateDocument(new ByteArrayInputStream(data), name);
  880. }
  881. @Internal
  882. public byte[] getDataStream() {
  883. return _dataStream;
  884. }
  885. @Internal
  886. public byte[] getTableStream() {
  887. return _tableStream;
  888. }
  889. public int registerList(HWPFList list) {
  890. if (_lt == null) {
  891. _lt = new ListTables();
  892. }
  893. return _lt.addList(list.getListData(), list.getLFO(),
  894. list.getLFOData());
  895. }
  896. public void delete(int start, int length) {
  897. Range r = new Range(start, start + length, this);
  898. r.delete();
  899. }
  900. }