Du kan inte välja fler än 25 ämnen Ämnen måste starta med en bokstav eller siffra, kan innehålla bindestreck ('-') och vara max 35 tecken långa.

HWPFOldDocument.java 8.9KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250
/* ====================================================================
   Licensed to the Apache Software Foundation (ASF) under one or more
   contributor license agreements.  See the NOTICE file distributed with
   this work for additional information regarding copyright ownership.
   The ASF licenses this file to You under the Apache License, Version 2.0
   (the "License"); you may not use this file except in compliance with
   the License.  You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
==================================================================== */
  15. package org.apache.poi.hwpf;
  16. import java.io.File;
  17. import java.io.IOException;
  18. import java.io.OutputStream;
  19. import java.nio.charset.Charset;
  20. import org.apache.poi.hwmf.record.HwmfFont;
  21. import org.apache.poi.hwpf.model.ComplexFileTable;
  22. import org.apache.poi.hwpf.model.FontTable;
  23. import org.apache.poi.hwpf.model.OldCHPBinTable;
  24. import org.apache.poi.hwpf.model.OldComplexFileTable;
  25. import org.apache.poi.hwpf.model.OldFfn;
  26. import org.apache.poi.hwpf.model.OldFontTable;
  27. import org.apache.poi.hwpf.model.OldPAPBinTable;
  28. import org.apache.poi.hwpf.model.OldSectionTable;
  29. import org.apache.poi.hwpf.model.OldTextPieceTable;
  30. import org.apache.poi.hwpf.model.PieceDescriptor;
  31. import org.apache.poi.hwpf.model.TextPiece;
  32. import org.apache.poi.hwpf.model.TextPieceTable;
  33. import org.apache.poi.hwpf.usermodel.Range;
  34. import org.apache.poi.poifs.filesystem.DirectoryNode;
  35. import org.apache.poi.poifs.filesystem.POIFSFileSystem;
  36. import org.apache.poi.util.CodePageUtil;
  37. import org.apache.poi.util.LittleEndian;
  38. import org.apache.poi.util.NotImplemented;
  39. import org.apache.poi.util.StringUtil;
  40. /**
  41. * Provides very simple support for old (Word 6 / Word 95)
  42. * files.
  43. */
  44. public class HWPFOldDocument extends HWPFDocumentCore {
  45. private final static Charset DEFAULT_CHARSET = StringUtil.WIN_1252;
  46. private OldTextPieceTable tpt;
  47. private StringBuilder _text;
  48. private final OldFontTable fontTable;
  49. private final Charset guessedCharset;
  50. public HWPFOldDocument(POIFSFileSystem fs) throws IOException {
  51. this(fs.getRoot());
  52. }
  53. public HWPFOldDocument(DirectoryNode directory)
  54. throws IOException {
  55. super(directory);
  56. // Where are things?
  57. int sedTableOffset = LittleEndian.getInt(_mainStream, 0x88);
  58. int sedTableSize = LittleEndian.getInt(_mainStream, 0x8c);
  59. int chpTableOffset = LittleEndian.getInt(_mainStream, 0xb8);
  60. int chpTableSize = LittleEndian.getInt(_mainStream, 0xbc);
  61. int papTableOffset = LittleEndian.getInt(_mainStream, 0xc0);
  62. int papTableSize = LittleEndian.getInt(_mainStream, 0xc4);
  63. int fontTableOffset = LittleEndian.getInt(_mainStream, 0xd0);
  64. int fontTableSize = LittleEndian.getInt(_mainStream, 0xd4);
  65. fontTable = new OldFontTable(_mainStream, fontTableOffset, fontTableSize);
  66. //TODO: figure out how to map runs/text pieces to fonts
  67. //for now, if there's a non standard codepage in one of the fonts
  68. //assume that the doc is in that codepage.
  69. guessedCharset = guessCodePage(fontTable);
  70. int complexTableOffset = LittleEndian.getInt(_mainStream, 0x160);
  71. // We need to get hold of the text that makes up the
  72. // document, which might be regular or fast-saved
  73. ComplexFileTable cft = null;
  74. if(_fib.getFibBase().isFComplex()) {
  75. cft = new OldComplexFileTable(
  76. _mainStream, _mainStream,
  77. complexTableOffset, _fib.getFibBase().getFcMin(), guessedCharset
  78. );
  79. tpt = (OldTextPieceTable)cft.getTextPieceTable();
  80. } else {
  81. // TODO Discover if these older documents can ever hold Unicode Strings?
  82. // (We think not, because they seem to lack a Piece table)
  83. // TODO Build the Piece Descriptor properly
  84. // (We have to fake it, as they don't seem to have a proper Piece table)
  85. PieceDescriptor pd = new PieceDescriptor(new byte[] {0,0, 0,0,0,127, 0,0}, 0, guessedCharset);
  86. pd.setFilePosition(_fib.getFibBase().getFcMin());
  87. // Generate a single Text Piece Table, with a single Text Piece
  88. // which covers all the (8 bit only) text in the file
  89. tpt = new OldTextPieceTable();
  90. byte[] textData = new byte[_fib.getFibBase().getFcMac()-_fib.getFibBase().getFcMin()];
  91. System.arraycopy(_mainStream, _fib.getFibBase().getFcMin(), textData, 0, textData.length);
  92. int numChars = textData.length;
  93. if (CodePageUtil.VARIABLE_BYTE_CHARSETS.contains(guessedCharset)) {
  94. numChars /= 2;
  95. }
  96. TextPiece tp = new TextPiece(
  97. 0, numChars, textData, pd
  98. );
  99. tpt.add(tp);
  100. }
  101. _text = tpt.getText();
  102. // Now we can fetch the character and paragraph properties
  103. _cbt = new OldCHPBinTable(
  104. _mainStream, chpTableOffset, chpTableSize,
  105. _fib.getFibBase().getFcMin(), tpt
  106. );
  107. _pbt = new OldPAPBinTable(
  108. _mainStream, papTableOffset, papTableSize,
  109. _fib.getFibBase().getFcMin(), tpt
  110. );
  111. _st = new OldSectionTable(
  112. _mainStream, sedTableOffset, sedTableSize,
  113. _fib.getFibBase().getFcMin(), tpt
  114. );
  115. /*
  116. * in this mode we preserving PAPX/CHPX structure from file, so text may
  117. * miss from output, and text order may be corrupted
  118. */
  119. boolean preserveBinTables = false;
  120. try
  121. {
  122. preserveBinTables = Boolean.parseBoolean( System
  123. .getProperty( HWPFDocument.PROPERTY_PRESERVE_BIN_TABLES ) );
  124. }
  125. catch ( Exception exc )
  126. {
  127. // ignore;
  128. }
  129. if ( !preserveBinTables )
  130. {
  131. _cbt.rebuild( cft );
  132. _pbt.rebuild( _text, cft );
  133. }
  134. }
  135. /**
  136. * Take the first codepage that is not default, ansi or symbol.
  137. * Ideally, we'd want to track fonts with runs, but we don't yet
  138. * know how to do that.
  139. *
  140. * Consider throwing an exception if > 1 unique codepage that is not default, symbol or ansi
  141. * appears here.
  142. *
  143. * @param fontTable
  144. * @return
  145. */
  146. private Charset guessCodePage(OldFontTable fontTable) {
  147. for (OldFfn oldFfn : fontTable.getFontNames()) {
  148. HwmfFont.WmfCharset wmfCharset = HwmfFont.WmfCharset.valueOf(oldFfn.getChs()& 0xff);
  149. if (wmfCharset != null &&
  150. wmfCharset != HwmfFont.WmfCharset.ANSI_CHARSET &&
  151. wmfCharset != HwmfFont.WmfCharset.DEFAULT_CHARSET &&
  152. wmfCharset != HwmfFont.WmfCharset.SYMBOL_CHARSET ) {
  153. return wmfCharset.getCharset();
  154. }
  155. }
  156. return DEFAULT_CHARSET;
  157. }
  158. public Range getOverallRange()
  159. {
  160. // Life is easy when we have no footers, headers or unicode!
  161. return new Range( 0, _fib.getFibBase().getFcMac() - _fib.getFibBase().getFcMin(), this );
  162. }
  163. /**
  164. * Use {@link #getOldFontTable()} instead!!!
  165. * This always throws an IllegalArgumentException.
  166. *
  167. * @return nothing
  168. * @throws UnsupportedOperationException
  169. */
  170. @Override
  171. @NotImplemented
  172. public FontTable getFontTable() {
  173. throw new UnsupportedOperationException("Use getOldFontTable instead.");
  174. }
  175. public OldFontTable getOldFontTable() {
  176. return fontTable;
  177. }
  178. public Range getRange()
  179. {
  180. return getOverallRange();
  181. }
  182. public TextPieceTable getTextTable()
  183. {
  184. return tpt;
  185. }
  186. @Override
  187. public StringBuilder getText()
  188. {
  189. return _text;
  190. }
  191. @Override
  192. public void write() throws IOException {
  193. throw new IllegalStateException("Writing is not available for the older file formats");
  194. }
  195. @Override
  196. public void write(File out) throws IOException {
  197. throw new IllegalStateException("Writing is not available for the older file formats");
  198. }
  199. @Override
  200. public void write(OutputStream out) throws IOException {
  201. throw new IllegalStateException("Writing is not available for the older file formats");
  202. }
  203. /**
  204. * As a rough heuristic (total hack), read through the font table
  205. * and take the first non-default, non-ansi, non-symbol
  206. * font's charset and return that.
  207. *
  208. * Once we figure out how to link a font to a text piece, we should
  209. * use the font information per text piece.
  210. *
  211. * @return charset
  212. */
  213. public Charset getGuessedCharset() {
  214. return guessedCharset;
  215. }
  216. }