You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

SectionTable.java 8.1KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225
  1. /* ====================================================================
  2. Licensed to the Apache Software Foundation (ASF) under one or more
  3. contributor license agreements. See the NOTICE file distributed with
  4. this work for additional information regarding copyright ownership.
  5. The ASF licenses this file to You under the Apache License, Version 2.0
  6. (the "License"); you may not use this file except in compliance with
  7. the License. You may obtain a copy of the License at
  8. http://www.apache.org/licenses/LICENSE-2.0
  9. Unless required by applicable law or agreed to in writing, software
  10. distributed under the License is distributed on an "AS IS" BASIS,
  11. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. See the License for the specific language governing permissions and
  13. limitations under the License.
  14. ==================================================================== */
  15. package org.apache.poi.hwpf.model;
  16. import java.io.ByteArrayOutputStream;
  17. import java.io.IOException;
  18. import java.util.ArrayList;
  19. import java.util.List;
  20. import org.apache.poi.hwpf.model.io.HWPFFileSystem;
  21. import org.apache.poi.util.IOUtils;
  22. import org.apache.poi.util.Internal;
  23. import org.apache.poi.util.LittleEndian;
  24. import org.apache.poi.util.LittleEndianConsts;
  25. import org.apache.poi.util.POILogFactory;
  26. import org.apache.poi.util.POILogger;
  27. @Internal
  28. public class SectionTable
  29. {
  30. //arbitrarily selected; may need to increase
  31. private static final int MAX_RECORD_LENGTH = 100_000;
  32. private final static POILogger _logger = POILogFactory.getLogger(SectionTable.class);
  33. private static final int SED_SIZE = 12;
  34. protected List<SEPX> _sections = new ArrayList<>();
  35. protected List<TextPiece> _text;
  36. /** So we can know if things are unicode or not */
  37. //private TextPieceTable tpt;
  38. public SectionTable()
  39. {
  40. }
  41. public SectionTable(
  42. byte[] documentStream, byte[] tableStream,
  43. int offset, int size, int fcMin, TextPieceTable tpt, int mainLength)
  44. {
  45. PlexOfCps sedPlex = new PlexOfCps(tableStream, offset, size, SED_SIZE);
  46. //this.tpt = tpt;
  47. this._text = tpt.getTextPieces();
  48. int length = sedPlex.length();
  49. for (int x = 0; x < length; x++)
  50. {
  51. GenericPropertyNode node = sedPlex.getProperty(x);
  52. SectionDescriptor sed = new SectionDescriptor(node.getBytes(), 0);
  53. int fileOffset = sed.getFc();
  54. // int startAt = CPtoFC(node.getStart());
  55. // int endAt = CPtoFC(node.getEnd());
  56. int startAt = node.getStart();
  57. int endAt = node.getEnd();
  58. // check for the optimization
  59. if (fileOffset == 0xffffffff)
  60. {
  61. _sections.add(new SEPX(sed, startAt, endAt, new byte[0]));
  62. }
  63. else
  64. {
  65. // The first short at the offset is the size of the grpprl.
  66. int sepxSize = LittleEndian.getShort(documentStream, fileOffset);
  67. byte[] buf = IOUtils.safelyAllocate(sepxSize, MAX_RECORD_LENGTH);
  68. fileOffset += LittleEndianConsts.SHORT_SIZE;
  69. System.arraycopy(documentStream, fileOffset, buf, 0, buf.length);
  70. _sections.add(new SEPX(sed, startAt, endAt, buf));
  71. }
  72. }
  73. // Some files seem to lie about their unicode status, which
  74. // is very very pesky. Try to work around these, but this
  75. // is getting on for black magic...
  76. boolean matchAt = false;
  77. boolean matchHalf = false;
  78. for (SEPX s : _sections) {
  79. if (s.getEnd() == mainLength) {
  80. matchAt = true;
  81. } else if (s.getEnd() == mainLength || s.getEnd() == mainLength - 1) {
  82. matchHalf = true;
  83. }
  84. }
  85. if(! matchAt && matchHalf) {
  86. _logger.log(POILogger.WARN, "Your document seemed to be mostly unicode, but the section definition was in bytes! Trying anyway, but things may well go wrong!");
  87. for(int i=0; i<_sections.size(); i++) {
  88. SEPX s = _sections.get(i);
  89. GenericPropertyNode node = sedPlex.getProperty(i);
  90. // s.setStart( CPtoFC(node.getStart()) );
  91. // s.setEnd( CPtoFC(node.getEnd()) );
  92. int startAt = node.getStart();
  93. int endAt = node.getEnd();
  94. s.setStart( startAt );
  95. s.setEnd( endAt );
  96. }
  97. }
  98. _sections.sort(PropertyNode.StartComparator);
  99. }
  100. public void adjustForInsert(int listIndex, int length)
  101. {
  102. int size = _sections.size();
  103. SEPX sepx = _sections.get(listIndex);
  104. sepx.setEnd(sepx.getEnd() + length);
  105. for (int x = listIndex + 1; x < size; x++)
  106. {
  107. sepx = _sections.get(x);
  108. sepx.setStart(sepx.getStart() + length);
  109. sepx.setEnd(sepx.getEnd() + length);
  110. }
  111. }
  112. // goss version of CPtoFC - this takes into account non-contiguous textpieces
  113. // that we have come across in real world documents. Tests against the example
  114. // code in HWPFDocument show no variation to Ryan's version of the code in
  115. // normal use, but this version works with our non-contiguous test case.
  116. // So far unable to get this test case to be written out as well due to
  117. // other issues. - piers
  118. //
  119. // i'm commenting this out, because it just doesn't work with non-contiguous
  120. // textpieces :( Usual (as for PAPX and CHPX) call to TextPiecesTable does.
  121. // private int CPtoFC(int CP)
  122. // {
  123. // TextPiece TP = null;
  124. //
  125. // for(int i=_text.size()-1; i>-1; i--)
  126. // {
  127. // TP = _text.get(i);
  128. //
  129. // if(CP >= TP.getCP()) break;
  130. // }
  131. // int FC = TP.getPieceDescriptor().getFilePosition();
  132. // int offset = CP - TP.getCP();
  133. // if (TP.isUnicode()) {
  134. // offset = offset*2;
  135. // }
  136. // FC = FC+offset;
  137. // return FC;
  138. // }
  139. public List<SEPX> getSections()
  140. {
  141. return _sections;
  142. }
  143. @Deprecated
  144. public void writeTo( HWPFFileSystem sys, int fcMin ) throws IOException
  145. {
  146. ByteArrayOutputStream docStream = sys.getStream( "WordDocument" );
  147. ByteArrayOutputStream tableStream = sys.getStream( "1Table" );
  148. writeTo( docStream, tableStream );
  149. }
  150. public void writeTo(
  151. ByteArrayOutputStream wordDocumentStream,
  152. ByteArrayOutputStream tableStream ) throws IOException
  153. {
  154. int offset = wordDocumentStream.size();
  155. //int len = _sections.size();
  156. PlexOfCps plex = new PlexOfCps(SED_SIZE);
  157. for (SEPX sepx : _sections) {
  158. byte[] grpprl = sepx.getGrpprl();
  159. // write the sepx to the document stream. starts with a 2 byte size
  160. // followed by the grpprl
  161. byte[] shortBuf = new byte[2];
  162. LittleEndian.putShort(shortBuf, 0, (short) grpprl.length);
  163. wordDocumentStream.write(shortBuf);
  164. wordDocumentStream.write(grpprl);
  165. // set the fc in the section descriptor
  166. SectionDescriptor sed = sepx.getSectionDescriptor();
  167. sed.setFc(offset);
  168. // add the section descriptor bytes to the PlexOfCps.
  169. /* original line */
  170. GenericPropertyNode property = new GenericPropertyNode(
  171. sepx.getStart(), sepx.getEnd(), sed.toByteArray());
  172. /*
  173. * Line using Ryan's FCtoCP() conversion method - unable to observe
  174. * any effect on our testcases when using this code - piers
  175. */
  176. /*
  177. * there is an effect on Bug45743.doc actually. writeoutreadback
  178. * changes byte offset of chars (but preserve string offsets).
  179. * Changing back to original lines - sergey
  180. */
  181. // GenericPropertyNode property = new GenericPropertyNode(
  182. // tpt.getCharIndex( sepx.getStartBytes() ),
  183. // tpt.getCharIndex( sepx.getEndBytes() ), sed.toByteArray() );
  184. plex.addProperty(property);
  185. offset = wordDocumentStream.size();
  186. }
  187. tableStream.write(plex.toByteArray());
  188. }
  189. }