You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

CHPBinTable.java 17KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519
  1. /* ====================================================================
  2. Licensed to the Apache Software Foundation (ASF) under one or more
  3. contributor license agreements. See the NOTICE file distributed with
  4. this work for additional information regarding copyright ownership.
  5. The ASF licenses this file to You under the Apache License, Version 2.0
  6. (the "License"); you may not use this file except in compliance with
  7. the License. You may obtain a copy of the License at
  8. http://www.apache.org/licenses/LICENSE-2.0
  9. Unless required by applicable law or agreed to in writing, software
  10. distributed under the License is distributed on an "AS IS" BASIS,
  11. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. See the License for the specific language governing permissions and
  13. limitations under the License.
  14. ==================================================================== */
  15. package org.apache.poi.hwpf.model;
  16. import java.io.ByteArrayOutputStream;
  17. import java.io.IOException;
  18. import java.util.ArrayList;
  19. import java.util.Arrays;
  20. import java.util.Collections;
  21. import java.util.Comparator;
  22. import java.util.HashSet;
  23. import java.util.IdentityHashMap;
  24. import java.util.Iterator;
  25. import java.util.LinkedList;
  26. import java.util.List;
  27. import java.util.Map;
  28. import java.util.Set;
  29. import org.apache.poi.hwpf.model.io.HWPFFileSystem;
  30. import org.apache.poi.hwpf.sprm.SprmBuffer;
  31. import org.apache.poi.hwpf.sprm.SprmIterator;
  32. import org.apache.poi.hwpf.sprm.SprmOperation;
  33. import org.apache.poi.poifs.common.POIFSConstants;
  34. import org.apache.poi.util.Internal;
  35. import org.apache.poi.util.LittleEndian;
  36. import org.apache.poi.util.POILogFactory;
  37. import org.apache.poi.util.POILogger;
  38. /**
  39. * This class holds all of the character formatting properties.
  40. *
  41. * @author Ryan Ackley
  42. */
  43. @Internal
  44. public class CHPBinTable
  45. {
  46. private static final POILogger logger = POILogFactory
  47. .getLogger( CHPBinTable.class );
  48. /** List of character properties.*/
  49. protected List<CHPX> _textRuns = new ArrayList<>();
  50. public CHPBinTable()
  51. {
  52. }
  53. /**
  54. * Constructor used to read a binTable in from a Word document.
  55. *
  56. * @deprecated Use
  57. * {@link #CHPBinTable(byte[], byte[], int, int, CharIndexTranslator)}
  58. * instead
  59. */
  60. public CHPBinTable( byte[] documentStream, byte[] tableStream, int offset,
  61. int size, int fcMin, TextPieceTable tpt )
  62. {
  63. this( documentStream, tableStream, offset, size, tpt );
  64. }
  65. /**
  66. * Constructor used to read a binTable in from a Word document.
  67. */
  68. public CHPBinTable( byte[] documentStream, byte[] tableStream, int offset,
  69. int size, CharIndexTranslator translator )
  70. {
  71. long start = System.currentTimeMillis();
  72. /*
  73. * Page 35:
  74. *
  75. * "Associated with each interval is a BTE. A BTE holds a four-byte PN
  76. * (page number) which identifies the FKP page in the file which
  77. * contains the formatting information for that interval. A CHPX FKP
  78. * further partitions an interval into runs of exception text."
  79. */
  80. PlexOfCps bte = new PlexOfCps( tableStream, offset, size, 4 );
  81. int length = bte.length();
  82. for (int x = 0; x < length; x++)
  83. {
  84. GenericPropertyNode node = bte.getProperty(x);
  85. int pageNum = LittleEndian.getInt(node.getBytes());
  86. int pageOffset = POIFSConstants.SMALLER_BIG_BLOCK_SIZE * pageNum;
  87. CHPFormattedDiskPage cfkp = new CHPFormattedDiskPage(documentStream,
  88. pageOffset, translator);
  89. for ( CHPX chpx : cfkp.getCHPXs() )
  90. {
  91. if ( chpx != null )
  92. _textRuns.add( chpx );
  93. }
  94. }
  95. logger.log( POILogger.DEBUG, "CHPX FKPs loaded in ",
  96. Long.valueOf( System.currentTimeMillis() - start ), " ms (",
  97. Integer.valueOf( _textRuns.size() ), " elements)" );
  98. if ( _textRuns.isEmpty() )
  99. {
  100. logger.log( POILogger.WARN, "CHPX FKPs are empty" );
  101. _textRuns.add( new CHPX( 0, 0, new SprmBuffer( 0 ) ) );
  102. }
  103. }
  104. public void rebuild( ComplexFileTable complexFileTable )
  105. {
  106. long start = System.currentTimeMillis();
  107. if ( complexFileTable != null )
  108. {
  109. SprmBuffer[] sprmBuffers = complexFileTable.getGrpprls();
  110. // adding CHPX from fast-saved SPRMs
  111. for ( TextPiece textPiece : complexFileTable.getTextPieceTable()
  112. .getTextPieces() )
  113. {
  114. PropertyModifier prm = textPiece.getPieceDescriptor().getPrm();
  115. if ( !prm.isComplex() )
  116. continue;
  117. int igrpprl = prm.getIgrpprl();
  118. if ( igrpprl < 0 || igrpprl >= sprmBuffers.length )
  119. {
  120. logger.log( POILogger.WARN, textPiece
  121. + "'s PRM references to unknown grpprl" );
  122. continue;
  123. }
  124. boolean hasChp = false;
  125. SprmBuffer sprmBuffer = sprmBuffers[igrpprl];
  126. for ( SprmIterator iterator = sprmBuffer.iterator(); iterator
  127. .hasNext(); )
  128. {
  129. SprmOperation sprmOperation = iterator.next();
  130. if ( sprmOperation.getType() == SprmOperation.TYPE_CHP )
  131. {
  132. hasChp = true;
  133. break;
  134. }
  135. }
  136. if ( hasChp )
  137. {
  138. SprmBuffer newSprmBuffer = sprmBuffer.clone();
  139. CHPX chpx = new CHPX( textPiece.getStart(),
  140. textPiece.getEnd(), newSprmBuffer );
  141. _textRuns.add( chpx );
  142. }
  143. }
  144. logger.log( POILogger.DEBUG,
  145. "Merged with CHPX from complex file table in ",
  146. Long.valueOf( System.currentTimeMillis() - start ),
  147. " ms (", Integer.valueOf( _textRuns.size() ),
  148. " elements in total)" );
  149. start = System.currentTimeMillis();
  150. }
  151. List<CHPX> oldChpxSortedByStartPos = new ArrayList<>(_textRuns);
  152. oldChpxSortedByStartPos.sort(PropertyNode.StartComparator);
  153. logger.log( POILogger.DEBUG, "CHPX sorted by start position in ",
  154. Long.valueOf( System.currentTimeMillis() - start ), " ms" );
  155. start = System.currentTimeMillis();
  156. final Map<CHPX, Integer> chpxToFileOrder = new IdentityHashMap<>();
  157. {
  158. int counter = 0;
  159. for ( CHPX chpx : _textRuns )
  160. {
  161. chpxToFileOrder.put( chpx, Integer.valueOf( counter++ ) );
  162. }
  163. }
  164. final Comparator<CHPX> chpxFileOrderComparator = new Comparator<CHPX>()
  165. {
  166. public int compare( CHPX o1, CHPX o2 )
  167. {
  168. Integer i1 = chpxToFileOrder.get( o1 );
  169. Integer i2 = chpxToFileOrder.get( o2 );
  170. return i1.compareTo( i2 );
  171. }
  172. };
  173. logger.log( POILogger.DEBUG, "CHPX's order map created in ",
  174. Long.valueOf( System.currentTimeMillis() - start ), " ms" );
  175. start = System.currentTimeMillis();
  176. List<Integer> textRunsBoundariesList;
  177. {
  178. Set<Integer> textRunsBoundariesSet = new HashSet<>();
  179. for ( CHPX chpx : _textRuns )
  180. {
  181. textRunsBoundariesSet.add( Integer.valueOf( chpx.getStart() ) );
  182. textRunsBoundariesSet.add( Integer.valueOf( chpx.getEnd() ) );
  183. }
  184. textRunsBoundariesSet.remove( Integer.valueOf( 0 ) );
  185. textRunsBoundariesList = new ArrayList<>(
  186. textRunsBoundariesSet);
  187. Collections.sort( textRunsBoundariesList );
  188. }
  189. logger.log( POILogger.DEBUG, "Texts CHPX boundaries collected in ",
  190. Long.valueOf( System.currentTimeMillis() - start ), " ms" );
  191. start = System.currentTimeMillis();
  192. List<CHPX> newChpxs = new LinkedList<>();
  193. int lastTextRunStart = 0;
  194. for ( Integer objBoundary : textRunsBoundariesList )
  195. {
  196. final int boundary = objBoundary.intValue();
  197. final int startInclusive = lastTextRunStart;
  198. lastTextRunStart = boundary;
  199. int startPosition = binarySearch( oldChpxSortedByStartPos, boundary );
  200. startPosition = Math.abs( startPosition );
  201. while ( startPosition >= oldChpxSortedByStartPos.size() )
  202. startPosition--;
  203. while ( startPosition > 0
  204. && oldChpxSortedByStartPos.get( startPosition ).getStart() >= boundary )
  205. startPosition--;
  206. List<CHPX> chpxs = new LinkedList<>();
  207. for ( int c = startPosition; c < oldChpxSortedByStartPos.size(); c++ )
  208. {
  209. CHPX chpx = oldChpxSortedByStartPos.get( c );
  210. if ( boundary < chpx.getStart() )
  211. break;
  212. int left = Math.max( startInclusive, chpx.getStart() );
  213. int right = Math.min(boundary, chpx.getEnd() );
  214. if ( left < right )
  215. {
  216. chpxs.add( chpx );
  217. }
  218. }
  219. if ( chpxs.size() == 0 )
  220. {
  221. logger.log( POILogger.WARN, "Text piece [",
  222. Integer.valueOf( startInclusive ), "; ",
  223. Integer.valueOf(boundary),
  224. ") has no CHPX. Creating new one." );
  225. // create it manually
  226. CHPX chpx = new CHPX( startInclusive, boundary,
  227. new SprmBuffer( 0 ) );
  228. newChpxs.add( chpx );
  229. continue;
  230. }
  231. if ( chpxs.size() == 1 )
  232. {
  233. // can we reuse existing?
  234. CHPX existing = chpxs.get( 0 );
  235. if ( existing.getStart() == startInclusive
  236. && existing.getEnd() == boundary)
  237. {
  238. newChpxs.add( existing );
  239. continue;
  240. }
  241. }
  242. chpxs.sort(chpxFileOrderComparator);
  243. SprmBuffer sprmBuffer = new SprmBuffer( 0 );
  244. for ( CHPX chpx : chpxs )
  245. {
  246. sprmBuffer.append( chpx.getGrpprl(), 0 );
  247. }
  248. CHPX newChpx = new CHPX( startInclusive, boundary, sprmBuffer );
  249. newChpxs.add( newChpx );
  250. continue;
  251. }
  252. this._textRuns = new ArrayList<>(newChpxs);
  253. logger.log( POILogger.DEBUG, "CHPX rebuilded in ",
  254. Long.valueOf( System.currentTimeMillis() - start ), " ms (",
  255. Integer.valueOf( _textRuns.size() ), " elements)" );
  256. start = System.currentTimeMillis();
  257. CHPX previous = null;
  258. for ( Iterator<CHPX> iterator = _textRuns.iterator(); iterator
  259. .hasNext(); )
  260. {
  261. CHPX current = iterator.next();
  262. if ( previous == null )
  263. {
  264. previous = current;
  265. continue;
  266. }
  267. if ( previous.getEnd() == current.getStart()
  268. && Arrays
  269. .equals( previous.getGrpprl(), current.getGrpprl() ) )
  270. {
  271. previous.setEnd( current.getEnd() );
  272. iterator.remove();
  273. continue;
  274. }
  275. previous = current;
  276. }
  277. logger.log( POILogger.DEBUG, "CHPX compacted in ",
  278. Long.valueOf( System.currentTimeMillis() - start ), " ms (",
  279. Integer.valueOf( _textRuns.size() ), " elements)" );
  280. }
  281. private static int binarySearch( List<CHPX> chpxs, int startPosition )
  282. {
  283. int low = 0;
  284. int high = chpxs.size() - 1;
  285. while ( low <= high )
  286. {
  287. int mid = ( low + high ) >>> 1;
  288. CHPX midVal = chpxs.get( mid );
  289. int midValue = midVal.getStart();
  290. if ( midValue < startPosition )
  291. low = mid + 1;
  292. else if ( midValue > startPosition )
  293. high = mid - 1;
  294. else
  295. return mid; // key found
  296. }
  297. return -( low + 1 ); // key not found.
  298. }
  299. public void adjustForDelete(int listIndex, int offset, int length)
  300. {
  301. int size = _textRuns.size();
  302. int endMark = offset + length;
  303. int endIndex = listIndex;
  304. CHPX chpx = _textRuns.get(endIndex);
  305. while (chpx.getEnd() < endMark)
  306. {
  307. chpx = _textRuns.get(++endIndex);
  308. }
  309. if (listIndex == endIndex)
  310. {
  311. chpx = _textRuns.get(endIndex);
  312. chpx.setEnd((chpx.getEnd() - endMark) + offset);
  313. }
  314. else
  315. {
  316. chpx = _textRuns.get(listIndex);
  317. chpx.setEnd(offset);
  318. for (int x = listIndex + 1; x < endIndex; x++)
  319. {
  320. chpx = _textRuns.get(x);
  321. chpx.setStart(offset);
  322. chpx.setEnd(offset);
  323. }
  324. chpx = _textRuns.get(endIndex);
  325. chpx.setEnd((chpx.getEnd() - endMark) + offset);
  326. }
  327. for (int x = endIndex + 1; x < size; x++)
  328. {
  329. chpx = _textRuns.get(x);
  330. chpx.setStart(chpx.getStart() - length);
  331. chpx.setEnd(chpx.getEnd() - length);
  332. }
  333. }
  334. public void insert(int listIndex, int cpStart, SprmBuffer buf)
  335. {
  336. CHPX insertChpx = new CHPX(0, 0, buf);
  337. // Ensure character offsets are really characters
  338. insertChpx.setStart(cpStart);
  339. insertChpx.setEnd(cpStart);
  340. if (listIndex == _textRuns.size())
  341. {
  342. _textRuns.add(insertChpx);
  343. }
  344. else
  345. {
  346. CHPX chpx = _textRuns.get(listIndex);
  347. if (chpx.getStart() < cpStart)
  348. {
  349. // Copy the properties of the one before to afterwards
  350. // Will go:
  351. // Original, until insert at point
  352. // New one
  353. // Clone of original, on to the old end
  354. CHPX clone = new CHPX(0, 0, chpx.getSprmBuf());
  355. // Again ensure contains character based offsets no matter what
  356. clone.setStart(cpStart);
  357. clone.setEnd(chpx.getEnd());
  358. chpx.setEnd(cpStart);
  359. _textRuns.add(listIndex + 1, insertChpx);
  360. _textRuns.add(listIndex + 2, clone);
  361. }
  362. else
  363. {
  364. _textRuns.add(listIndex, insertChpx);
  365. }
  366. }
  367. }
  368. public void adjustForInsert(int listIndex, int length)
  369. {
  370. int size = _textRuns.size();
  371. CHPX chpx = _textRuns.get(listIndex);
  372. chpx.setEnd(chpx.getEnd() + length);
  373. for (int x = listIndex + 1; x < size; x++)
  374. {
  375. chpx = _textRuns.get(x);
  376. chpx.setStart(chpx.getStart() + length);
  377. chpx.setEnd(chpx.getEnd() + length);
  378. }
  379. }
  380. public List<CHPX> getTextRuns()
  381. {
  382. return _textRuns;
  383. }
  384. @Deprecated
  385. public void writeTo( HWPFFileSystem sys, int fcMin,
  386. CharIndexTranslator translator ) throws IOException
  387. {
  388. ByteArrayOutputStream docStream = sys.getStream( "WordDocument" );
  389. ByteArrayOutputStream tableStream = sys.getStream( "1Table" );
  390. writeTo( docStream, tableStream, fcMin, translator );
  391. }
  392. public void writeTo( ByteArrayOutputStream wordDocumentStream,
  393. ByteArrayOutputStream tableStream, int fcMin,
  394. CharIndexTranslator translator ) throws IOException
  395. {
  396. /*
  397. * Page 35:
  398. *
  399. * "Associated with each interval is a BTE. A BTE holds a four-byte PN
  400. * (page number) which identifies the FKP page in the file which
  401. * contains the formatting information for that interval. A CHPX FKP
  402. * further partitions an interval into runs of exception text."
  403. */
  404. PlexOfCps bte = new PlexOfCps( 4 );
  405. // each FKP must start on a 512 byte page.
  406. int docOffset = wordDocumentStream.size();
  407. int mod = docOffset % POIFSConstants.SMALLER_BIG_BLOCK_SIZE;
  408. if (mod != 0)
  409. {
  410. byte[] padding = new byte[POIFSConstants.SMALLER_BIG_BLOCK_SIZE - mod];
  411. wordDocumentStream.write(padding);
  412. }
  413. // get the page number for the first fkp
  414. docOffset = wordDocumentStream.size();
  415. int pageNum = docOffset/POIFSConstants.SMALLER_BIG_BLOCK_SIZE;
  416. // get the ending fc
  417. // CHPX lastRun = _textRuns.get(_textRuns.size() - 1);
  418. // int endingFc = lastRun.getEnd();
  419. // endingFc += fcMin;
  420. int endingFc = translator.getByteIndex( _textRuns.get(
  421. _textRuns.size() - 1 ).getEnd() );
  422. List<CHPX> overflow = _textRuns;
  423. do
  424. {
  425. CHPX startingProp = overflow.get(0);
  426. // int start = startingProp.getStart() + fcMin;
  427. int start = translator.getByteIndex( startingProp.getStart() );
  428. CHPFormattedDiskPage cfkp = new CHPFormattedDiskPage();
  429. cfkp.fill(overflow);
  430. byte[] bufFkp = cfkp.toByteArray( translator );
  431. wordDocumentStream.write(bufFkp);
  432. overflow = cfkp.getOverflow();
  433. int end = endingFc;
  434. if (overflow != null)
  435. {
  436. // end = overflow.get(0).getStart() + fcMin;
  437. end = translator.getByteIndex( overflow.get( 0 ).getStart() );
  438. }
  439. byte[] intHolder = new byte[4];
  440. LittleEndian.putInt(intHolder, 0, pageNum++);
  441. bte.addProperty(new GenericPropertyNode(start, end, intHolder));
  442. }
  443. while (overflow != null);
  444. tableStream.write(bte.toByteArray());
  445. }
  446. }