You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

CHPBinTable.java 17KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521
  1. /* ====================================================================
  2. Licensed to the Apache Software Foundation (ASF) under one or more
  3. contributor license agreements. See the NOTICE file distributed with
  4. this work for additional information regarding copyright ownership.
  5. The ASF licenses this file to You under the Apache License, Version 2.0
  6. (the "License"); you may not use this file except in compliance with
  7. the License. You may obtain a copy of the License at
  8. http://www.apache.org/licenses/LICENSE-2.0
  9. Unless required by applicable law or agreed to in writing, software
  10. distributed under the License is distributed on an "AS IS" BASIS,
  11. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. See the License for the specific language governing permissions and
  13. limitations under the License.
  14. ==================================================================== */
  15. package org.apache.poi.hwpf.model;
  16. import java.io.ByteArrayOutputStream;
  17. import java.io.IOException;
  18. import java.util.ArrayList;
  19. import java.util.Arrays;
  20. import java.util.Collections;
  21. import java.util.Comparator;
  22. import java.util.HashSet;
  23. import java.util.IdentityHashMap;
  24. import java.util.Iterator;
  25. import java.util.LinkedList;
  26. import java.util.List;
  27. import java.util.Map;
  28. import java.util.Set;
  29. import org.apache.poi.hwpf.model.io.HWPFFileSystem;
  30. import org.apache.poi.hwpf.sprm.SprmBuffer;
  31. import org.apache.poi.hwpf.sprm.SprmIterator;
  32. import org.apache.poi.hwpf.sprm.SprmOperation;
  33. import org.apache.poi.poifs.common.POIFSConstants;
  34. import org.apache.poi.util.Internal;
  35. import org.apache.poi.util.LittleEndian;
  36. import org.apache.poi.util.POILogFactory;
  37. import org.apache.poi.util.POILogger;
  38. /**
  39. * This class holds all of the character formatting properties.
  40. *
  41. * @author Ryan Ackley
  42. */
  43. @Internal
  44. public class CHPBinTable
  45. {
  46. private static final POILogger logger = POILogFactory
  47. .getLogger( CHPBinTable.class );
  48. /** List of character properties.*/
  49. protected List<CHPX> _textRuns = new ArrayList<CHPX>();
  50. public CHPBinTable()
  51. {
  52. }
  53. /**
  54. * Constructor used to read a binTable in from a Word document.
  55. *
  56. * @deprecated Use
  57. * {@link #CHPBinTable(byte[], byte[], int, int, CharIndexTranslator)}
  58. * instead
  59. */
  60. public CHPBinTable( byte[] documentStream, byte[] tableStream, int offset,
  61. int size, int fcMin, TextPieceTable tpt )
  62. {
  63. this( documentStream, tableStream, offset, size, tpt );
  64. }
  65. /**
  66. * Constructor used to read a binTable in from a Word document.
  67. */
  68. public CHPBinTable( byte[] documentStream, byte[] tableStream, int offset,
  69. int size, CharIndexTranslator translator )
  70. {
  71. long start = System.currentTimeMillis();
  72. /*
  73. * Page 35:
  74. *
  75. * "Associated with each interval is a BTE. A BTE holds a four-byte PN
  76. * (page number) which identifies the FKP page in the file which
  77. * contains the formatting information for that interval. A CHPX FKP
  78. * further partitions an interval into runs of exception text."
  79. */
  80. PlexOfCps bte = new PlexOfCps( tableStream, offset, size, 4 );
  81. int length = bte.length();
  82. for (int x = 0; x < length; x++)
  83. {
  84. GenericPropertyNode node = bte.getProperty(x);
  85. int pageNum = LittleEndian.getInt(node.getBytes());
  86. int pageOffset = POIFSConstants.SMALLER_BIG_BLOCK_SIZE * pageNum;
  87. CHPFormattedDiskPage cfkp = new CHPFormattedDiskPage(documentStream,
  88. pageOffset, translator);
  89. for ( CHPX chpx : cfkp.getCHPXs() )
  90. {
  91. if ( chpx != null )
  92. _textRuns.add( chpx );
  93. }
  94. }
  95. logger.log( POILogger.DEBUG, "CHPX FKPs loaded in ",
  96. Long.valueOf( System.currentTimeMillis() - start ), " ms (",
  97. Integer.valueOf( _textRuns.size() ), " elements)" );
  98. if ( _textRuns.isEmpty() )
  99. {
  100. logger.log( POILogger.WARN, "CHPX FKPs are empty" );
  101. _textRuns.add( new CHPX( 0, 0, new SprmBuffer( 0 ) ) );
  102. }
  103. }
  104. public void rebuild( ComplexFileTable complexFileTable )
  105. {
  106. long start = System.currentTimeMillis();
  107. if ( complexFileTable != null )
  108. {
  109. SprmBuffer[] sprmBuffers = complexFileTable.getGrpprls();
  110. // adding CHPX from fast-saved SPRMs
  111. for ( TextPiece textPiece : complexFileTable.getTextPieceTable()
  112. .getTextPieces() )
  113. {
  114. PropertyModifier prm = textPiece.getPieceDescriptor().getPrm();
  115. if ( !prm.isComplex() )
  116. continue;
  117. int igrpprl = prm.getIgrpprl();
  118. if ( igrpprl < 0 || igrpprl >= sprmBuffers.length )
  119. {
  120. logger.log( POILogger.WARN, textPiece
  121. + "'s PRM references to unknown grpprl" );
  122. continue;
  123. }
  124. boolean hasChp = false;
  125. SprmBuffer sprmBuffer = sprmBuffers[igrpprl];
  126. for ( SprmIterator iterator = sprmBuffer.iterator(); iterator
  127. .hasNext(); )
  128. {
  129. SprmOperation sprmOperation = iterator.next();
  130. if ( sprmOperation.getType() == SprmOperation.TYPE_CHP )
  131. {
  132. hasChp = true;
  133. break;
  134. }
  135. }
  136. if ( hasChp )
  137. {
  138. SprmBuffer newSprmBuffer = sprmBuffer.clone();
  139. CHPX chpx = new CHPX( textPiece.getStart(),
  140. textPiece.getEnd(), newSprmBuffer );
  141. _textRuns.add( chpx );
  142. }
  143. }
  144. logger.log( POILogger.DEBUG,
  145. "Merged with CHPX from complex file table in ",
  146. Long.valueOf( System.currentTimeMillis() - start ),
  147. " ms (", Integer.valueOf( _textRuns.size() ),
  148. " elements in total)" );
  149. start = System.currentTimeMillis();
  150. }
  151. List<CHPX> oldChpxSortedByStartPos = new ArrayList<CHPX>( _textRuns );
  152. Collections.sort( oldChpxSortedByStartPos,
  153. PropertyNode.StartComparator.instance );
  154. logger.log( POILogger.DEBUG, "CHPX sorted by start position in ",
  155. Long.valueOf( System.currentTimeMillis() - start ), " ms" );
  156. start = System.currentTimeMillis();
  157. final Map<CHPX, Integer> chpxToFileOrder = new IdentityHashMap<CHPX, Integer>();
  158. {
  159. int counter = 0;
  160. for ( CHPX chpx : _textRuns )
  161. {
  162. chpxToFileOrder.put( chpx, Integer.valueOf( counter++ ) );
  163. }
  164. }
  165. final Comparator<CHPX> chpxFileOrderComparator = new Comparator<CHPX>()
  166. {
  167. public int compare( CHPX o1, CHPX o2 )
  168. {
  169. Integer i1 = chpxToFileOrder.get( o1 );
  170. Integer i2 = chpxToFileOrder.get( o2 );
  171. return i1.compareTo( i2 );
  172. }
  173. };
  174. logger.log( POILogger.DEBUG, "CHPX's order map created in ",
  175. Long.valueOf( System.currentTimeMillis() - start ), " ms" );
  176. start = System.currentTimeMillis();
  177. List<Integer> textRunsBoundariesList;
  178. {
  179. Set<Integer> textRunsBoundariesSet = new HashSet<Integer>();
  180. for ( CHPX chpx : _textRuns )
  181. {
  182. textRunsBoundariesSet.add( Integer.valueOf( chpx.getStart() ) );
  183. textRunsBoundariesSet.add( Integer.valueOf( chpx.getEnd() ) );
  184. }
  185. textRunsBoundariesSet.remove( Integer.valueOf( 0 ) );
  186. textRunsBoundariesList = new ArrayList<Integer>(
  187. textRunsBoundariesSet );
  188. Collections.sort( textRunsBoundariesList );
  189. }
  190. logger.log( POILogger.DEBUG, "Texts CHPX boundaries collected in ",
  191. Long.valueOf( System.currentTimeMillis() - start ), " ms" );
  192. start = System.currentTimeMillis();
  193. List<CHPX> newChpxs = new LinkedList<CHPX>();
  194. int lastTextRunStart = 0;
  195. for ( Integer objBoundary : textRunsBoundariesList )
  196. {
  197. final int boundary = objBoundary.intValue();
  198. final int startInclusive = lastTextRunStart;
  199. final int endExclusive = boundary;
  200. lastTextRunStart = endExclusive;
  201. int startPosition = binarySearch( oldChpxSortedByStartPos, boundary );
  202. startPosition = Math.abs( startPosition );
  203. while ( startPosition >= oldChpxSortedByStartPos.size() )
  204. startPosition--;
  205. while ( startPosition > 0
  206. && oldChpxSortedByStartPos.get( startPosition ).getStart() >= boundary )
  207. startPosition--;
  208. List<CHPX> chpxs = new LinkedList<CHPX>();
  209. for ( int c = startPosition; c < oldChpxSortedByStartPos.size(); c++ )
  210. {
  211. CHPX chpx = oldChpxSortedByStartPos.get( c );
  212. if ( boundary < chpx.getStart() )
  213. break;
  214. int left = Math.max( startInclusive, chpx.getStart() );
  215. int right = Math.min( endExclusive, chpx.getEnd() );
  216. if ( left < right )
  217. {
  218. chpxs.add( chpx );
  219. }
  220. }
  221. if ( chpxs.size() == 0 )
  222. {
  223. logger.log( POILogger.WARN, "Text piece [",
  224. Integer.valueOf( startInclusive ), "; ",
  225. Integer.valueOf( endExclusive ),
  226. ") has no CHPX. Creating new one." );
  227. // create it manually
  228. CHPX chpx = new CHPX( startInclusive, endExclusive,
  229. new SprmBuffer( 0 ) );
  230. newChpxs.add( chpx );
  231. continue;
  232. }
  233. if ( chpxs.size() == 1 )
  234. {
  235. // can we reuse existing?
  236. CHPX existing = chpxs.get( 0 );
  237. if ( existing.getStart() == startInclusive
  238. && existing.getEnd() == endExclusive )
  239. {
  240. newChpxs.add( existing );
  241. continue;
  242. }
  243. }
  244. Collections.sort( chpxs, chpxFileOrderComparator );
  245. SprmBuffer sprmBuffer = new SprmBuffer( 0 );
  246. for ( CHPX chpx : chpxs )
  247. {
  248. sprmBuffer.append( chpx.getGrpprl(), 0 );
  249. }
  250. CHPX newChpx = new CHPX( startInclusive, endExclusive, sprmBuffer );
  251. newChpxs.add( newChpx );
  252. continue;
  253. }
  254. this._textRuns = new ArrayList<CHPX>( newChpxs );
  255. logger.log( POILogger.DEBUG, "CHPX rebuilded in ",
  256. Long.valueOf( System.currentTimeMillis() - start ), " ms (",
  257. Integer.valueOf( _textRuns.size() ), " elements)" );
  258. start = System.currentTimeMillis();
  259. CHPX previous = null;
  260. for ( Iterator<CHPX> iterator = _textRuns.iterator(); iterator
  261. .hasNext(); )
  262. {
  263. CHPX current = iterator.next();
  264. if ( previous == null )
  265. {
  266. previous = current;
  267. continue;
  268. }
  269. if ( previous.getEnd() == current.getStart()
  270. && Arrays
  271. .equals( previous.getGrpprl(), current.getGrpprl() ) )
  272. {
  273. previous.setEnd( current.getEnd() );
  274. iterator.remove();
  275. continue;
  276. }
  277. previous = current;
  278. }
  279. logger.log( POILogger.DEBUG, "CHPX compacted in ",
  280. Long.valueOf( System.currentTimeMillis() - start ), " ms (",
  281. Integer.valueOf( _textRuns.size() ), " elements)" );
  282. }
  283. private static int binarySearch( List<CHPX> chpxs, int startPosition )
  284. {
  285. int low = 0;
  286. int high = chpxs.size() - 1;
  287. while ( low <= high )
  288. {
  289. int mid = ( low + high ) >>> 1;
  290. CHPX midVal = chpxs.get( mid );
  291. int midValue = midVal.getStart();
  292. if ( midValue < startPosition )
  293. low = mid + 1;
  294. else if ( midValue > startPosition )
  295. high = mid - 1;
  296. else
  297. return mid; // key found
  298. }
  299. return -( low + 1 ); // key not found.
  300. }
  301. public void adjustForDelete(int listIndex, int offset, int length)
  302. {
  303. int size = _textRuns.size();
  304. int endMark = offset + length;
  305. int endIndex = listIndex;
  306. CHPX chpx = _textRuns.get(endIndex);
  307. while (chpx.getEnd() < endMark)
  308. {
  309. chpx = _textRuns.get(++endIndex);
  310. }
  311. if (listIndex == endIndex)
  312. {
  313. chpx = _textRuns.get(endIndex);
  314. chpx.setEnd((chpx.getEnd() - endMark) + offset);
  315. }
  316. else
  317. {
  318. chpx = _textRuns.get(listIndex);
  319. chpx.setEnd(offset);
  320. for (int x = listIndex + 1; x < endIndex; x++)
  321. {
  322. chpx = _textRuns.get(x);
  323. chpx.setStart(offset);
  324. chpx.setEnd(offset);
  325. }
  326. chpx = _textRuns.get(endIndex);
  327. chpx.setEnd((chpx.getEnd() - endMark) + offset);
  328. }
  329. for (int x = endIndex + 1; x < size; x++)
  330. {
  331. chpx = _textRuns.get(x);
  332. chpx.setStart(chpx.getStart() - length);
  333. chpx.setEnd(chpx.getEnd() - length);
  334. }
  335. }
  336. public void insert(int listIndex, int cpStart, SprmBuffer buf)
  337. {
  338. CHPX insertChpx = new CHPX(0, 0, buf);
  339. // Ensure character offsets are really characters
  340. insertChpx.setStart(cpStart);
  341. insertChpx.setEnd(cpStart);
  342. if (listIndex == _textRuns.size())
  343. {
  344. _textRuns.add(insertChpx);
  345. }
  346. else
  347. {
  348. CHPX chpx = _textRuns.get(listIndex);
  349. if (chpx.getStart() < cpStart)
  350. {
  351. // Copy the properties of the one before to afterwards
  352. // Will go:
  353. // Original, until insert at point
  354. // New one
  355. // Clone of original, on to the old end
  356. CHPX clone = new CHPX(0, 0, chpx.getSprmBuf());
  357. // Again ensure contains character based offsets no matter what
  358. clone.setStart(cpStart);
  359. clone.setEnd(chpx.getEnd());
  360. chpx.setEnd(cpStart);
  361. _textRuns.add(listIndex + 1, insertChpx);
  362. _textRuns.add(listIndex + 2, clone);
  363. }
  364. else
  365. {
  366. _textRuns.add(listIndex, insertChpx);
  367. }
  368. }
  369. }
  370. public void adjustForInsert(int listIndex, int length)
  371. {
  372. int size = _textRuns.size();
  373. CHPX chpx = _textRuns.get(listIndex);
  374. chpx.setEnd(chpx.getEnd() + length);
  375. for (int x = listIndex + 1; x < size; x++)
  376. {
  377. chpx = _textRuns.get(x);
  378. chpx.setStart(chpx.getStart() + length);
  379. chpx.setEnd(chpx.getEnd() + length);
  380. }
  381. }
  382. public List<CHPX> getTextRuns()
  383. {
  384. return _textRuns;
  385. }
  386. @Deprecated
  387. public void writeTo( HWPFFileSystem sys, int fcMin,
  388. CharIndexTranslator translator ) throws IOException
  389. {
  390. ByteArrayOutputStream docStream = sys.getStream( "WordDocument" );
  391. ByteArrayOutputStream tableStream = sys.getStream( "1Table" );
  392. writeTo( docStream, tableStream, fcMin, translator );
  393. }
  394. public void writeTo( ByteArrayOutputStream wordDocumentStream,
  395. ByteArrayOutputStream tableStream, int fcMin,
  396. CharIndexTranslator translator ) throws IOException
  397. {
  398. /*
  399. * Page 35:
  400. *
  401. * "Associated with each interval is a BTE. A BTE holds a four-byte PN
  402. * (page number) which identifies the FKP page in the file which
  403. * contains the formatting information for that interval. A CHPX FKP
  404. * further partitions an interval into runs of exception text."
  405. */
  406. PlexOfCps bte = new PlexOfCps( 4 );
  407. // each FKP must start on a 512 byte page.
  408. int docOffset = wordDocumentStream.size();
  409. int mod = docOffset % POIFSConstants.SMALLER_BIG_BLOCK_SIZE;
  410. if (mod != 0)
  411. {
  412. byte[] padding = new byte[POIFSConstants.SMALLER_BIG_BLOCK_SIZE - mod];
  413. wordDocumentStream.write(padding);
  414. }
  415. // get the page number for the first fkp
  416. docOffset = wordDocumentStream.size();
  417. int pageNum = docOffset/POIFSConstants.SMALLER_BIG_BLOCK_SIZE;
  418. // get the ending fc
  419. // CHPX lastRun = _textRuns.get(_textRuns.size() - 1);
  420. // int endingFc = lastRun.getEnd();
  421. // endingFc += fcMin;
  422. int endingFc = translator.getByteIndex( _textRuns.get(
  423. _textRuns.size() - 1 ).getEnd() );
  424. List<CHPX> overflow = _textRuns;
  425. do
  426. {
  427. CHPX startingProp = overflow.get(0);
  428. // int start = startingProp.getStart() + fcMin;
  429. int start = translator.getByteIndex( startingProp.getStart() );
  430. CHPFormattedDiskPage cfkp = new CHPFormattedDiskPage();
  431. cfkp.fill(overflow);
  432. byte[] bufFkp = cfkp.toByteArray( translator );
  433. wordDocumentStream.write(bufFkp);
  434. overflow = cfkp.getOverflow();
  435. int end = endingFc;
  436. if (overflow != null)
  437. {
  438. // end = overflow.get(0).getStart() + fcMin;
  439. end = translator.getByteIndex( overflow.get( 0 ).getStart() );
  440. }
  441. byte[] intHolder = new byte[4];
  442. LittleEndian.putInt(intHolder, 0, pageNum++);
  443. bte.addProperty(new GenericPropertyNode(start, end, intHolder));
  444. }
  445. while (overflow != null);
  446. tableStream.write(bte.toByteArray());
  447. }
  448. }