You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

PAPBinTable.java 15KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469
  1. /* ====================================================================
  2. Licensed to the Apache Software Foundation (ASF) under one or more
  3. contributor license agreements. See the NOTICE file distributed with
  4. this work for additional information regarding copyright ownership.
  5. The ASF licenses this file to You under the Apache License, Version 2.0
  6. (the "License"); you may not use this file except in compliance with
  7. the License. You may obtain a copy of the License at
  8. http://www.apache.org/licenses/LICENSE-2.0
  9. Unless required by applicable law or agreed to in writing, software
  10. distributed under the License is distributed on an "AS IS" BASIS,
  11. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. See the License for the specific language governing permissions and
  13. limitations under the License.
  14. ==================================================================== */
  15. package org.apache.poi.hwpf.model;
  16. import java.io.IOException;
  17. import java.util.ArrayList;
  18. import java.util.Collections;
  19. import java.util.Comparator;
  20. import java.util.IdentityHashMap;
  21. import java.util.LinkedList;
  22. import java.util.List;
  23. import java.util.Map;
  24. import org.apache.poi.hwpf.model.io.HWPFFileSystem;
  25. import org.apache.poi.hwpf.model.io.HWPFOutputStream;
  26. import org.apache.poi.hwpf.sprm.SprmBuffer;
  27. import org.apache.poi.hwpf.sprm.SprmIterator;
  28. import org.apache.poi.hwpf.sprm.SprmOperation;
  29. import org.apache.poi.poifs.common.POIFSConstants;
  30. import org.apache.poi.util.Internal;
  31. import org.apache.poi.util.LittleEndian;
  32. import org.apache.poi.util.POILogFactory;
  33. import org.apache.poi.util.POILogger;
  34. /**
  35. * This class represents the bin table of a Word document, but it also serves
  36. * as a holder for all of the document's paragraphs that have been loaded into
  37. * memory.
  38. *
  39. * @author Ryan Ackley
  40. */
  41. @Internal
  42. public class PAPBinTable
  43. {
  /** Logger shared by every instance of this class. */
  private static final POILogger logger =
      POILogFactory.getLogger( PAPBinTable.class );

  /**
   * The PAPX entries currently held by this table. Starts out empty; the
   * constructors and the rebuild/insert methods of this class populate it.
   */
  protected ArrayList<PAPX> _paragraphs = new ArrayList<PAPX>();

  /**
   * Creates a table that holds no PAPX entries.
   */
  public PAPBinTable() {
    // nothing to load
  }
  /**
   * Legacy constructor that forwards to
   * {@link #PAPBinTable(byte[], byte[], byte[], int, int, CharIndexTranslator)}.
   * The {@code fcMin} argument is accepted only for source compatibility and
   * is not used.
   *
   * @param documentStream forwarded unchanged to the replacement constructor
   * @param tableStream forwarded unchanged to the replacement constructor
   * @param dataStream forwarded unchanged to the replacement constructor
   * @param offset forwarded unchanged to the replacement constructor
   * @param size forwarded unchanged to the replacement constructor
   * @param fcMin ignored
   * @param tpt text piece table, passed on as the character index translator
   * @deprecated Use
   *             {@link #PAPBinTable(byte[], byte[], byte[], int, int, CharIndexTranslator)}
   *             instead
   */
  @Deprecated
  public PAPBinTable( byte[] documentStream, byte[] tableStream,
      byte[] dataStream, int offset, int size, int fcMin,
      TextPieceTable tpt ) {
    this( documentStream, tableStream, dataStream, offset, size, tpt );
  }
  61. public PAPBinTable( byte[] documentStream, byte[] tableStream,
  62. byte[] dataStream, int offset, int size,
  63. CharIndexTranslator charIndexTranslator )
  64. {
  65. long start = System.currentTimeMillis();
  66. {
  67. PlexOfCps binTable = new PlexOfCps( tableStream, offset, size, 4 );
  68. int length = binTable.length();
  69. for ( int x = 0; x < length; x++ )
  70. {
  71. GenericPropertyNode node = binTable.getProperty( x );
  72. int pageNum = LittleEndian.getInt( node.getBytes() );
  73. int pageOffset = POIFSConstants.SMALLER_BIG_BLOCK_SIZE
  74. * pageNum;
  75. PAPFormattedDiskPage pfkp = new PAPFormattedDiskPage(
  76. documentStream, dataStream, pageOffset,
  77. charIndexTranslator );
  78. for ( PAPX papx : pfkp.getPAPXs() )
  79. {
  80. if ( papx != null )
  81. _paragraphs.add( papx );
  82. }
  83. }
  84. }
  85. logger.log( POILogger.DEBUG, "PAPX tables loaded in ",
  86. Long.valueOf( System.currentTimeMillis() - start ), " ms (",
  87. Integer.valueOf( _paragraphs.size() ), " elements)" );
  88. if ( _paragraphs.isEmpty() )
  89. {
  90. logger.log( POILogger.WARN, "PAPX FKPs are empty" );
  91. _paragraphs.add( new PAPX( 0, 0, new SprmBuffer( 2 ) ) );
  92. }
  93. }
  94. public void rebuild( final StringBuilder docText,
  95. ComplexFileTable complexFileTable )
  96. {
  97. rebuild( docText, complexFileTable, _paragraphs );
  98. }
  99. static void rebuild( final StringBuilder docText,
  100. ComplexFileTable complexFileTable, List<PAPX> paragraphs )
  101. {
  102. long start = System.currentTimeMillis();
  103. if ( complexFileTable != null )
  104. {
  105. SprmBuffer[] sprmBuffers = complexFileTable.getGrpprls();
  106. // adding PAPX from fast-saved SPRMs
  107. for ( TextPiece textPiece : complexFileTable.getTextPieceTable()
  108. .getTextPieces() )
  109. {
  110. PropertyModifier prm = textPiece.getPieceDescriptor().getPrm();
  111. if ( !prm.isComplex() )
  112. continue;
  113. int igrpprl = prm.getIgrpprl();
  114. if ( igrpprl < 0 || igrpprl >= sprmBuffers.length )
  115. {
  116. logger.log( POILogger.WARN, textPiece
  117. + "'s PRM references to unknown grpprl" );
  118. continue;
  119. }
  120. boolean hasPap = false;
  121. SprmBuffer sprmBuffer = sprmBuffers[igrpprl];
  122. for ( SprmIterator iterator = sprmBuffer.iterator(); iterator
  123. .hasNext(); )
  124. {
  125. SprmOperation sprmOperation = iterator.next();
  126. if ( sprmOperation.getType() == SprmOperation.TYPE_PAP )
  127. {
  128. hasPap = true;
  129. break;
  130. }
  131. }
  132. if ( hasPap )
  133. {
  134. SprmBuffer newSprmBuffer = new SprmBuffer( 2 );
  135. newSprmBuffer.append( sprmBuffer.toByteArray() );
  136. PAPX papx = new PAPX( textPiece.getStart(),
  137. textPiece.getEnd(), newSprmBuffer );
  138. paragraphs.add( papx );
  139. }
  140. }
  141. logger.log( POILogger.DEBUG,
  142. "Merged (?) with PAPX from complex file table in ",
  143. Long.valueOf( System.currentTimeMillis() - start ),
  144. " ms (", Integer.valueOf( paragraphs.size() ),
  145. " elements in total)" );
  146. start = System.currentTimeMillis();
  147. }
  148. List<PAPX> oldPapxSortedByEndPos = new ArrayList<PAPX>( paragraphs );
  149. Collections.sort( oldPapxSortedByEndPos,
  150. PropertyNode.EndComparator.instance );
  151. logger.log( POILogger.DEBUG, "PAPX sorted by end position in ",
  152. Long.valueOf( System.currentTimeMillis() - start ), " ms" );
  153. start = System.currentTimeMillis();
  154. final Map<PAPX, Integer> papxToFileOrder = new IdentityHashMap<PAPX, Integer>();
  155. {
  156. int counter = 0;
  157. for ( PAPX papx : paragraphs )
  158. {
  159. papxToFileOrder.put( papx, Integer.valueOf( counter++ ) );
  160. }
  161. }
  162. final Comparator<PAPX> papxFileOrderComparator = new Comparator<PAPX>()
  163. {
  164. public int compare( PAPX o1, PAPX o2 )
  165. {
  166. Integer i1 = papxToFileOrder.get( o1 );
  167. Integer i2 = papxToFileOrder.get( o2 );
  168. return i1.compareTo( i2 );
  169. }
  170. };
  171. logger.log( POILogger.DEBUG, "PAPX's order map created in ",
  172. Long.valueOf( System.currentTimeMillis() - start ), " ms" );
  173. start = System.currentTimeMillis();
  174. List<PAPX> newPapxs = new LinkedList<PAPX>();
  175. int lastParStart = 0;
  176. int lastPapxIndex = 0;
  177. for ( int charIndex = 0; charIndex < docText.length(); charIndex++ )
  178. {
  179. final char c = docText.charAt( charIndex );
  180. if ( c != 13 && c != 7 && c != 12 )
  181. continue;
  182. final int startInclusive = lastParStart;
  183. final int endExclusive = charIndex + 1;
  184. boolean broken = false;
  185. List<PAPX> papxs = new LinkedList<PAPX>();
  186. for ( int papxIndex = lastPapxIndex; papxIndex < oldPapxSortedByEndPos
  187. .size(); papxIndex++ )
  188. {
  189. broken = false;
  190. PAPX papx = oldPapxSortedByEndPos.get( papxIndex );
  191. assert startInclusive == 0
  192. || papxIndex + 1 == oldPapxSortedByEndPos.size()
  193. || papx.getEnd() > startInclusive;
  194. if ( papx.getEnd() - 1 > charIndex )
  195. {
  196. lastPapxIndex = papxIndex;
  197. broken = true;
  198. break;
  199. }
  200. papxs.add( papx );
  201. }
  202. if ( !broken )
  203. {
  204. lastPapxIndex = oldPapxSortedByEndPos.size() - 1;
  205. }
  206. if ( papxs.size() == 0 )
  207. {
  208. logger.log( POILogger.WARN, "Paragraph [",
  209. Integer.valueOf( startInclusive ), "; ",
  210. Integer.valueOf( endExclusive ),
  211. ") has no PAPX. Creating new one." );
  212. // create it manually
  213. PAPX papx = new PAPX( startInclusive, endExclusive,
  214. new SprmBuffer( 2 ) );
  215. newPapxs.add( papx );
  216. lastParStart = endExclusive;
  217. continue;
  218. }
  219. if ( papxs.size() == 1 )
  220. {
  221. // can we reuse existing?
  222. PAPX existing = papxs.get( 0 );
  223. if ( existing.getStart() == startInclusive
  224. && existing.getEnd() == endExclusive )
  225. {
  226. newPapxs.add( existing );
  227. lastParStart = endExclusive;
  228. continue;
  229. }
  230. }
  231. // restore file order of PAPX
  232. Collections.sort( papxs, papxFileOrderComparator );
  233. SprmBuffer sprmBuffer = null;
  234. for ( PAPX papx : papxs )
  235. {
  236. if ( papx.getGrpprl() == null || papx.getGrpprl().length == 0 )
  237. continue;
  238. if ( sprmBuffer == null ) {
  239. sprmBuffer = papx.getSprmBuf().clone();
  240. } else {
  241. sprmBuffer.append( papx.getGrpprl(), 2 );
  242. }
  243. }
  244. PAPX newPapx = new PAPX( startInclusive, endExclusive, sprmBuffer );
  245. newPapxs.add( newPapx );
  246. lastParStart = endExclusive;
  247. continue;
  248. }
  249. paragraphs.clear();
  250. paragraphs.addAll( newPapxs );
  251. logger.log( POILogger.DEBUG, "PAPX rebuilded from document text in ",
  252. Long.valueOf( System.currentTimeMillis() - start ), " ms (",
  253. Integer.valueOf( paragraphs.size() ), " elements)" );
  254. start = System.currentTimeMillis();
  255. }
  256. public void insert(int listIndex, int cpStart, SprmBuffer buf)
  257. {
  258. PAPX forInsert = new PAPX(0, 0, buf);
  259. // Ensure character offsets are really characters
  260. forInsert.setStart(cpStart);
  261. forInsert.setEnd(cpStart);
  262. if (listIndex == _paragraphs.size())
  263. {
  264. _paragraphs.add(forInsert);
  265. }
  266. else
  267. {
  268. PAPX currentPap = _paragraphs.get(listIndex);
  269. if (currentPap != null && currentPap.getStart() < cpStart)
  270. {
  271. SprmBuffer clonedBuf = currentPap.getSprmBuf().clone();
  272. // Copy the properties of the one before to afterwards
  273. // Will go:
  274. // Original, until insert at point
  275. // New one
  276. // Clone of original, on to the old end
  277. PAPX clone = new PAPX(0, 0, clonedBuf);
  278. // Again ensure contains character based offsets no matter what
  279. clone.setStart(cpStart);
  280. clone.setEnd(currentPap.getEnd());
  281. currentPap.setEnd(cpStart);
  282. _paragraphs.add(listIndex + 1, forInsert);
  283. _paragraphs.add(listIndex + 2, clone);
  284. }
  285. else
  286. {
  287. _paragraphs.add(listIndex, forInsert);
  288. }
  289. }
  290. }
  291. public void adjustForDelete(int listIndex, int offset, int length)
  292. {
  293. int size = _paragraphs.size();
  294. int endMark = offset + length;
  295. int endIndex = listIndex;
  296. PAPX papx = _paragraphs.get(endIndex);
  297. while (papx.getEnd() < endMark)
  298. {
  299. papx = _paragraphs.get(++endIndex);
  300. }
  301. if (listIndex == endIndex)
  302. {
  303. papx = _paragraphs.get(endIndex);
  304. papx.setEnd((papx.getEnd() - endMark) + offset);
  305. }
  306. else
  307. {
  308. papx = _paragraphs.get(listIndex);
  309. papx.setEnd(offset);
  310. for (int x = listIndex + 1; x < endIndex; x++)
  311. {
  312. papx = _paragraphs.get(x);
  313. papx.setStart(offset);
  314. papx.setEnd(offset);
  315. }
  316. papx = _paragraphs.get(endIndex);
  317. papx.setEnd((papx.getEnd() - endMark) + offset);
  318. }
  319. for (int x = endIndex + 1; x < size; x++)
  320. {
  321. papx = _paragraphs.get(x);
  322. papx.setStart(papx.getStart() - length);
  323. papx.setEnd(papx.getEnd() - length);
  324. }
  325. }
  326. public void adjustForInsert(int listIndex, int length)
  327. {
  328. int size = _paragraphs.size();
  329. PAPX papx = _paragraphs.get(listIndex);
  330. papx.setEnd(papx.getEnd() + length);
  331. for (int x = listIndex + 1; x < size; x++)
  332. {
  333. papx = _paragraphs.get(x);
  334. papx.setStart(papx.getStart() + length);
  335. papx.setEnd(papx.getEnd() + length);
  336. }
  337. }
  338. public ArrayList<PAPX> getParagraphs()
  339. {
  340. return _paragraphs;
  341. }
  342. @Deprecated
  343. public void writeTo( HWPFFileSystem sys, CharIndexTranslator translator )
  344. throws IOException
  345. {
  346. HWPFOutputStream wordDocumentStream = sys.getStream( "WordDocument" );
  347. HWPFOutputStream tableStream = sys.getStream( "1Table" );
  348. writeTo( wordDocumentStream, tableStream, translator );
  349. }
  350. public void writeTo( HWPFOutputStream wordDocumentStream,
  351. HWPFOutputStream tableStream, CharIndexTranslator translator )
  352. throws IOException
  353. {
  354. PlexOfCps binTable = new PlexOfCps(4);
  355. // each FKP must start on a 512 byte page.
  356. int docOffset = wordDocumentStream.getOffset();
  357. int mod = docOffset % POIFSConstants.SMALLER_BIG_BLOCK_SIZE;
  358. if (mod != 0)
  359. {
  360. byte[] padding = new byte[POIFSConstants.SMALLER_BIG_BLOCK_SIZE - mod];
  361. wordDocumentStream.write(padding);
  362. }
  363. // get the page number for the first fkp
  364. docOffset = wordDocumentStream.getOffset();
  365. int pageNum = docOffset/POIFSConstants.SMALLER_BIG_BLOCK_SIZE;
  366. // get the ending fc
  367. // int endingFc = _paragraphs.get(_paragraphs.size() - 1).getEnd();
  368. // endingFc += fcMin;
  369. int endingFc = translator.getByteIndex( _paragraphs.get(
  370. _paragraphs.size() - 1 ).getEnd() );
  371. ArrayList<PAPX> overflow = _paragraphs;
  372. do
  373. {
  374. PAPX startingProp = overflow.get(0);
  375. // int start = startingProp.getStart() + fcMin;
  376. int start = translator.getByteIndex( startingProp.getStart() );
  377. PAPFormattedDiskPage pfkp = new PAPFormattedDiskPage();
  378. pfkp.fill(overflow);
  379. byte[] bufFkp = pfkp.toByteArray(tableStream, translator);
  380. wordDocumentStream.write(bufFkp);
  381. overflow = pfkp.getOverflow();
  382. int end = endingFc;
  383. if (overflow != null)
  384. {
  385. // end = overflow.get(0).getStart() + fcMin;
  386. end = translator.getByteIndex( overflow.get( 0 ).getStart() );
  387. }
  388. byte[] intHolder = new byte[4];
  389. LittleEndian.putInt(intHolder, 0, pageNum++);
  390. binTable.addProperty(new GenericPropertyNode(start, end, intHolder));
  391. }
  392. while (overflow != null);
  393. tableStream.write(binTable.toByteArray());
  394. }
  395. }