You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

HSSFOptimiser.java 13KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359
  1. /* ====================================================================
  2. Licensed to the Apache Software Foundation (ASF) under one or more
  3. contributor license agreements. See the NOTICE file distributed with
  4. this work for additional information regarding copyright ownership.
  5. The ASF licenses this file to You under the Apache License, Version 2.0
  6. (the "License"); you may not use this file except in compliance with
  7. the License. You may obtain a copy of the License at
  8. http://www.apache.org/licenses/LICENSE-2.0
  9. Unless required by applicable law or agreed to in writing, software
  10. distributed under the License is distributed on an "AS IS" BASIS,
  11. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. See the License for the specific language governing permissions and
  13. limitations under the License.
  14. ==================================================================== */
  15. package org.apache.poi.hssf.usermodel;
  16. import java.util.HashSet;
  17. import org.apache.poi.hssf.record.ExtendedFormatRecord;
  18. import org.apache.poi.hssf.record.FontRecord;
  19. import org.apache.poi.hssf.record.StyleRecord;
  20. import org.apache.poi.hssf.record.common.UnicodeString;
  21. import org.apache.poi.ss.usermodel.Cell;
  22. import org.apache.poi.ss.usermodel.CellType;
  23. import org.apache.poi.ss.usermodel.Row;
  24. /**
  25. * Excel can get cranky if you give it files containing too
  26. * many (especially duplicate) objects, and this class can
  27. * help to avoid those.
  28. * In general, it's much better to make sure you don't
  29. * duplicate the objects in your code, as this is likely
  30. * to be much faster than creating lots and lots of
  31. * excel objects+records, only to optimise them down to
  32. * many fewer at a later stage.
  33. * However, sometimes this is too hard / tricky to do, which
  34. * is where the use of this class comes in.
  35. */
  36. public class HSSFOptimiser {
  37. /**
  38. * Goes through the Workbook, optimising the fonts by
  39. * removing duplicate ones.
  40. * For now, only works on fonts used in {@link HSSFCellStyle}
  41. * and {@link HSSFRichTextString}. Any other font uses
  42. * (eg charts, pictures) may well end up broken!
  43. * This can be a slow operation, especially if you have
  44. * lots of cells, cell styles or rich text strings
  45. * @param workbook The workbook in which to optimise the fonts
  46. */
  47. public static void optimiseFonts(HSSFWorkbook workbook) {
  48. // Where each font has ended up, and if we need to
  49. // delete the record for it. Start off with no change
  50. short[] newPos =
  51. new short[workbook.getWorkbook().getNumberOfFontRecords()+1];
  52. boolean[] zapRecords = new boolean[newPos.length];
  53. for(int i=0; i<newPos.length; i++) {
  54. newPos[i] = (short)i;
  55. zapRecords[i] = false;
  56. }
  57. // Get each font record, so we can do deletes
  58. // without getting confused
  59. FontRecord[] frecs = new FontRecord[newPos.length];
  60. for(int i=0; i<newPos.length; i++) {
  61. // There is no 4!
  62. if(i == 4) continue;
  63. frecs[i] = workbook.getWorkbook().getFontRecordAt(i);
  64. }
  65. // Loop over each font, seeing if it is the same
  66. // as an earlier one. If it is, point users of the
  67. // later duplicate copy to the earlier one, and
  68. // mark the later one as needing deleting
  69. // Note - don't change built in fonts (those before 5)
  70. for(int i=5; i<newPos.length; i++) {
  71. // Check this one for being a duplicate
  72. // of an earlier one
  73. int earlierDuplicate = -1;
  74. for(int j=0; j<i && earlierDuplicate == -1; j++) {
  75. if(j == 4) continue;
  76. FontRecord frCheck = workbook.getWorkbook().getFontRecordAt(j);
  77. if(frCheck.sameProperties(frecs[i])) {
  78. earlierDuplicate = j;
  79. }
  80. }
  81. // If we got a duplicate, mark it as such
  82. if(earlierDuplicate != -1) {
  83. newPos[i] = (short)earlierDuplicate;
  84. zapRecords[i] = true;
  85. }
  86. }
  87. // Update the new positions based on
  88. // deletes that have occurred between
  89. // the start and them
  90. // Only need to worry about user fonts
  91. for(int i=5; i<newPos.length; i++) {
  92. // Find the number deleted to that
  93. // point, and adjust
  94. short preDeletePos = newPos[i];
  95. short newPosition = preDeletePos;
  96. for(int j=0; j<preDeletePos; j++) {
  97. if(zapRecords[j]) newPosition--;
  98. }
  99. // Update the new position
  100. newPos[i] = newPosition;
  101. }
  102. // Zap the un-needed user font records
  103. for(int i=5; i<newPos.length; i++) {
  104. if(zapRecords[i]) {
  105. workbook.getWorkbook().removeFontRecord(
  106. frecs[i]
  107. );
  108. }
  109. }
  110. // Tell HSSFWorkbook that it needs to
  111. // re-start its HSSFFontCache
  112. workbook.resetFontCache();
  113. // Update the cell styles to point at the
  114. // new locations of the fonts
  115. for(int i=0; i<workbook.getWorkbook().getNumExFormats(); i++) {
  116. ExtendedFormatRecord xfr = workbook.getWorkbook().getExFormatAt(i);
  117. xfr.setFontIndex(
  118. newPos[ xfr.getFontIndex() ]
  119. );
  120. }
  121. // Update the rich text strings to point at
  122. // the new locations of the fonts
  123. // Remember that one underlying unicode string
  124. // may be shared by multiple RichTextStrings!
  125. HashSet<UnicodeString> doneUnicodeStrings = new HashSet<>();
  126. for(int sheetNum=0; sheetNum<workbook.getNumberOfSheets(); sheetNum++) {
  127. HSSFSheet s = workbook.getSheetAt(sheetNum);
  128. for (Row row : s) {
  129. for (Cell cell : row) {
  130. if(cell.getCellType() == CellType.STRING) {
  131. HSSFRichTextString rtr = (HSSFRichTextString)cell.getRichStringCellValue();
  132. UnicodeString u = rtr.getRawUnicodeString();
  133. // Have we done this string already?
  134. if(! doneUnicodeStrings.contains(u)) {
  135. // Update for each new position
  136. for(short i=5; i<newPos.length; i++) {
  137. if(i != newPos[i]) {
  138. u.swapFontUse(i, newPos[i]);
  139. }
  140. }
  141. // Mark as done
  142. doneUnicodeStrings.add(u);
  143. }
  144. }
  145. }
  146. }
  147. }
  148. }
  149. /**
  150. * Goes through the Wokrbook, optimising the cell styles
  151. * by removing duplicate ones, and ones that aren't used.
  152. * For best results, optimise the fonts via a call to
  153. * {@link #optimiseFonts(HSSFWorkbook)} first.
  154. * @param workbook The workbook in which to optimise the cell styles
  155. */
  156. public static void optimiseCellStyles(HSSFWorkbook workbook) {
  157. // Where each style has ended up, and if we need to
  158. // delete the record for it. Start off with no change
  159. short[] newPos = new short[workbook.getWorkbook().getNumExFormats()];
  160. boolean[] isUsed = new boolean[newPos.length];
  161. boolean[] zapRecords = new boolean[newPos.length];
  162. for(int i=0; i<newPos.length; i++) {
  163. isUsed[i] = false;
  164. newPos[i] = (short)i;
  165. zapRecords[i] = false;
  166. }
  167. // Get each style record, so we can do deletes
  168. // without getting confused
  169. ExtendedFormatRecord[] xfrs = new ExtendedFormatRecord[newPos.length];
  170. for(int i=0; i<newPos.length; i++) {
  171. xfrs[i] = workbook.getWorkbook().getExFormatAt(i);
  172. }
  173. // Loop over each style, seeing if it is the same
  174. // as an earlier one. If it is, point users of the
  175. // later duplicate copy to the earlier one, and
  176. // mark the later one as needing deleting
  177. // Only work on user added ones, which come after 20
  178. for (int i = 21; i < newPos.length; i++) {
  179. // Check this one for being a duplicate
  180. // of an earlier one
  181. int earlierDuplicate = -1;
  182. for (int j = 0; j < i && earlierDuplicate == -1; j++) {
  183. ExtendedFormatRecord xfCheck = workbook.getWorkbook().getExFormatAt(j);
  184. if (xfCheck.equals(xfrs[i]) &&
  185. // newer duplicate user defined styles
  186. !isUserDefined(workbook, j)) {
  187. earlierDuplicate = j;
  188. }
  189. }
  190. // If we got a duplicate, mark it as such
  191. if(earlierDuplicate != -1) {
  192. newPos[i] = (short)earlierDuplicate;
  193. zapRecords[i] = true;
  194. }
  195. }
  196. // Loop over all the cells in the file, and identify any user defined
  197. // styles aren't actually being used (don't touch built-in ones)
  198. for (int sheetNum = 0; sheetNum < workbook.getNumberOfSheets(); sheetNum++) {
  199. HSSFSheet s = workbook.getSheetAt(sheetNum);
  200. for (Row row : s) {
  201. for (Cell cellI : row) {
  202. HSSFCell cell = (HSSFCell) cellI;
  203. short oldXf = cell.getCellValueRecord().getXFIndex();
  204. // some documents contain invalid values here
  205. if(oldXf < newPos.length) {
  206. isUsed[oldXf] = true;
  207. }
  208. }
  209. // also mark row style as being used
  210. short oldXf = ((HSSFRow) row).getRowRecord().getXFIndex();
  211. // some documents contain invalid values here
  212. if(oldXf < newPos.length) {
  213. isUsed[oldXf] = true;
  214. }
  215. }
  216. // also mark column styles as being used
  217. for (int col = s.getSheet().getMinColumnIndex(); col <= s.getSheet().getMaxColumnIndex(); col++) {
  218. short oldXf = s.getSheet().getXFIndexForColAt((short) col);
  219. // some documents contain invalid values here
  220. if(oldXf < newPos.length) {
  221. isUsed[oldXf] = true;
  222. }
  223. }
  224. }
  225. // Propagate isUsed for duplicates and always set user styles to being used to never optimize them away
  226. for (int i = 21; i < isUsed.length; i++) {
  227. // user defined styles are always "used"
  228. if (isUserDefined(workbook, i)) {
  229. isUsed[i] = true;
  230. }
  231. // If we got a duplicate which is used, mark the one we're keeping as used
  232. if(newPos[i] != i && isUsed[i]) {
  233. isUsed[newPos[i]] = true;
  234. }
  235. }
  236. // Mark any that aren't used as needing zapping
  237. for (int i=21; i<isUsed.length; i++) {
  238. if (! isUsed[i]) {
  239. // Un-used style, can be removed
  240. zapRecords[i] = true;
  241. newPos[i] = 0;
  242. }
  243. }
  244. // Update the new positions based on
  245. // deletes that have occurred between
  246. // the start and them
  247. // Only work on user added ones, which come after 20
  248. for(int i=21; i<newPos.length; i++) {
  249. // Find the number deleted to that
  250. // point, and adjust
  251. short preDeletePos = newPos[i];
  252. short newPosition = preDeletePos;
  253. for(int j=0; j<preDeletePos; j++) {
  254. if(zapRecords[j]) newPosition--;
  255. }
  256. // Update the new position
  257. newPos[i] = newPosition;
  258. // also update StyleRecord and Parent-link
  259. if (i != newPosition && newPosition != 0) {
  260. workbook.getWorkbook().updateStyleRecord(i, newPosition);
  261. ExtendedFormatRecord exFormat = workbook.getWorkbook().getExFormatAt(i);
  262. short oldParent = exFormat.getParentIndex();
  263. // some documents contain invalid values here
  264. if(oldParent < newPos.length) {
  265. short newParent = newPos[oldParent];
  266. exFormat.setParentIndex(newParent);
  267. }
  268. }
  269. }
  270. // Zap the un-needed user style records
  271. // removing by index, because removing by object may delete
  272. // styles we did not intend to (the ones that _were_ duplicated and not the duplicates)
  273. int max = newPos.length;
  274. int removed = 0; // to adjust index after deletion
  275. for(int i=21; i<max; i++) {
  276. if(zapRecords[i + removed]) {
  277. workbook.getWorkbook().removeExFormatRecord(i);
  278. i--;
  279. max--;
  280. removed++;
  281. }
  282. }
  283. // Finally, update the cells to point at their new extended format records
  284. for (int sheetNum = 0; sheetNum < workbook.getNumberOfSheets(); sheetNum++) {
  285. HSSFSheet s = workbook.getSheetAt(sheetNum);
  286. for (Row row : s) {
  287. for (Cell cell : row) {
  288. short oldXf = ((HSSFCell) cell).getCellValueRecord().getXFIndex();
  289. // some documents contain invalid values here
  290. if(oldXf >= newPos.length) {
  291. continue;
  292. }
  293. HSSFCellStyle newStyle = workbook.getCellStyleAt(newPos[oldXf]);
  294. cell.setCellStyle(newStyle);
  295. }
  296. // adjust row column style
  297. short oldXf = ((HSSFRow) row).getRowRecord().getXFIndex();
  298. // some documents contain invalid values here
  299. if(oldXf >= newPos.length) {
  300. continue;
  301. }
  302. HSSFCellStyle newStyle = workbook.getCellStyleAt(newPos[oldXf]);
  303. row.setRowStyle(newStyle);
  304. }
  305. // adjust cell column style
  306. for (int col = s.getSheet().getMinColumnIndex(); col <= s.getSheet().getMaxColumnIndex(); col++) {
  307. short oldXf = s.getSheet().getXFIndexForColAt((short) col);
  308. // some documents contain invalid values here
  309. if(oldXf >= newPos.length) {
  310. continue;
  311. }
  312. HSSFCellStyle newStyle = workbook.getCellStyleAt(newPos[oldXf]);
  313. s.setDefaultColumnStyle(col, newStyle);
  314. }
  315. }
  316. }
  317. private static boolean isUserDefined(HSSFWorkbook workbook, int index) {
  318. StyleRecord styleRecord = workbook.getWorkbook().getStyleRecord(index);
  319. return styleRecord != null &&
  320. !styleRecord.isBuiltin() &&
  321. styleRecord.getName() != null;
  322. }
  323. }