123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359 |
- /* ====================================================================
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
- ==================================================================== */
- package org.apache.poi.hssf.usermodel;
-
- import java.util.HashSet;
-
- import org.apache.poi.hssf.record.ExtendedFormatRecord;
- import org.apache.poi.hssf.record.FontRecord;
- import org.apache.poi.hssf.record.StyleRecord;
- import org.apache.poi.hssf.record.common.UnicodeString;
- import org.apache.poi.ss.usermodel.Cell;
- import org.apache.poi.ss.usermodel.CellType;
- import org.apache.poi.ss.usermodel.Row;
-
- /**
- * Excel can get cranky if you give it files containing too
- * many (especially duplicate) objects, and this class can
- * help to avoid those.
- * In general, it's much better to make sure you don't
- * duplicate the objects in your code, as this is likely
- * to be much faster than creating lots and lots of
- * excel objects+records, only to optimise them down to
- * many fewer at a later stage.
- * However, sometimes this is too hard / tricky to do, which
- * is where the use of this class comes in.
- */
- public class HSSFOptimiser {
- /**
- * Goes through the Workbook, optimising the fonts by
- * removing duplicate ones.
- * For now, only works on fonts used in {@link HSSFCellStyle}
- * and {@link HSSFRichTextString}. Any other font uses
- * (eg charts, pictures) may well end up broken!
- * This can be a slow operation, especially if you have
- * lots of cells, cell styles or rich text strings
- * @param workbook The workbook in which to optimise the fonts
- */
- public static void optimiseFonts(HSSFWorkbook workbook) {
- // Where each font has ended up, and if we need to
- // delete the record for it. Start off with no change
- short[] newPos =
- new short[workbook.getWorkbook().getNumberOfFontRecords()+1];
- boolean[] zapRecords = new boolean[newPos.length];
- for(int i=0; i<newPos.length; i++) {
- newPos[i] = (short)i;
- zapRecords[i] = false;
- }
-
- // Get each font record, so we can do deletes
- // without getting confused
- FontRecord[] frecs = new FontRecord[newPos.length];
- for(int i=0; i<newPos.length; i++) {
- // There is no 4!
- if(i == 4) continue;
-
- frecs[i] = workbook.getWorkbook().getFontRecordAt(i);
- }
-
- // Loop over each font, seeing if it is the same
- // as an earlier one. If it is, point users of the
- // later duplicate copy to the earlier one, and
- // mark the later one as needing deleting
- // Note - don't change built in fonts (those before 5)
- for(int i=5; i<newPos.length; i++) {
- // Check this one for being a duplicate
- // of an earlier one
- int earlierDuplicate = -1;
- for(int j=0; j<i && earlierDuplicate == -1; j++) {
- if(j == 4) continue;
-
- FontRecord frCheck = workbook.getWorkbook().getFontRecordAt(j);
- if(frCheck.sameProperties(frecs[i])) {
- earlierDuplicate = j;
- }
- }
-
- // If we got a duplicate, mark it as such
- if(earlierDuplicate != -1) {
- newPos[i] = (short)earlierDuplicate;
- zapRecords[i] = true;
- }
- }
-
- // Update the new positions based on
- // deletes that have occurred between
- // the start and them
- // Only need to worry about user fonts
- for(int i=5; i<newPos.length; i++) {
- // Find the number deleted to that
- // point, and adjust
- short preDeletePos = newPos[i];
- short newPosition = preDeletePos;
- for(int j=0; j<preDeletePos; j++) {
- if(zapRecords[j]) newPosition--;
- }
-
- // Update the new position
- newPos[i] = newPosition;
- }
-
- // Zap the un-needed user font records
- for(int i=5; i<newPos.length; i++) {
- if(zapRecords[i]) {
- workbook.getWorkbook().removeFontRecord(
- frecs[i]
- );
- }
- }
-
- // Tell HSSFWorkbook that it needs to
- // re-start its HSSFFontCache
- workbook.resetFontCache();
-
- // Update the cell styles to point at the
- // new locations of the fonts
- for(int i=0; i<workbook.getWorkbook().getNumExFormats(); i++) {
- ExtendedFormatRecord xfr = workbook.getWorkbook().getExFormatAt(i);
- xfr.setFontIndex(
- newPos[ xfr.getFontIndex() ]
- );
- }
-
- // Update the rich text strings to point at
- // the new locations of the fonts
- // Remember that one underlying unicode string
- // may be shared by multiple RichTextStrings!
- HashSet<UnicodeString> doneUnicodeStrings = new HashSet<>();
- for(int sheetNum=0; sheetNum<workbook.getNumberOfSheets(); sheetNum++) {
- HSSFSheet s = workbook.getSheetAt(sheetNum);
- for (Row row : s) {
- for (Cell cell : row) {
- if(cell.getCellType() == CellType.STRING) {
- HSSFRichTextString rtr = (HSSFRichTextString)cell.getRichStringCellValue();
- UnicodeString u = rtr.getRawUnicodeString();
-
- // Have we done this string already?
- if(! doneUnicodeStrings.contains(u)) {
- // Update for each new position
- for(short i=5; i<newPos.length; i++) {
- if(i != newPos[i]) {
- u.swapFontUse(i, newPos[i]);
- }
- }
-
- // Mark as done
- doneUnicodeStrings.add(u);
- }
- }
- }
- }
- }
- }
-
- /**
- * Goes through the Wokrbook, optimising the cell styles
- * by removing duplicate ones, and ones that aren't used.
- * For best results, optimise the fonts via a call to
- * {@link #optimiseFonts(HSSFWorkbook)} first.
- * @param workbook The workbook in which to optimise the cell styles
- */
- public static void optimiseCellStyles(HSSFWorkbook workbook) {
- // Where each style has ended up, and if we need to
- // delete the record for it. Start off with no change
- short[] newPos = new short[workbook.getWorkbook().getNumExFormats()];
- boolean[] isUsed = new boolean[newPos.length];
- boolean[] zapRecords = new boolean[newPos.length];
- for(int i=0; i<newPos.length; i++) {
- isUsed[i] = false;
- newPos[i] = (short)i;
- zapRecords[i] = false;
- }
-
- // Get each style record, so we can do deletes
- // without getting confused
- ExtendedFormatRecord[] xfrs = new ExtendedFormatRecord[newPos.length];
- for(int i=0; i<newPos.length; i++) {
- xfrs[i] = workbook.getWorkbook().getExFormatAt(i);
- }
-
- // Loop over each style, seeing if it is the same
- // as an earlier one. If it is, point users of the
- // later duplicate copy to the earlier one, and
- // mark the later one as needing deleting
- // Only work on user added ones, which come after 20
- for (int i = 21; i < newPos.length; i++) {
- // Check this one for being a duplicate
- // of an earlier one
- int earlierDuplicate = -1;
- for (int j = 0; j < i && earlierDuplicate == -1; j++) {
- ExtendedFormatRecord xfCheck = workbook.getWorkbook().getExFormatAt(j);
- if (xfCheck.equals(xfrs[i]) &&
- // newer duplicate user defined styles
- !isUserDefined(workbook, j)) {
- earlierDuplicate = j;
- }
- }
-
- // If we got a duplicate, mark it as such
- if(earlierDuplicate != -1) {
- newPos[i] = (short)earlierDuplicate;
- zapRecords[i] = true;
- }
- }
-
- // Loop over all the cells in the file, and identify any user defined
- // styles aren't actually being used (don't touch built-in ones)
- for (int sheetNum = 0; sheetNum < workbook.getNumberOfSheets(); sheetNum++) {
- HSSFSheet s = workbook.getSheetAt(sheetNum);
- for (Row row : s) {
- for (Cell cellI : row) {
- HSSFCell cell = (HSSFCell) cellI;
- short oldXf = cell.getCellValueRecord().getXFIndex();
- // some documents contain invalid values here
- if(oldXf < newPos.length) {
- isUsed[oldXf] = true;
- }
- }
-
- // also mark row style as being used
- short oldXf = ((HSSFRow) row).getRowRecord().getXFIndex();
- // some documents contain invalid values here
- if(oldXf < newPos.length) {
- isUsed[oldXf] = true;
- }
- }
-
- // also mark column styles as being used
- for (int col = s.getSheet().getMinColumnIndex(); col <= s.getSheet().getMaxColumnIndex(); col++) {
- short oldXf = s.getSheet().getXFIndexForColAt((short) col);
- // some documents contain invalid values here
- if(oldXf < newPos.length) {
- isUsed[oldXf] = true;
- }
- }
- }
-
- // Propagate isUsed for duplicates and always set user styles to being used to never optimize them away
- for (int i = 21; i < isUsed.length; i++) {
- // user defined styles are always "used"
- if (isUserDefined(workbook, i)) {
- isUsed[i] = true;
- }
-
- // If we got a duplicate which is used, mark the one we're keeping as used
- if(newPos[i] != i && isUsed[i]) {
- isUsed[newPos[i]] = true;
- }
- }
-
- // Mark any that aren't used as needing zapping
- for (int i=21; i<isUsed.length; i++) {
- if (! isUsed[i]) {
- // Un-used style, can be removed
- zapRecords[i] = true;
- newPos[i] = 0;
- }
- }
-
- // Update the new positions based on
- // deletes that have occurred between
- // the start and them
- // Only work on user added ones, which come after 20
- for(int i=21; i<newPos.length; i++) {
- // Find the number deleted to that
- // point, and adjust
- short preDeletePos = newPos[i];
- short newPosition = preDeletePos;
- for(int j=0; j<preDeletePos; j++) {
- if(zapRecords[j]) newPosition--;
- }
-
- // Update the new position
- newPos[i] = newPosition;
- // also update StyleRecord and Parent-link
- if (i != newPosition && newPosition != 0) {
- workbook.getWorkbook().updateStyleRecord(i, newPosition);
-
- ExtendedFormatRecord exFormat = workbook.getWorkbook().getExFormatAt(i);
- short oldParent = exFormat.getParentIndex();
- // some documents contain invalid values here
- if(oldParent < newPos.length) {
- short newParent = newPos[oldParent];
- exFormat.setParentIndex(newParent);
- }
- }
- }
-
- // Zap the un-needed user style records
- // removing by index, because removing by object may delete
- // styles we did not intend to (the ones that _were_ duplicated and not the duplicates)
- int max = newPos.length;
- int removed = 0; // to adjust index after deletion
- for(int i=21; i<max; i++) {
- if(zapRecords[i + removed]) {
- workbook.getWorkbook().removeExFormatRecord(i);
- i--;
- max--;
- removed++;
- }
- }
-
- // Finally, update the cells to point at their new extended format records
- for (int sheetNum = 0; sheetNum < workbook.getNumberOfSheets(); sheetNum++) {
- HSSFSheet s = workbook.getSheetAt(sheetNum);
- for (Row row : s) {
- for (Cell cell : row) {
- short oldXf = ((HSSFCell) cell).getCellValueRecord().getXFIndex();
- // some documents contain invalid values here
- if(oldXf >= newPos.length) {
- continue;
- }
- HSSFCellStyle newStyle = workbook.getCellStyleAt(newPos[oldXf]);
- cell.setCellStyle(newStyle);
- }
-
- // adjust row column style
- short oldXf = ((HSSFRow) row).getRowRecord().getXFIndex();
- // some documents contain invalid values here
- if(oldXf >= newPos.length) {
- continue;
- }
- HSSFCellStyle newStyle = workbook.getCellStyleAt(newPos[oldXf]);
- row.setRowStyle(newStyle);
- }
-
- // adjust cell column style
- for (int col = s.getSheet().getMinColumnIndex(); col <= s.getSheet().getMaxColumnIndex(); col++) {
- short oldXf = s.getSheet().getXFIndexForColAt((short) col);
- // some documents contain invalid values here
- if(oldXf >= newPos.length) {
- continue;
- }
- HSSFCellStyle newStyle = workbook.getCellStyleAt(newPos[oldXf]);
- s.setDefaultColumnStyle(col, newStyle);
- }
- }
- }
-
- private static boolean isUserDefined(HSSFWorkbook workbook, int index) {
- StyleRecord styleRecord = workbook.getWorkbook().getStyleRecord(index);
- return styleRecord != null &&
- !styleRecord.isBuiltin() &&
- styleRecord.getName() != null;
- }
- }
|