Improve the speed of the styles-optimiser when there are many styles in a Workbook

author Dominik Stadler <centic@apache.org>

Tue, 29 Dec 2020 19:29:22 +0000 (19:29 +0000)

committer Dominik Stadler <centic@apache.org>

Tue, 29 Dec 2020 19:29:22 +0000 (19:29 +0000)
author Dominik Stadler <centic@apache.org>
Tue, 29 Dec 2020 19:29:22 +0000 (19:29 +0000)
committer Dominik Stadler <centic@apache.org>
Tue, 29 Dec 2020 19:29:22 +0000 (19:29 +0000)
diff --git a/src/java/org/apache/poi/hssf/usermodel/HSSFOptimiser.java b/src/java/org/apache/poi/hssf/usermodel/HSSFOptimiser.java

index f4287ce4b541a670c5e2c1e1443ed3d5104f8efd..40799029be365ec529c8e63bcd1fa3ca2952d9fb 100644 (file)
--- a/src/java/org/apache/poi/hssf/usermodel/HSSFOptimiser.java
+++ b/src/java/org/apache/poi/hssf/usermodel/HSSFOptimiser.java
@@ -30,7 +30,7 @@ import org.apache.poi.ss.usermodel.Row;
   * Excel can get cranky if you give it files containing too
   *  many (especially duplicate) objects, and this class can
   *  help to avoid those.
- * In general, it's much better to make sure you don't 
+ * In general, it's much better to make sure you don't
   *  duplicate the objects in your code, as this is likely
   *  to be much faster than creating lots and lots of
   *  excel objects+records, only to optimise them down to
@@ -52,27 +52,27 @@ public class HSSFOptimiser {
         public static void optimiseFonts(HSSFWorkbook workbook) {
                 // Where each font has ended up, and if we need to
                 //  delete the record for it. Start off with no change
-               short[] newPos = 
+               short[] newPos =
                         new short[workbook.getWorkbook().getNumberOfFontRecords()+1];
                 boolean[] zapRecords = new boolean[newPos.length];
                 for(int i=0; i<newPos.length; i++) {
                         newPos[i] = (short)i;
                         zapRecords[i] = false;
                 }
-               
+
                 // Get each font record, so we can do deletes
                 //  without getting confused
-               FontRecord[] frecs = new FontRecord[newPos.length]; 
+               FontRecord[] frecs = new FontRecord[newPos.length];
                 for(int i=0; i<newPos.length; i++) {
                         // There is no 4!
                         if(i == 4) continue;
-                       
+
                         frecs[i] = workbook.getWorkbook().getFontRecordAt(i);
                 }
-               
+
                 // Loop over each font, seeing if it is the same
                 //  as an earlier one. If it is, point users of the
-               //  later duplicate copy to the earlier one, and 
+               //  later duplicate copy to the earlier one, and
                 //  mark the later one as needing deleting
                 // Note - don't change built in fonts (those before 5)
                 for(int i=5; i<newPos.length; i++) {
@@ -81,20 +81,20 @@ public class HSSFOptimiser {
                         int earlierDuplicate = -1;
                         for(int j=0; j<i && earlierDuplicate == -1; j++) {
                                 if(j == 4) continue;
-                               
+
                                 FontRecord frCheck = workbook.getWorkbook().getFontRecordAt(j);
                                 if(frCheck.sameProperties(frecs[i])) {
                                         earlierDuplicate = j;
                                 }
                         }
-                       
+
                         // If we got a duplicate, mark it as such
                         if(earlierDuplicate != -1) {
                                 newPos[i] = (short)earlierDuplicate;
                                 zapRecords[i] = true;
                         }
                 }
-               
+
                 // Update the new positions based on
                 //  deletes that have occurred between
                 //  the start and them
@@ -107,11 +107,11 @@ public class HSSFOptimiser {
                         for(int j=0; j<preDeletePos; j++) {
                                 if(zapRecords[j]) newPosition--;
                         }
-                       
+
                         // Update the new position
                         newPos[i] = newPosition;
                 }
-               
+
                 // Zap the un-needed user font records
                 for(int i=5; i<newPos.length; i++) {
                         if(zapRecords[i]) {
@@ -120,12 +120,12 @@ public class HSSFOptimiser {
                                 );
                         }
                 }
-               
+
                 // Tell HSSFWorkbook that it needs to
                 //  re-start its HSSFFontCache
                 workbook.resetFontCache();
-               
-               // Update the cell styles to point at the 
+
+               // Update the cell styles to point at the
                 //  new locations of the fonts
                 for(int i=0; i<workbook.getWorkbook().getNumExFormats(); i++) {
                         ExtendedFormatRecord xfr = workbook.getWorkbook().getExFormatAt(i);
@@ -133,7 +133,7 @@ public class HSSFOptimiser {
                                         newPos[ xfr.getFontIndex() ]
                         );
                 }
-               
+
                 // Update the rich text strings to point at
                 //  the new locations of the fonts
                 // Remember that one underlying unicode string
@@ -146,7 +146,7 @@ public class HSSFOptimiser {
                                         if(cell.getCellType() == CellType.STRING) {
                                                 HSSFRichTextString rtr = (HSSFRichTextString)cell.getRichStringCellValue();
                                                 UnicodeString u = rtr.getRawUnicodeString();
-                                               
+
                                                 // Have we done this string already?
                                                 if(! doneUnicodeStrings.contains(u)) {
                                                         // Update for each new position
@@ -155,7 +155,7 @@ public class HSSFOptimiser {
                                                                         u.swapFontUse(i, newPos[i]);
                                                                 }
                                                         }
-                                                       
+
                                                         // Mark as done
                                                         doneUnicodeStrings.add(u);
                                                 }
@@ -164,7 +164,7 @@ public class HSSFOptimiser {
                         }
                 }
         }
-       
+
     /**
      * Goes through the Wokrbook, optimising the cell styles
      *  by removing duplicate ones, and ones that aren't used.
@@ -178,17 +178,24 @@ public class HSSFOptimiser {
         short[] newPos = new short[workbook.getWorkbook().getNumExFormats()];
         boolean[] isUsed = new boolean[newPos.length];
         boolean[] zapRecords = new boolean[newPos.length];
+
+          // to speed up the optimisation for workbooks with a large number of
+          // styles we perform the isUserDefined() check only once as it is
+          // costly according to some profiling
+          boolean[] userDefined = new boolean[newPos.length];
+
+          // Get each style record, so we can do deletes
+          //  without getting confused
+          ExtendedFormatRecord[] xfrs = new ExtendedFormatRecord[newPos.length];
+
         for(int i=0; i<newPos.length; i++) {
             isUsed[i] = false;
             newPos[i] = (short)i;
             zapRecords[i] = false;
-       }
  
-       // Get each style record, so we can do deletes
-       //  without getting confused
-       ExtendedFormatRecord[] xfrs = new ExtendedFormatRecord[newPos.length];
-       for(int i=0; i<newPos.length; i++) {
-           xfrs[i] = workbook.getWorkbook().getExFormatAt(i);
+                  userDefined[i] = isUserDefined(workbook, i);
+
+                  xfrs[i] = workbook.getWorkbook().getExFormatAt(i);
         }
  
            // Loop over each style, seeing if it is the same
@@ -200,12 +207,13 @@ public class HSSFOptimiser {
                    // Check this one for being a duplicate
                    //  of an earlier one
                    int earlierDuplicate = -1;
-                  for (int j = 0; j < i && earlierDuplicate == -1; j++) {
+                  for (int j = 0; j < i; j++) {
                            ExtendedFormatRecord xfCheck = workbook.getWorkbook().getExFormatAt(j);
                            if (xfCheck.equals(xfrs[i]) &&
-                                          // newer duplicate user defined styles
-                                          !isUserDefined(workbook, j)) {
+                                          // never duplicate user defined styles
+                                          !userDefined[j]) {
                                    earlierDuplicate = j;
+                                  break;
                            }
                    }
  
diff --git a/test-data/spreadsheet/styles-3563.xls b/test-data/spreadsheet/styles-3563.xls

new file mode 100644 (file)

index 0000000..05376aa

Binary files /dev/null and b/test-data/spreadsheet/styles-3563.xls differ
author	Dominik Stadler <centic@apache.org>
	Tue, 29 Dec 2020 19:29:22 +0000 (19:29 +0000)
committer	Dominik Stadler <centic@apache.org>
	Tue, 29 Dec 2020 19:29:22 +0000 (19:29 +0000)
src/java/org/apache/poi/hssf/usermodel/HSSFOptimiser.java		patch \| blob \| history
test-data/spreadsheet/styles-3563.xls	[new file with mode: 0644]	patch \| blob