source.dussan.org Git - poi.git/commitdiff
bug 61045 -- allow for (and log!) extra bytes in FormatRecord.
author: Tim Allison <tallison@apache.org>
Tue, 20 Jun 2017 18:11:34 +0000 (18:11 +0000)
committer: Tim Allison <tallison@apache.org>
Tue, 20 Jun 2017 18:11:34 +0000 (18:11 +0000)
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1799360 13f79535-47bb-0310-9956-ffa450edef68

src/java/org/apache/poi/hssf/record/DimensionsRecord.java
src/java/org/apache/poi/hssf/record/FormatRecord.java
src/testcases/org/apache/poi/hssf/extractor/TestExcelExtractor.java
test-data/spreadsheet/61045_govdocs1_626534.xls [new file with mode: 0644]

index e326b5cb6f71fc26261cf7e4ad6f3346fefb1e8b..1525d58c6d0ea96f55d0f3e96941a64e61a8d2d6 100644 (file)
@@ -20,6 +20,8 @@
 package org.apache.poi.hssf.record;
 
 import org.apache.poi.util.LittleEndianOutput;
+import org.apache.poi.util.POILogFactory;
+import org.apache.poi.util.POILogger;
 
 /**
  * Title:        Dimensions Record<P>
@@ -32,6 +34,9 @@ import org.apache.poi.util.LittleEndianOutput;
  */
 
 public final class DimensionsRecord extends StandardRecord implements Cloneable {
+
+    private static final POILogger logger = POILogFactory.getLogger(DimensionsRecord.class);
+
     public final static short sid = 0x200;
     private int               field_1_first_row;
     private int               field_2_last_row;   // plus 1
@@ -50,6 +55,11 @@ public final class DimensionsRecord extends StandardRecord implements Cloneable
         field_3_first_col = in.readShort();
         field_4_last_col  = in.readShort();
         field_5_zero      = in.readShort();
+        //POI-61045 -- in practice, there can be an extra 2 bytes
+        if (in.available() == 2) {
+            logger.log(POILogger.INFO, "DimensionsRecord has extra 2 bytes.");
+            in.readShort();
+        }
     }
 
     /**
index 955c52c2286ce525dd6fd01cd5a79b7183550406..575f709fb1b5c790ebca7ccd1fe7b0ca695a2367 100644 (file)
 package org.apache.poi.hssf.record;
 
 import org.apache.poi.util.HexDump;
+import org.apache.poi.util.LittleEndianConsts;
 import org.apache.poi.util.LittleEndianOutput;
+import org.apache.poi.util.POILogFactory;
+import org.apache.poi.util.POILogger;
 import org.apache.poi.util.StringUtil;
 
 /**
@@ -28,6 +31,9 @@ import org.apache.poi.util.StringUtil;
  * REFERENCE:  PG 317 Microsoft Excel 97 Developer's Kit (ISBN: 1-57231-498-2)
  */
 public final class FormatRecord extends StandardRecord implements Cloneable {
+
+    private static final POILogger logger = POILogFactory.getLogger(FormatRecord.class);
+
     public final static short sid = 0x041E;
 
     private final int field_1_index_code;
@@ -52,9 +58,9 @@ public final class FormatRecord extends StandardRecord implements Cloneable {
         field_3_hasMultibyte = (in.readByte() & 0x01) != 0;
 
         if (field_3_hasMultibyte) {
-            field_4_formatstring = in.readUnicodeLEString(field_3_unicode_len);
+            field_4_formatstring = readStringCommon(in, field_3_unicode_len, false);
         } else {
-            field_4_formatstring = in.readCompressedUnicode(field_3_unicode_len);
+            field_4_formatstring = readStringCommon(in, field_3_unicode_len, true);
         }
     }
 
@@ -113,4 +119,55 @@ public final class FormatRecord extends StandardRecord implements Cloneable {
     public FormatRecord clone() {
         return new FormatRecord(this);
     }
+
+    /**
+     * Reads the format string from the bytes left in the current record,
+     * tolerating records whose declared string length does not match the
+     * data that is actually present.
+     * <p>
+     * This is a custom copy of {@code ris.readUnicodeLEString} that allows
+     * for extra bytes at the end. The number of characters read is derived
+     * from {@code ris.remaining()}, not from {@code requestedLength}; the
+     * requested length is only sanity-checked.
+     *
+     * @param ris the record stream, positioned at the first character of the string
+     * @param requestedLength the string length declared in the record
+     * @param pIsCompressedEncoding {@code true} to read one unsigned byte per
+     *        character, {@code false} to read one short per character
+     * @return the characters that were read, including a single trailing byte if one was left over
+     * @throws IllegalArgumentException if {@code requestedLength} is negative
+     *         or greater than {@code 0x100000}
+     */
+    private static String readStringCommon(RecordInputStream ris, int requestedLength, boolean pIsCompressedEncoding) {
+        // Sanity check to detect garbage string lengths
+        if (requestedLength < 0 || requestedLength > 0x100000) { // 0x100000 = 1,048,576 chars
+            throw new IllegalArgumentException("Bad requested string length (" + requestedLength + ")");
+        }
+
+        // whole characters still present in the record: one byte each when
+        // compressed, one short each otherwise
+        int availableChars = pIsCompressedEncoding ? ris.remaining() : ris.remaining() / LittleEndianConsts.SHORT_SIZE;
+
+        //sometimes in older Excel 97 .xls files,
+        //the requested length is wrong, so always
+        //read all available characters. When the record is
+        //well formed this is exactly the requested length.
+        char[] buf = new char[availableChars];
+        for (int i = 0; i < buf.length; i++) {
+            if (pIsCompressedEncoding) {
+                buf[i] = (char) ris.readUByte();
+            } else {
+                buf[i] = (char) ris.readShort();
+            }
+        }
+
+        //TIKA-2154's file shows that even in a unicode string
+        //there can be a remaining byte (without proper final '00')
+        //that should be read as a byte
+        if (ris.available() == 1) {
+            char[] tmp = new char[buf.length+1];
+            System.arraycopy(buf, 0, tmp, 0, buf.length);
+            tmp[buf.length] = (char)ris.readUByte();
+            buf = tmp;
+        }
+
+        if (ris.available() > 0) {
+            logger.log(POILogger.INFO, "FormatRecord has "+ris.available()+" unexplained bytes. Silently skipping");
+            //swallow what's left so the record is fully consumed
+            while (ris.available() > 0) {
+                ris.readByte();
+            }
+        }
+        return new String(buf);
+    }
+
 }
index 2953a5b1acc0b8771aecc9cdaa00953bb4ad200e..1a67ec53a780f4104885106c0f890e151aa2a219 100644 (file)
 
 package org.apache.poi.hssf.extractor;
 
+import static org.apache.poi.POITestCase.assertContains;
+import static org.apache.poi.POITestCase.assertStartsWith;
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertNotNull;
 import static org.junit.Assert.assertTrue;
-import static org.apache.poi.POITestCase.assertContains;
-import static org.apache.poi.POITestCase.assertStartsWith;
 
 import java.io.File;
 import java.io.IOException;
@@ -388,4 +388,13 @@ public final class TestExcelExtractor {
                assertNotNull(extractor.getText());
                extractor.close();
        }
+
+       /**
+        * Bug 61045: text extraction must succeed on a file whose records
+        * carry extra trailing bytes. File is govdocs1 626534.
+        */
+       @Test
+       public void test61045() throws IOException {
+               ExcelExtractor extractor = createExtractor("61045_govdocs1_626534.xls");
+               try {
+                       String txt = extractor.getText();
+                       assertContains(txt, "NONBUSINESS");
+               } finally {
+                       //close the extractor even if the assertion fails
+                       extractor.close();
+               }
+       }
+
 }
diff --git a/test-data/spreadsheet/61045_govdocs1_626534.xls b/test-data/spreadsheet/61045_govdocs1_626534.xls
new file mode 100644 (file)
index 0000000..e285403
Binary files /dev/null and b/test-data/spreadsheet/61045_govdocs1_626534.xls differ