<changes>
<release version="3.7-SNAPSHOT" date="2010-??-??">
+ <action dev="POI-DEVELOPERS" type="fix">49020 - Workaround Excel outputting invalid XML in button definitions by not closing BR tags</action>
<action dev="POI-DEVELOPERS" type="fix">49050 - Improve performance of AbstractEscherHolderRecord when there are lots of Continue Records</action>
<action dev="POI-DEVELOPERS" type="fix">49194 - Correct text size limit for OOXML .xlsx files</action>
<action dev="POI-DEVELOPERS" type="fix">49254 - Fix CellUtils.setFont to use the correct type internally</action>
import org.apache.poi.POIXMLDocumentPart;
import org.apache.poi.openxml4j.opc.PackagePart;
import org.apache.poi.openxml4j.opc.PackageRelationship;
+import org.apache.poi.xssf.util.EvilUnclosedBRFixingInputStream;
import org.apache.xmlbeans.XmlException;
import org.apache.xmlbeans.XmlOptions;
import org.apache.xmlbeans.XmlObject;
* considered a deprecated format included in Office Open XML for legacy reasons only and new applications that
* need a file format for drawings are strongly encouraged to use preferentially DrawingML
* </p>
+ *
+ * <p>
+ * Warning - Excel is known to put invalid XML into these files!
+ * For example, &lt;br&gt; without being closed or escaped crops up.
+ * </p>
*
* See 6.4 VML - SpreadsheetML Drawing in Office Open XML Part 4 - Markup Language Reference.pdf
*
protected void read(InputStream is) throws IOException, XmlException {
- XmlObject root = XmlObject.Factory.parse(is);
+ XmlObject root = XmlObject.Factory.parse(
+ new EvilUnclosedBRFixingInputStream(is)
+ );
_qnames = new ArrayList<QName>();
_items = new ArrayList<XmlObject>();
--- /dev/null
+/* ====================================================================
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+==================================================================== */
+package org.apache.poi.xssf.util;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.ArrayList;
+
/**
 * This is a seriously sick fix for the fact that some .xlsx
 *  files contain raw bits of HTML, without being escaped
 *  or properly turned into XML.
 * The result is that they contain things like &lt;br&gt;,
 *  which breaks the XML parsing.
 * This very sick InputStream wrapper spots these going past
 *  and rewrites each one as &lt;br/&gt;.
 *
 * <p>Matching is done on raw bytes, so it only works for UTF-8 and
 *  US-ASCII based streams. Only the exact lower-case sequence
 *  &lt;br&gt; is rewritten, and it is rewritten wherever it occurs.
 *  It should only be used where experience shows the problem
 *  can occur...</p>
 *
 * <p>The fix is applied consistently by every <code>read</code> variant,
 *  including when the sequence straddles two reads of the underlying
 *  stream or sits at the very end of the data. The wrapper reads ahead
 *  from the underlying stream into an internal buffer.</p>
 */
public class EvilUnclosedBRFixingInputStream extends InputStream {
    /** The byte sequence that must not reach the XML parser. */
    private static final byte[] DETECT = new byte[] {
        (byte)'<', (byte)'b', (byte)'r', (byte)'>'
    };
    /** What every occurrence of {@link #DETECT} is turned into. */
    private static final byte[] REPLACEMENT = new byte[] {
        (byte)'<', (byte)'b', (byte)'r', (byte)'/', (byte)'>'
    };
    /** Size of the read-ahead buffer used for bulk reads from the source. */
    private static final int BUFFER_SIZE = 4096;

    private final InputStream source;

    /** Raw bytes fetched from the source that have not been examined yet. */
    private final byte[] buffer = new byte[BUFFER_SIZE];
    private int bufferPos;
    private int bufferEnd;

    /**
     * Already examined (and, where needed, fixed) bytes waiting to be
     *  handed to the caller. Never holds more than one replacement, or a
     *  released partial match plus the byte that broke it.
     */
    private final byte[] spare = new byte[REPLACEMENT.length];
    private int sparePos;
    private int spareEnd;

    /** How many leading bytes of {@link #DETECT} are currently held back. */
    private int matched;
    /** Set once the source has reported end of stream. */
    private boolean eof;

    /**
     * @param source the stream to read the possibly broken XML from
     */
    public EvilUnclosedBRFixingInputStream(InputStream source) {
        this.source = source;
    }

    /**
     * Reads the next byte of the fixed-up data.
     *
     * @return the byte as a value from 0 to 255, or -1 at end of stream
     * @throws IOException if the underlying stream fails
     */
    @Override
    public int read() throws IOException {
        while(sparePos == spareEnd) {
            if(eof) {
                return -1;
            }
            refill();
        }
        return spare[sparePos++] & 0xFF;
    }

    /**
     * Reads up to <code>len</code> bytes of the fixed-up data. Never writes
     *  outside <code>b[off]</code> .. <code>b[off+len-1]</code>; anything
     *  that does not fit is kept for the next call.
     *
     * @return the number of bytes stored, 0 if <code>len</code> is 0,
     *  or -1 at end of stream
     * @throws IOException if the underlying stream fails
     * @throws IndexOutOfBoundsException if <code>off</code> / <code>len</code>
     *  do not describe a region inside <code>b</code>
     */
    @Override
    public int read(byte[] b, int off, int len) throws IOException {
        if(b == null) {
            throw new NullPointerException("b must not be null");
        }
        if(off < 0 || len < 0 || len > b.length - off) {
            throw new IndexOutOfBoundsException(
                    "off=" + off + ", len=" + len + ", b.length=" + b.length
            );
        }
        if(len == 0) {
            return 0;
        }

        int count = 0;
        while(count < len) {
            if(sparePos < spareEnd) {
                b[off + count] = spare[sparePos++];
                count++;
                continue;
            }
            if(eof) {
                break;
            }
            // Once there is something to return, don't go back to the
            //  source (which may block) just to fill the rest
            if(count > 0 && bufferPos == bufferEnd) {
                break;
            }
            refill();
        }
        return (count == 0) ? -1 : count;
    }

    @Override
    public int read(byte[] b) throws IOException {
        return this.read(b, 0, b.length);
    }

    /**
     * Closes the underlying stream.
     */
    @Override
    public void close() throws IOException {
        source.close();
    }

    /**
     * Examines exactly one more byte from the source (or notices the end
     *  of the stream) and places whatever output that produces in
     *  {@link #spare}. May produce nothing, when the byte extends a
     *  partial match that is still being held back.
     * Must only be called when {@link #spare} is empty.
     */
    private void refill() throws IOException {
        sparePos = 0;
        spareEnd = 0;

        int next = nextSourceByte();
        if(next == -1) {
            // An unfinished match at the end is just ordinary data
            eof = true;
            releaseHeldBack();
            return;
        }

        byte value = (byte)next;
        if(value == DETECT[matched]) {
            matched++;
            if(matched == DETECT.length) {
                System.arraycopy(REPLACEMENT, 0, spare, 0, REPLACEMENT.length);
                spareEnd = REPLACEMENT.length;
                matched = 0;
            }
            return;
        }

        // Mismatch - the held back bytes were ordinary data after all.
        // '<' only occurs at the start of DETECT, so the only way a new
        //  match can begin here is with this very byte.
        releaseHeldBack();
        if(value == DETECT[0]) {
            matched = 1;
        } else {
            spare[spareEnd++] = value;
        }
    }

    /**
     * Moves the currently held back partial match, if any, to the output.
     */
    private void releaseHeldBack() {
        System.arraycopy(DETECT, 0, spare, 0, matched);
        spareEnd = matched;
        matched = 0;
    }

    /**
     * Returns the next raw byte (0 to 255) from the source, fetching a
     *  new chunk when the read-ahead buffer is used up, or -1 at the end.
     */
    private int nextSourceByte() throws IOException {
        if(bufferPos == bufferEnd) {
            int got;
            do {
                // A well behaved stream never returns 0 for a non-empty
                //  request; retry rather than mistake it for the end
                got = source.read(buffer, 0, buffer.length);
            } while(got == 0);
            if(got < 0) {
                return -1;
            }
            bufferPos = 0;
            bufferEnd = got;
        }
        return buffer[bufferPos++] & 0xFF;
    }
}
assertEquals(1, rels.size());
assertEquals("Sheet1!A1", rels.get(0).getPackageRelationship().getTargetURI().getFragment());
}
+
+ /**
+ * Excel will sometimes write a button with a textbox
+ * containing >br< (not closed!).
+ * Clearly Excel shouldn't do this, but test that we can
+ * read the file despite the naughtyness
+ */
+ public void test49020() throws Exception {
+ XSSFWorkbook wb = XSSFTestDataSamples.openSampleWorkbook("BrNotClosed.xlsx");
+ }
}
--- /dev/null
+/* ====================================================================
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+==================================================================== */
+
+package org.apache.poi.xssf.util;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+
+import junit.framework.TestCase;
+
+public final class TestEvilUnclosedBRFixingInputStream extends TestCase {
+ public void testOK() throws Exception {
+ byte[] ok = "<p><div>Hello There!</div> <div>Tags!</div></p>".getBytes("UTF-8");
+
+ EvilUnclosedBRFixingInputStream inp = new EvilUnclosedBRFixingInputStream(
+ new ByteArrayInputStream(ok)
+ );
+
+ ByteArrayOutputStream bout = new ByteArrayOutputStream();
+ boolean going = true;
+ while(going) {
+ byte[] b = new byte[1024];
+ int r = inp.read(b);
+ if(r > 0) {
+ bout.write(b, 0, r);
+ } else {
+ going = false;
+ }
+ }
+
+ byte[] result = bout.toByteArray();
+ assertEquals(ok, result);
+ }
+
+ public void testProblem() throws Exception {
+ byte[] orig = "<p><div>Hello<br>There!</div> <div>Tags!</div></p>".getBytes("UTF-8");
+ byte[] fixed = "<p><div>Hello<br/>There!</div> <div>Tags!</div></p>".getBytes("UTF-8");
+
+ EvilUnclosedBRFixingInputStream inp = new EvilUnclosedBRFixingInputStream(
+ new ByteArrayInputStream(orig)
+ );
+
+ ByteArrayOutputStream bout = new ByteArrayOutputStream();
+ boolean going = true;
+ while(going) {
+ byte[] b = new byte[1024];
+ int r = inp.read(b);
+ if(r > 0) {
+ bout.write(b, 0, r);
+ } else {
+ going = false;
+ }
+ }
+
+ byte[] result = bout.toByteArray();
+ assertEquals(fixed, result);
+ }
+
+ protected void assertEquals(byte[] a, byte[] b) {
+ assertEquals(a.length, b.length);
+ for(int i=0; i<a.length; i++) {
+ assertEquals("Wrong byte at index " + i, a[i], b[i]);
+ }
+ }
+}