From 54f17d41da779d6e13c9af856b8c438da2a4a731 Mon Sep 17 00:00:00 2001 From: Josh Micich Date: Fri, 7 Aug 2009 00:21:00 +0000 Subject: [PATCH] code improvements to RecordFactoryInputStream git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@801850 13f79535-47bb-0310-9956-ffa450edef68 --- .../hssf/eventusermodel/HSSFEventFactory.java | 151 +++---- .../apache/poi/hssf/record/RecordFactory.java | 7 +- .../hssf/record/RecordFactoryInputStream.java | 410 +++++++++--------- 3 files changed, 276 insertions(+), 292 deletions(-) diff --git a/src/java/org/apache/poi/hssf/eventusermodel/HSSFEventFactory.java b/src/java/org/apache/poi/hssf/eventusermodel/HSSFEventFactory.java index 2238b39b82..2b392c0e3c 100644 --- a/src/java/org/apache/poi/hssf/eventusermodel/HSSFEventFactory.java +++ b/src/java/org/apache/poi/hssf/eventusermodel/HSSFEventFactory.java @@ -31,7 +31,7 @@ import org.apache.poi.poifs.filesystem.POIFSFileSystem; * processWorkbookEvents along with a request. * * This will cause your file to be processed a record at a time. Each record with - * a static id matching one that you have registed in your HSSFRequest will be passed + * a static id matching one that you have registered in your HSSFRequest will be passed * to your associated HSSFListener. * * @see org.apache.poi.hssf.dev.EFHSSF @@ -39,115 +39,98 @@ import org.apache.poi.poifs.filesystem.POIFSFileSystem; * @author Andrew C. Oliver (acoliver at apache dot org) * @author Carey Sublette (careysub@earthling.net) */ +public class HSSFEventFactory { + /** Creates a new instance of HSSFEventFactory */ + public HSSFEventFactory() { + // no instance fields + } -public class HSSFEventFactory -{ - /** Creates a new instance of HSSFEventFactory */ - - public HSSFEventFactory() - { - } - - /** - * Processes a file into essentially record events. 
- * - * @param req an Instance of HSSFRequest which has your registered listeners - * @param fs a POIFS filesystem containing your workbook - */ - - public void processWorkbookEvents(HSSFRequest req, POIFSFileSystem fs) - throws IOException - { - InputStream in = fs.createDocumentInputStream("Workbook"); + /** + * Processes a file into essentially record events. + * + * @param req an Instance of HSSFRequest which has your registered listeners + * @param fs a POIFS filesystem containing your workbook + */ + public void processWorkbookEvents(HSSFRequest req, POIFSFileSystem fs) throws IOException { + InputStream in = fs.createDocumentInputStream("Workbook"); - processEvents(req, in); - } + processEvents(req, in); + } - /** + /** * Processes a file into essentially record events. * - * @param req an Instance of HSSFRequest which has your registered listeners - * @param fs a POIFS filesystem containing your workbook - * @return numeric user-specified result code. + * @param req an Instance of HSSFRequest which has your registered listeners + * @param fs a POIFS filesystem containing your workbook + * @return numeric user-specified result code. */ - public short abortableProcessWorkbookEvents(HSSFRequest req, POIFSFileSystem fs) - throws IOException, HSSFUserException - { + throws IOException, HSSFUserException { InputStream in = fs.createDocumentInputStream("Workbook"); return abortableProcessEvents(req, in); - } - - /** - * Processes a DocumentInputStream into essentially Record events. - * - * If an AbortableHSSFListener causes a halt to processing during this call - * the method will return just as with abortableProcessEvents, but no - * user code or HSSFUserException will be passed back. 
- * - * @see org.apache.poi.poifs.filesystem.POIFSFileSystem#createDocumentInputStream(String) - * @param req an Instance of HSSFRequest which has your registered listeners - * @param in a DocumentInputStream obtained from POIFS's POIFSFileSystem object - */ - - public void processEvents(HSSFRequest req, InputStream in) - throws IOException - { - try - { + } + + /** + * Processes a DocumentInputStream into essentially Record events. + * + * If an AbortableHSSFListener causes a halt to processing during this call + * the method will return just as with abortableProcessEvents, but no + * user code or HSSFUserException will be passed back. + * + * @see org.apache.poi.poifs.filesystem.POIFSFileSystem#createDocumentInputStream(String) + * @param req an Instance of HSSFRequest which has your registered listeners + * @param in a DocumentInputStream obtained from POIFS's POIFSFileSystem object + */ + public void processEvents(HSSFRequest req, InputStream in) { + try { genericProcessEvents(req, new RecordInputStream(in)); + } catch (HSSFUserException hue) { + /*If an HSSFUserException user exception is thrown, ignore it.*/ } - catch (HSSFUserException hue) - {/*If an HSSFUserException user exception is thrown, ignore it.*/ } } - /** - * Processes a DocumentInputStream into essentially Record events. - * - * @see org.apache.poi.poifs.filesystem.POIFSFileSystem#createDocumentInputStream(String) - * @param req an Instance of HSSFRequest which has your registered listeners - * @param in a DocumentInputStream obtained from POIFS's POIFSFileSystem object - * @return numeric user-specified result code. - */ - - public short abortableProcessEvents(HSSFRequest req, InputStream in) - throws IOException, HSSFUserException - { + /** + * Processes a DocumentInputStream into essentially Record events. 
+ * + * @see org.apache.poi.poifs.filesystem.POIFSFileSystem#createDocumentInputStream(String) + * @param req an Instance of HSSFRequest which has your registered listeners + * @param in a DocumentInputStream obtained from POIFS's POIFSFileSystem object + * @return numeric user-specified result code. + */ + public short abortableProcessEvents(HSSFRequest req, InputStream in) + throws HSSFUserException { return genericProcessEvents(req, new RecordInputStream(in)); - } + } - /** + /** * Processes a DocumentInputStream into essentially Record events. * * @see org.apache.poi.poifs.filesystem.POIFSFileSystem#createDocumentInputStream(String) - * @param req an Instance of HSSFRequest which has your registered listeners - * @param in a DocumentInputStream obtained from POIFS's POIFSFileSystem object - * @return numeric user-specified result code. + * @param req an Instance of HSSFRequest which has your registered listeners + * @param in a DocumentInputStream obtained from POIFS's POIFSFileSystem object + * @return numeric user-specified result code. 
*/ - protected short genericProcessEvents(HSSFRequest req, RecordInputStream in) - throws IOException, HSSFUserException - { - boolean going = true; + throws HSSFUserException { short userCode = 0; - Record r = null; - + // Create a new RecordStream and use that - RecordFactoryInputStream recordStream = new RecordFactoryInputStream(in); - + RecordFactoryInputStream recordStream = new RecordFactoryInputStream(in, false); + // Process each record as they come in - while(going) { - r = recordStream.nextRecord(); - if(r != null) { - userCode = req.processRecord(r); - if (userCode != 0) break; - } else { - going = false; + while(true) { + Record r = recordStream.nextRecord(); + if(r == null) { + break; + } + userCode = req.processRecord(r); + if (userCode != 0) { + break; } } - + // All done, return our last code return userCode; - } + } } diff --git a/src/java/org/apache/poi/hssf/record/RecordFactory.java b/src/java/org/apache/poi/hssf/record/RecordFactory.java index 53213dfc1a..f458606678 100644 --- a/src/java/org/apache/poi/hssf/record/RecordFactory.java +++ b/src/java/org/apache/poi/hssf/record/RecordFactory.java @@ -369,12 +369,11 @@ public final class RecordFactory { public static List createRecords(InputStream in) throws RecordFormatException { List records = new ArrayList(NUM_RECORDS); - RecordFactoryInputStream recStream = new RecordFactoryInputStream(new RecordInputStream(in)); - recStream.setIncludeContinueRecords(true); + RecordFactoryInputStream recStream = new RecordFactoryInputStream(new RecordInputStream(in), true); - Record record; + Record record; while ((record = recStream.nextRecord())!=null) { - records.add(record); + records.add(record); } return records; diff --git a/src/java/org/apache/poi/hssf/record/RecordFactoryInputStream.java b/src/java/org/apache/poi/hssf/record/RecordFactoryInputStream.java index 19f2ca45c9..541dfd2dc0 100755 --- a/src/java/org/apache/poi/hssf/record/RecordFactoryInputStream.java +++ 
b/src/java/org/apache/poi/hssf/record/RecordFactoryInputStream.java @@ -19,10 +19,6 @@ package org.apache.poi.hssf.record; import org.apache.poi.hssf.eventusermodel.HSSFEventFactory; import org.apache.poi.hssf.eventusermodel.HSSFListener; -import java.util.Arrays; -import java.util.LinkedList; -import java.util.List; - /** * A stream based way to get at complete records, with * as low a memory footprint as possible. @@ -34,203 +30,209 @@ import java.util.List; * them, but this does allow for a "pull" style of coding. */ public class RecordFactoryInputStream { - private final RecordInputStream recStream; - - /** - * Have we returned all the records there are? - */ - private boolean complete = false; - - /** - * Sometimes we end up with a bunch of - * records. When we do, these should - * be returned before the next normal - * record processing occurs (i.e. before - * we check for continue records and - * return rec) - */ - private final LinkedList bonusRecords = new LinkedList(); - - /** - * The most recent record that we gave to the user - */ - private Record lastRecord = null; - /** - * The most recent DrawingRecord seen - */ - private DrawingRecord lastDrawingRecord = new DrawingRecord(); - - private int bofDepth = 0; - - private boolean lastRecordWasEOFLevelZero = false; - - private boolean includeContinueRecords = false; - - public RecordFactoryInputStream(RecordInputStream inp) { - recStream = inp; - } - - /** - * Returns the next (complete) record from the - * stream, or null if there are no more. - */ - public Record nextRecord() { - Record r = null; - - // Loop until we get something - while (r == null && !complete) { - // Are there any bonus records that we need to - // return? 
- r = getBonusRecord(); - - // If not, ask for the next real record - if (r == null) { - r = getNextRecord(); - } - } - - // All done - return r; - } - - /** - * If there are any "bonus" records, that should - * be returned before processing new ones, - * grabs the next and returns it. - * If not, returns null; - */ - private Record getBonusRecord() { - if (!bonusRecords.isEmpty()) { - return (Record) bonusRecords.removeFirst(); - } - return null; - } - - /** - * Returns the next available record, or null if - * this pass didn't return a record that's - * suitable for returning (eg was a continue record). - */ - private Record getNextRecord() { - /* - * How to recognise end of stream? - * In the best case, the underlying input stream (in) ends just after the last EOF record - * Usually however, the stream is padded with an arbitrary byte count. Excel and most apps - * reliably use zeros for padding and if this were always the case, this code could just - * skip all the (zero sized) records with sid==0. However, bug 46987 shows a file with - * non-zero padding that is read OK by Excel (Excel also fixes the padding). - * - * So to properly detect the workbook end of stream, this code has to identify the last - * EOF record. This is not so easy because the worbook bof+eof pair do not bracket the - * whole stream. The worksheets follow the workbook, but it is not easy to tell how many - * sheet sub-streams should be present. Hence we are looking for an EOF record that is not - * immediately followed by a BOF record. One extra complication is that bof+eof sub- - * streams can be nested within worksheet streams and it's not clear in these cases what - * record might follow any EOF record. So we also need to keep track of the bof/eof - * nesting level. 
- */ - - if (recStream.hasNextRecord()) { - // Grab our next record - recStream.nextRecord(); - - if (lastRecordWasEOFLevelZero && recStream.getSid() != BOFRecord.sid) { - // Normally InputStream (in) contains only zero padding after this point - complete = true; - return null; - } - - Record record = RecordFactory.createSingleRecord(recStream); - lastRecordWasEOFLevelZero = false; - - if (record instanceof BOFRecord) { - bofDepth++; - return record; - } - - if (record instanceof EOFRecord) { - bofDepth--; - if (bofDepth < 1) { - lastRecordWasEOFLevelZero = true; - } - - return record; - } - - if (record instanceof DBCellRecord) { - // Not needed by POI. Regenerated from scratch by POI when spreadsheet is written - return null; - } - - if (record instanceof RKRecord) { - return RecordFactory.convertToNumberRecord((RKRecord) record); - } - - if (record instanceof MulRKRecord) { - NumberRecord[] records = RecordFactory.convertRKRecords((MulRKRecord) record); - - List list = Arrays.asList(records); - bonusRecords.addAll(list.subList(1, list.size())); - - return records[0]; - } - - if (record.getSid() == DrawingGroupRecord.sid - && lastRecord instanceof DrawingGroupRecord) { - DrawingGroupRecord lastDGRecord = (DrawingGroupRecord) lastRecord; - lastDGRecord.join((AbstractEscherHolderRecord) record); - return null; - } else if (record.getSid() == ContinueRecord.sid) { - ContinueRecord contRec = (ContinueRecord) record; - - if (lastRecord instanceof ObjRecord || lastRecord instanceof TextObjectRecord) { - // Drawing records have a very strange continue behaviour. - //There can actually be OBJ records mixed between the continues. - lastDrawingRecord.processContinueRecord(contRec.getData()); - //we must remember the position of the continue record. 
- //in the serialization procedure the original structure of records must be preserved - if (includeContinueRecords) { - return record; - } else { - return null; - } - } else if (lastRecord instanceof DrawingGroupRecord) { - ((DrawingGroupRecord) lastRecord).processContinueRecord(contRec.getData()); - return null; - } else if (lastRecord instanceof DrawingRecord) { - ((DrawingRecord) lastRecord).processContinueRecord(contRec.getData()); - return null; - } else if (lastRecord instanceof UnknownRecord) { - //Gracefully handle records that we don't know about, - //that happen to be continued - return record; - } else if (lastRecord instanceof EOFRecord) { - // This is really odd, but excel still sometimes - // outputs a file like this all the same - return record; - } else { - throw new RecordFormatException("Unhandled Continue Record"); - } - } else { - lastRecord = record; - if (record instanceof DrawingRecord) { - lastDrawingRecord = (DrawingRecord) record; - } - - return record; - } - - } else { - // No more records - complete = true; - return null; - } - } - - /** - * Return or not ContinueRecord in nextRecord - */ - public void setIncludeContinueRecords(boolean includeContinueRecords) { - this.includeContinueRecords = includeContinueRecords; - } -} \ No newline at end of file + + private final RecordInputStream _recStream; + private final boolean _shouldIncludeContinueRecords; + + /** + * Temporarily stores a group of {@link NumberRecord}s. 
This is used when the most + * recently read underlying record is a {@link MulRKRecord} + */ + private NumberRecord[] _multipleNumberRecords; + + /** + * used to help iterate over multiple number records + */ + private int _multipleNumberRecordIndex = -1; + + /** + * The most recent record that we gave to the user + */ + private Record _lastRecord = null; + /** + * The most recent DrawingRecord seen + */ + private DrawingRecord _lastDrawingRecord = new DrawingRecord(); + + private int _bofDepth; + + private boolean _lastRecordWasEOFLevelZero; + + + /** + * @param shouldIncludeContinueRecords caller can pass false if loose + * {@link ContinueRecord}s should be skipped (this is sometimes useful in event based + * processing). + */ + public RecordFactoryInputStream(RecordInputStream inp, boolean shouldIncludeContinueRecords) { + _recStream = inp; + _shouldIncludeContinueRecords = shouldIncludeContinueRecords; + + /* + * How to recognise end of stream? + * In the best case, the underlying input stream (in) ends just after the last EOF record + * Usually however, the stream is padded with an arbitrary byte count. Excel and most apps + * reliably use zeros for padding and if this were always the case, this code could just + * skip all the (zero sized) records with sid==0. However, bug 46987 shows a file with + * non-zero padding that is read OK by Excel (Excel also fixes the padding). + * + * So to properly detect the workbook end of stream, this code has to identify the last + * EOF record. This is not so easy because the workbook bof+eof pair do not bracket the + * whole stream. The worksheets follow the workbook, but it is not easy to tell how many + * sheet sub-streams should be present. Hence we are looking for an EOF record that is not + * immediately followed by a BOF record. One extra complication is that bof+eof sub- + * streams can be nested within worksheet streams and it's not clear in these cases what + * record might follow any EOF record. 
So we also need to keep track of the bof/eof + * nesting level. + */ + _bofDepth=0; + _lastRecordWasEOFLevelZero = false; + } + + /** + * Returns the next (complete) record from the + * stream, or null if there are no more. + */ + public Record nextRecord() { + Record r; + r = getNextMultipleNumberRecord(); + if (r != null) { + // found a NumberRecord (expanded from a recent MULRK record) + return r; + } + while (true) { + if (!_recStream.hasNextRecord()) { + // recStream is exhausted; + return null; + } + + // step underlying RecordInputStream to the next record + _recStream.nextRecord(); + + if (_lastRecordWasEOFLevelZero) { + // Potential place for ending the workbook stream + // Check that the next record is not BOFRecord(0x0809) + // Normally the input stream contains only zero padding after the last EOFRecord, + // but bug 46987 suggests that the padding may be garbage. + // This code relies on the padding bytes not starting with BOFRecord.sid + if (_recStream.getSid() != BOFRecord.sid) { + return null; + } + // else - another sheet substream starting here + } + + r = readNextRecord(); + if (r == null) { + // some record types may get skipped (e.g. DBCellRecord and ContinueRecord) + continue; + } + return r; + } + } + + /** + * @return the next {@link NumberRecord} from the multiple record group as expanded from + * a recently read {@link MulRKRecord}. null if not present. + */ + private NumberRecord getNextMultipleNumberRecord() { + if (_multipleNumberRecords != null) { + int ix = _multipleNumberRecordIndex; + if (ix < _multipleNumberRecords.length) { + NumberRecord result = _multipleNumberRecords[ix]; + _multipleNumberRecordIndex = ix + 1; + return result; + } + _multipleNumberRecordIndex = -1; + _multipleNumberRecords = null; + } + return null; + } + + /** + * @return the next available record, or null if + * this pass didn't return a record that's + * suitable for returning (eg was a continue record). 
+ */ + private Record readNextRecord() { + + Record record = RecordFactory.createSingleRecord(_recStream); + _lastRecordWasEOFLevelZero = false; + + if (record instanceof BOFRecord) { + _bofDepth++; + return record; + } + + if (record instanceof EOFRecord) { + _bofDepth--; + if (_bofDepth < 1) { + _lastRecordWasEOFLevelZero = true; + } + + return record; + } + + if (record instanceof DBCellRecord) { + // Not needed by POI. Regenerated from scratch by POI when spreadsheet is written + return null; + } + + if (record instanceof RKRecord) { + return RecordFactory.convertToNumberRecord((RKRecord) record); + } + + if (record instanceof MulRKRecord) { + NumberRecord[] records = RecordFactory.convertRKRecords((MulRKRecord) record); + + _multipleNumberRecords = records; + _multipleNumberRecordIndex = 1; + return records[0]; + } + + if (record.getSid() == DrawingGroupRecord.sid + && _lastRecord instanceof DrawingGroupRecord) { + DrawingGroupRecord lastDGRecord = (DrawingGroupRecord) _lastRecord; + lastDGRecord.join((AbstractEscherHolderRecord) record); + return null; + } + if (record.getSid() == ContinueRecord.sid) { + ContinueRecord contRec = (ContinueRecord) record; + + if (_lastRecord instanceof ObjRecord || _lastRecord instanceof TextObjectRecord) { + // Drawing records have a very strange continue behaviour. + //There can actually be OBJ records mixed between the continues. + _lastDrawingRecord.processContinueRecord(contRec.getData()); + //we must remember the position of the continue record. 
+ //in the serialization procedure the original structure of records must be preserved + if (_shouldIncludeContinueRecords) { + return record; + } + return null; + } + if (_lastRecord instanceof DrawingGroupRecord) { + ((DrawingGroupRecord) _lastRecord).processContinueRecord(contRec.getData()); + return null; + } + if (_lastRecord instanceof DrawingRecord) { + ((DrawingRecord) _lastRecord).processContinueRecord(contRec.getData()); + return null; + } + if (_lastRecord instanceof UnknownRecord) { + //Gracefully handle records that we don't know about, + //that happen to be continued + return record; + } + if (_lastRecord instanceof EOFRecord) { + // This is really odd, but excel still sometimes + // outputs a file like this all the same + return record; + } + throw new RecordFormatException("Unhandled Continue Record"); + } + _lastRecord = record; + if (record instanceof DrawingRecord) { + _lastDrawingRecord = (DrawingRecord) record; + } + return record; + } +} -- 2.39.5