<!-- Don't forget to update status.xml too! -->
<release version="3.0.3-beta1" date="2008-04-??">
+ <action dev="POI-DEVELOPERS" type="add">Update HSLFSlideShow and HSSFWorkbook to take advantage of POIFS updates, and allow reading embedded documents</action>
<action dev="POI-DEVELOPERS" type="add">Improve how POIFS works with directory entries, and update HWPFDocument to support reading an embeded word document</action>
<action dev="POI-DEVELOPERS" type="add">Initial support for getting and changing chart and series titles</action>
<action dev="POI-DEVELOPERS" type="add">Implement a proxy HSSFListener which tracks the format records, and lets you lookup the format string for a given cell. Convert the xls to csv example to use it</action>
<!-- Don't forget to update changes.xml too! -->
<changes>
<release version="3.0.3-beta1" date="2008-04-??">
+ <action dev="POI-DEVELOPERS" type="add">Update HSLFSlideShow and HSSFWorkbook to take advantage of POIFS updates, and allow reading embedded documents</action>
<action dev="POI-DEVELOPERS" type="add">Improve how POIFS works with directory entries, and update HWPFDocument to support reading an embeded word document</action>
<action dev="POI-DEVELOPERS" type="add">Initial support for getting and changing chart and series titles</action>
<action dev="POI-DEVELOPERS" type="add">Implement a proxy HSSFListener which tracks the format records, and lets you lookup the format string for a given cell. Convert the xls to csv example to use it</action>
* @see org.apache.poi.poifs.filesystem.POIFSFileSystem
* @exception IOException if the stream cannot be read
*/
-
public HSSFWorkbook(POIFSFileSystem fs, boolean preserveNodes)
throws IOException
{
- super(fs);
+ this(fs.getRoot(), fs, preserveNodes);
+ }
+
+ /**
+ * given a POI POIFSFileSystem object, and a specific directory
+ * within it, read in its Workbook and populate the high and
+ * low level models. If you're reading in a workbook...start here.
+ *
+ * @param directory the POI filesystem directory to process from
+ * @param fs the POI filesystem that contains the Workbook stream.
+ * @param preserveNodes whether to preserve other nodes, such as
+ * macros. This takes more memory, so only say yes if you
+ * need to. If set, will store all of the POIFSFileSystem
+ * in memory
+ * @see org.apache.poi.poifs.filesystem.POIFSFileSystem
+ * @exception IOException if the stream cannot be read
+ */
+ public HSSFWorkbook(DirectoryNode directory, POIFSFileSystem fs, boolean preserveNodes)
+ throws IOException
+ {
+ super(directory, fs);
this.preserveNodes = preserveNodes;
// If we're not preserving nodes, don't track the
// POIFS any more
if(! preserveNodes) {
this.filesystem = null;
+ this.directory = null;
}
sheets = new ArrayList(INITIAL_CAPACITY);
// put theirs in one called "WORKBOOK"
String workbookName = "Workbook";
try {
- fs.getRoot().getEntry(workbookName);
+ directory.getEntry(workbookName);
// Is the default name
} catch(FileNotFoundException fe) {
// Try the upper case form
try {
workbookName = "WORKBOOK";
- fs.getRoot().getEntry(workbookName);
+ directory.getEntry(workbookName);
} catch(FileNotFoundException wfe) {
// Doesn't contain it in either form
throw new IllegalArgumentException("The supplied POIFSFileSystem contained neither a 'Workbook' entry, nor a 'WORKBOOK' entry. Is it really an excel file?");
// Grab the data from the workbook stream, however
// it happens to be spelt.
- InputStream stream = fs.createDocumentInputStream(workbookName);
+ InputStream stream = directory.createDocumentInputStream(workbookName);
EventRecordFactory factory = new EventRecordFactory();
--- /dev/null
+/* ====================================================================
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+==================================================================== */
+
+package org.apache.poi.poifs.dev;
+
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.util.Iterator;
+
+import org.apache.poi.poifs.filesystem.DirectoryEntry;
+import org.apache.poi.poifs.filesystem.DirectoryNode;
+import org.apache.poi.poifs.filesystem.DocumentEntry;
+import org.apache.poi.poifs.filesystem.DocumentNode;
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;
+
+/**
+ * A lister of the entries in POIFS files.
+ *
+ * Much simpler than {@link POIFSViewer}
+ */
+public class POIFSLister {
+    /**
+     * Display the entries of multiple POIFS files
+     *
+     * @param args the names of the files to be displayed
+     * @throws IOException if one of the files cannot be opened or read
+     */
+    public static void main(final String args[]) throws IOException {
+        if (args.length == 0)
+        {
+            System.err.println("Must specify at least one file to view");
+            System.exit(1);
+        }
+
+        for (int j = 0; j < args.length; j++)
+        {
+            viewFile(args[ j ]);
+        }
+    }
+
+    /**
+     * Opens the named file as a POIFS filesystem and prints its
+     *  directory tree, starting from the root, to standard output.
+     *
+     * @param filename the name of the file to list
+     * @throws IOException if the file cannot be opened or read
+     */
+    public static void viewFile(final String filename) throws IOException
+    {
+        FileInputStream in = new FileInputStream(filename);
+        POIFSFileSystem fs;
+        try {
+            fs = new POIFSFileSystem(in);
+        } finally {
+            // The filesystem is built from the stream inside the
+            //  constructor, so the file handle can be released here
+            //  rather than being leaked once per file listed.
+            in.close();
+        }
+        displayDirectory(fs.getRoot(), "");
+    }
+
+    /**
+     * Prints the name of the given directory, followed by all of its
+     *  entries. Sub-directories are listed recursively, with the
+     *  indent growing at each level.
+     *
+     * @param dir the directory to list
+     * @param indent the prefix to print before this directory's name
+     */
+    public static void displayDirectory(DirectoryNode dir, String indent) {
+        System.out.println(indent + dir.getName() + " -");
+        String newIndent = indent + " ";
+
+        for(Iterator it = dir.getEntries(); it.hasNext(); ) {
+            Object entry = it.next();
+            if(entry instanceof DirectoryNode) {
+                displayDirectory((DirectoryNode)entry, newIndent);
+            } else {
+                DocumentNode doc = (DocumentNode)entry;
+                String name = doc.getName();
+                // Some entry names start with a low control character.
+                // Show the readable part first, then the full name with
+                //  the control character spelt out. The length check
+                //  stops an empty name from throwing on charAt(0).
+                if(name.length() > 0 && name.charAt(0) < 10) {
+                    String altname = "(0x0" + (int)name.charAt(0) + ")" + name.substring(1);
+                    name = name.substring(1) + " <" + altname + ">";
+                }
+                System.out.println(newIndent + name);
+            }
+        }
+    }
+}
\ No newline at end of file
import org.apache.poi.hslf.record.UserEditAtom;
import org.apache.poi.hslf.usermodel.ObjectData;
import org.apache.poi.hslf.usermodel.PictureData;
+import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.DocumentEntry;
import org.apache.poi.poifs.filesystem.DocumentInputStream;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
*/
public HSLFSlideShow(POIFSFileSystem filesystem) throws IOException
{
- super(filesystem);
+ this(filesystem.getRoot(), filesystem);
+ }
+
+ /**
+ * Constructs a Powerpoint document from a specific point in a
+ * POIFS Filesystem. Parses the document and places all the
+ * important stuff into data structures.
+ *
+ * @param dir the POIFS directory to read from
+ * @param filesystem the POIFS FileSystem to read from
+ * @throws IOException if there is a problem while parsing the document.
+ */
+ public HSLFSlideShow(DirectoryNode dir, POIFSFileSystem filesystem) throws IOException
+ {
+ super(dir, filesystem);
// First up, grab the "Current User" stream
// We need this before we can detect Encrypted Documents
{
// Get the main document stream
DocumentEntry docProps =
- (DocumentEntry)filesystem.getRoot().getEntry("PowerPoint Document");
+ (DocumentEntry)directory.getEntry("PowerPoint Document");
// Grab the document stream
_docstream = new byte[docProps.getSize()];
- filesystem.createDocumentInputStream("PowerPoint Document").read(_docstream);
+ directory.createDocumentInputStream("PowerPoint Document").read(_docstream);
}
/**
*/
private void readCurrentUserStream() {
try {
- currentUser = new CurrentUserAtom(filesystem);
+ currentUser = new CurrentUserAtom(directory);
} catch(IOException ie) {
logger.log(POILogger.ERROR, "Error finding Current User Atom:\n" + ie);
currentUser = new CurrentUserAtom();
byte[] pictstream;
try {
- DocumentEntry entry = (DocumentEntry)filesystem.getRoot().getEntry("Pictures");
+ DocumentEntry entry = (DocumentEntry)directory.getEntry("Pictures");
pictstream = new byte[entry.getSize()];
- DocumentInputStream is = filesystem.createDocumentInputStream("Pictures");
+ DocumentInputStream is = directory.createDocumentInputStream("Pictures");
is.read(pictstream);
} catch (FileNotFoundException e){
// Silently catch exceptions if the presentation doesn't
* Find the Current User in the filesystem, and create from that
*/
public CurrentUserAtom(POIFSFileSystem fs) throws IOException {
+ this(fs.getRoot());
+ }
+ /**
+ * Find the Current User in the filesystem, and create from that
+ */
+ public CurrentUserAtom(DirectoryNode dir) throws IOException {
// Decide how big it is
DocumentEntry docProps =
- (DocumentEntry)fs.getRoot().getEntry("Current User");
+ (DocumentEntry)dir.getEntry("Current User");
_contents = new byte[docProps.getSize()];
// Check it's big enough - if it's not at least 28 bytes long, then
}
// Grab the contents
- InputStream in = fs.createDocumentInputStream("Current User");
+ InputStream in = dir.createDocumentInputStream("Current User");
in.read(_contents);
// Set everything up
package org.apache.poi.hslf.extractor;
+import java.io.FileInputStream;
+
+import org.apache.poi.hslf.HSLFSlideShow;
+import org.apache.poi.poifs.filesystem.DirectoryNode;
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;
+
import junit.framework.TestCase;
/**
private PowerPointExtractor ppe2;
/** Where to go looking for our test files */
private String dirname;
+ /** Where our embedded files live */
+ private String pdirname;
public TextExtractor() throws Exception {
dirname = System.getProperty("HSLF.testdata.path");
ppe = new PowerPointExtractor(filename);
String filename2 = dirname + "/with_textbox.ppt";
ppe2 = new PowerPointExtractor(filename2);
+
+ pdirname = System.getProperty("POIFS.testdata.path");
}
public void testReadSheetText() throws Exception {
char[] expC = exp.toCharArray();
char[] actC = act.toCharArray();
for(int i=0; i<expC.length; i++) {
- System.out.println(i + "\t" + expC[i] + " " + actC[i]);
- assertEquals(expC[i],actC[i]);
+ assertEquals("Char " + i, expC[i], actC[i]);
}
assertEquals(exp,act);
}
+
+    /**
+     * Test that we can get the text of two different powerpoint
+     *  documents, each read from its own directory inside an
+     *  excel file.
+     */
+    public void testExtractFromEmbeded() throws Exception {
+        String filename3 = pdirname + "/excel_with_embeded.xls";
+        FileInputStream in = new FileInputStream(filename3);
+        POIFSFileSystem fs;
+        try {
+            fs = new POIFSFileSystem(in);
+        } finally {
+            // The filesystem is built inside the constructor, so
+            //  don't leave the file handle open for the rest of the run
+            in.close();
+        }
+        HSLFSlideShow ss;
+
+        DirectoryNode dirA = (DirectoryNode)
+            fs.getRoot().getEntry("MBD0000A3B6");
+        DirectoryNode dirB = (DirectoryNode)
+            fs.getRoot().getEntry("MBD0000A3B3");
+
+        // Both directories should hold a powerpoint stream
+        assertNotNull(dirA.getEntry("PowerPoint Document"));
+        assertNotNull(dirB.getEntry("PowerPoint Document"));
+
+        // Check the first file
+        ss = new HSLFSlideShow(dirA, fs);
+        ppe = new PowerPointExtractor(ss);
+        assertEquals("Sample PowerPoint file\nThis is the 1st file\nNot much too it\n",
+            ppe.getText(true, false)
+        );
+
+        // And the second
+        ss = new HSLFSlideShow(dirB, fs);
+        ppe = new PowerPointExtractor(ss);
+        assertEquals("Sample PowerPoint file\nThis is the 2nd file\nNot much too it either\n",
+            ppe.getText(true, false)
+        );
+    }
+
+ /**
+ * A powerpoint file with embedded powerpoint files
+ * TODO - figure out how to handle this, as ppt
+ * appears to embed not as ole2 streams
+ *
+ * The name does not start with "test", which is presumably what
+ * keeps the test runner from picking it up - confirm against
+ * the runner in use.
+ */
+ public void DISABLEDtestExtractFromOwnEmbeded() throws Exception {
+ String filename3 = pdirname + "/ppt_with_embeded.ppt";
+ POIFSFileSystem fs = new POIFSFileSystem(
+ new FileInputStream(filename3)
+ );
+ HSLFSlideShow ss;
+
+ // NOTE(review): these are the same entry names as are used for
+ // excel_with_embeded.xls - confirm they exist in the ppt file
+ // before enabling this test
+ DirectoryNode dirA = (DirectoryNode)
+ fs.getRoot().getEntry("MBD0000A3B6");
+ DirectoryNode dirB = (DirectoryNode)
+ fs.getRoot().getEntry("MBD0000A3B3");
+
+ assertNotNull(dirA.getEntry("PowerPoint Document"));
+ assertNotNull(dirB.getEntry("PowerPoint Document"));
+
+ // Check the first file
+ ss = new HSLFSlideShow(dirA, fs);
+ ppe = new PowerPointExtractor(ss);
+ assertEquals("Sample PowerPoint file\nThis is the 1st file\nNot much too it\n",
+ ppe.getText(true, false)
+ );
+
+ // And the second
+ ss = new HSLFSlideShow(dirB, fs);
+ ppe = new PowerPointExtractor(ss);
+ assertEquals("Sample PowerPoint file\nThis is the 2nd file\nNot much too it either\n",
+ ppe.getText(true, false)
+ );
+
+
+ // Check the master doc two ways
+ // First by passing the root directory in explicitly
+ ss = new HSLFSlideShow(fs.getRoot(), fs);
+ ppe = new PowerPointExtractor(ss);
+ assertEquals("I have embeded files in me\n",
+ ppe.getText(true, false)
+ );
+
+ // Then via the filesystem-only constructor
+ ss = new HSLFSlideShow(fs);
+ ppe = new PowerPointExtractor(ss);
+ assertEquals("I have embeded files in me\n",
+ ppe.getText(true, false)
+ );
+ }
}
package org.apache.poi.hwpf.extractor;
import java.io.FileInputStream;
-import java.util.Iterator;
+
+import junit.framework.TestCase;
import org.apache.poi.hwpf.HWPFDocument;
-import org.apache.poi.hwpf.model.TextPiece;
-import org.apache.poi.hwpf.usermodel.Paragraph;
-import org.apache.poi.hwpf.usermodel.Range;
import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
-import junit.framework.TestCase;
-
/**
* Test the different routes to extracting text
*
/**
- * Test that we can get data from an
- * embeded word document
+ * Test that we can get data from two different
+ * embedded word documents
* @throws Exception
*/
public void testExtractFromEmbeded() throws Exception {
POIFSFileSystem fs = new POIFSFileSystem(new FileInputStream(filename3));
- DirectoryNode dir = (DirectoryNode)
- fs.getRoot().getEntry("MBD03F25D8D");
+ HWPFDocument doc;
+ WordExtractor extractor3;
+
+ DirectoryNode dirA = (DirectoryNode)
+ fs.getRoot().getEntry("MBD0000A3B7");
+ DirectoryNode dirB = (DirectoryNode)
+ fs.getRoot().getEntry("MBD0000A3B2");
+
// Should have WordDocument and 1Table
- assertNotNull(dir.getEntry("1Table"));
- assertNotNull(dir.getEntry("WordDocument"));
+ assertNotNull(dirA.getEntry("1Table"));
+ assertNotNull(dirA.getEntry("WordDocument"));
+
+ assertNotNull(dirB.getEntry("1Table"));
+ assertNotNull(dirB.getEntry("WordDocument"));
+
+ // Check each in turn
+ doc = new HWPFDocument(dirA, fs);
+ extractor3 = new WordExtractor(doc);
+
+ assertNotNull(extractor3.getText());
+ assertTrue(extractor3.getText().length() > 20);
+ assertEquals("I am a sample document\r\nNot much on me\r\nI am document 1\r\n",
+ extractor3.getText());
+ assertEquals("Sample Doc 1", extractor3.getSummaryInformation().getTitle());
+ assertEquals("Sample Test", extractor3.getSummaryInformation().getSubject());
+
- HWPFDocument doc = new HWPFDocument(dir, fs);
- WordExtractor extractor3 = new WordExtractor(doc);
+ doc = new HWPFDocument(dirB, fs);
+ extractor3 = new WordExtractor(doc);
assertNotNull(extractor3.getText());
assertTrue(extractor3.getText().length() > 20);
+ assertEquals("I am another sample document\r\nNot much on me\r\nI am document 2\r\n",
+ extractor3.getText());
+ assertEquals("Sample Doc 2", extractor3.getSummaryInformation().getTitle());
+ assertEquals("Another Sample Test", extractor3.getSummaryInformation().getSubject());
}
}
package org.apache.poi.hssf.extractor;
+import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import junit.framework.TestCase;
import org.apache.poi.hssf.HSSFTestDataSamples;
+import org.apache.poi.hssf.usermodel.HSSFWorkbook;
+import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
/**
*
assertEquals("Sheet1\nUPPER(\"xyz\")\nSheet2\nSheet3\n", extractor.getText());
}
+
+    /**
+     * Excel workbooks embedded in a non-excel file. Each workbook
+     *  is read from its own directory below the "ObjectPool"
+     *  directory of a word document.
+     */
+    public void testWithEmbeded() throws Exception {
+        String pdirname = System.getProperty("POIFS.testdata.path");
+        String filename = pdirname + "/word_with_embeded.doc";
+        FileInputStream in = new FileInputStream(filename);
+        POIFSFileSystem fs;
+        try {
+            fs = new POIFSFileSystem(in);
+        } finally {
+            // The filesystem is built inside the constructor, so
+            //  don't leave the file handle open for the rest of the run
+            in.close();
+        }
+
+        DirectoryNode objPool = (DirectoryNode)
+            fs.getRoot().getEntry("ObjectPool");
+        DirectoryNode dirA = (DirectoryNode)
+            objPool.getEntry("_1269427460");
+        DirectoryNode dirB = (DirectoryNode)
+            objPool.getEntry("_1269427461");
+
+        HSSFWorkbook wbA = new HSSFWorkbook(dirA, fs, true);
+        HSSFWorkbook wbB = new HSSFWorkbook(dirB, fs, true);
+
+        ExcelExtractor exA = new ExcelExtractor(wbA);
+        ExcelExtractor exB = new ExcelExtractor(wbB);
+
+        assertEquals("Sheet1\nTest excel file\nThis is the first file\nSheet2\nSheet3\n",
+            exA.getText());
+        assertEquals("Sample Excel", exA.getSummaryInformation().getTitle());
+
+        assertEquals("Sheet1\nAnother excel file\nThis is the second file\nSheet2\nSheet3\n",
+            exB.getText());
+        assertEquals("Sample Excel 2", exB.getSummaryInformation().getTitle());
+    }
+
+    /**
+     * Excel embedded in excel. Checks the two embedded workbooks,
+     *  and then the enclosing workbook itself.
+     */
+    public void testWithEmbededInOwn() throws Exception {
+        String pdirname = System.getProperty("POIFS.testdata.path");
+        String filename = pdirname + "/excel_with_embeded.xls";
+        FileInputStream in = new FileInputStream(filename);
+        POIFSFileSystem fs;
+        try {
+            fs = new POIFSFileSystem(in);
+        } finally {
+            // The filesystem is built inside the constructor, so
+            //  don't leave the file handle open for the rest of the run
+            in.close();
+        }
+
+        DirectoryNode dirA = (DirectoryNode)
+            fs.getRoot().getEntry("MBD0000A3B5");
+        DirectoryNode dirB = (DirectoryNode)
+            fs.getRoot().getEntry("MBD0000A3B4");
+
+        HSSFWorkbook wbA = new HSSFWorkbook(dirA, fs, true);
+        HSSFWorkbook wbB = new HSSFWorkbook(dirB, fs, true);
+
+        ExcelExtractor exA = new ExcelExtractor(wbA);
+        ExcelExtractor exB = new ExcelExtractor(wbB);
+
+        assertEquals("Sheet1\nTest excel file\nThis is the first file\nSheet2\nSheet3\n",
+            exA.getText());
+        assertEquals("Sample Excel", exA.getSummaryInformation().getTitle());
+
+        assertEquals("Sheet1\nAnother excel file\nThis is the second file\nSheet2\nSheet3\n",
+            exB.getText());
+        assertEquals("Sample Excel 2", exB.getSummaryInformation().getTitle());
+
+        // And the base file too
+        ExcelExtractor ex = new ExcelExtractor(fs);
+        assertEquals("Sheet1\nI have lots of embeded files in me\nSheet2\nSheet3\n",
+            ex.getText());
+        assertEquals("Excel With Embeded", ex.getSummaryInformation().getTitle());
+    }
}