file="${main.src.test}/org/apache/poi/hwpf/data"/>
<sysproperty key="HPSF.testdata.path"
file="${main.src.test}/org/apache/poi/hpsf/data"/>
+ <sysproperty key="POIFS.testdata.path"
+ file="${main.src.test}/org/apache/poi/poifs/data"/>
<sysproperty key="java.awt.headless" value="true"/>
<formatter type="plain"/>
<formatter type="xml"/>
file="${main.src.test}/org/apache/poi/hpsf/data"/>
<sysproperty key="HWPF.testdata.path"
file="${scratchpad.src.test}/org/apache/poi/hwpf/data"/>
+ <sysproperty key="POIFS.testdata.path"
+ file="${main.src.test}/org/apache/poi/poifs/data"/>
<sysproperty key="java.awt.headless" value="true"/>
<formatter type="plain" usefile="no"/>
<batchtest todir="${main.reports.test}">
<sysproperty key="HWPF.testdata.path" file="${scratchpad.src.test}/org/apache/poi/hwpf/data"/>
<sysproperty key="HSMF.testdata.path" file="${scratchpad.src.test}/org/apache/poi/hsmf/data"/>
<sysproperty key="HDGF.testdata.path" file="${scratchpad.src.test}/org/apache/poi/hdgf/data"/>
+ <sysproperty key="POIFS.testdata.path" file="${main.src.test}/org/apache/poi/poifs/data"/>
<sysproperty key="java.awt.headless" value="true"/>
<formatter type="plain" usefile="no"/>
<formatter type="xml"/>
<classpath refid="test.classpath"/>
<sysproperty key="HSSF.testdata.path" file="${main.src.test}/org/apache/poi/hssf/data"/>
<sysproperty key="HPSF.testdata.path" file="${main.src.test}/org/apache/poi/hpsf/data"/>
+ <sysproperty key="POIFS.testdata.path" file="${main.src.test}/org/apache/poi/poifs/data"/>
<sysproperty key="java.awt.headless" value="true"/>
<formatter type="plain" usefile="no"/>
<test name="${testcase}"/>
<sysproperty key="HSLF.testdata.path" file="${scratchpad.src.test}/org/apache/poi/hslf/data"/>
<sysproperty key="HSMF.testdata.path" file="${scratchpad.src.test}/org/apache/poi/hsmf/data"/>
<sysproperty key="HDGF.testdata.path" file="${scratchpad.src.test}/org/apache/poi/hdgf/data"/>
+ <sysproperty key="POIFS.testdata.path" file="${main.src.test}/org/apache/poi/poifs/data"/>
<sysproperty key="java.awt.headless" value="true"/>
<formatter type="plain"/>
<formatter type="xml"/>
<sysproperty key="HSLF.testdata.path" file="${scratchpad.src.test}/org/apache/poi/hslf/data"/>
<sysproperty key="HSMF.testdata.path" file="${scratchpad.src.test}/org/apache/poi/hsmf/data"/>
<sysproperty key="HDGF.testdata.path" file="${scratchpad.src.test}/org/apache/poi/hdgf/data"/>
+ <sysproperty key="POIFS.testdata.path" file="${main.src.test}/org/apache/poi/poifs/data"/>
<sysproperty key="java.awt.headless" value="true"/>
<sysproperty key="java.awt.headless" value="true"/>
<formatter type="plain" usefile="no"/>
<!-- Don't forget to update status.xml too! -->
<release version="3.0.3-beta1" date="2008-04-??">
+ <action dev="POI-DEVELOPERS" type="add">Improve how POIFS works with directory entries, and update HWPFDocument to support reading an embeded word document</action>
<action dev="POI-DEVELOPERS" type="add">Initial support for getting and changing chart and series titles</action>
<action dev="POI-DEVELOPERS" type="add">Implement a proxy HSSFListener which tracks the format records, and lets you lookup the format string for a given cell. Convert the xls to csv example to use it</action>
<action dev="POI-DEVELOPERS" type="fix">44792 - fixed encode/decode problems in ExternalNameRecord and CRNRecord.</action>
<!-- Don't forget to update changes.xml too! -->
<changes>
<release version="3.0.3-beta1" date="2008-04-??">
+ <action dev="POI-DEVELOPERS" type="add">Improve how POIFS works with directory entries, and update HWPFDocument to support reading an embeded word document</action>
<action dev="POI-DEVELOPERS" type="add">Initial support for getting and changing chart and series titles</action>
<action dev="POI-DEVELOPERS" type="add">Implement a proxy HSSFListener which tracks the format records, and lets you lookup the format string for a given cell. Convert the xls to csv example to use it</action>
<action dev="POI-DEVELOPERS" type="fix">44792 - fixed encode/decode problems in ExternalNameRecord and CRNRecord.</action>
import org.apache.poi.hpsf.PropertySetFactory;
import org.apache.poi.hpsf.SummaryInformation;
import org.apache.poi.poifs.filesystem.DirectoryEntry;
+import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.DocumentEntry;
import org.apache.poi.poifs.filesystem.DocumentInputStream;
import org.apache.poi.poifs.filesystem.Entry;
protected DocumentSummaryInformation dsInf;
/** The open POIFS FileSystem that contains our document */
protected POIFSFileSystem filesystem;
+ /** The directory that our document lives in */
+ protected DirectoryNode directory;
/** For our own logging use */
protected POILogger logger = POILogFactory.getLogger(this.getClass());
/* Have the property streams been read yet? (Only done on-demand) */
protected boolean initialized = false;
+
+
+ protected POIDocument(DirectoryNode dir, POIFSFileSystem fs) {
+ this.filesystem = fs;
+ this.directory = dir;
+ }
+ protected POIDocument(POIFSFileSystem fs) {
+ this(fs.getRoot(), fs);
+ }
/**
* Fetch the Document Summary Information of the document
DocumentInputStream dis;
try {
// Find the entry, and get an input stream for it
- dis = filesystem.createDocumentInputStream(setName);
+ dis = directory.createDocumentInputStream(setName);
} catch(IOException ie) {
// Oh well, doesn't exist
logger.log(POILogger.WARN, "Error getting property set with name " + setName + "\n" + ie);
protected HSSFWorkbook( Workbook book )
{
+ super(null, null);
workbook = book;
sheets = new ArrayList( INITIAL_CAPACITY );
names = new ArrayList( INITIAL_CAPACITY );
public HSSFWorkbook(POIFSFileSystem fs, boolean preserveNodes)
throws IOException
{
+ super(fs);
this.preserveNodes = preserveNodes;
- this.filesystem = fs;
// If we're not preserving nodes, don't track the
// POIFS any more
{
return _path;
}
+
+ /**
+ * open a document in the directory's entry's list of entries
+ *
+ * @param documentName the name of the document to be opened
+ *
+ * @return a newly opened DocumentInputStream
+ *
+ * @exception IOException if the document does not exist or the
+ * name is that of a DirectoryEntry
+ */
+
+ public DocumentInputStream createDocumentInputStream(
+ final String documentName)
+ throws IOException
+ {
+ Entry document = getEntry(documentName);
+
+ if (!document.isDocumentEntry())
+ {
+ throw new IOException("Entry '" + documentName
+ + "' is not a DocumentEntry");
+ }
+ return new DocumentInputStream(( DocumentEntry ) document);
+ }
/**
* create a new DocumentEntry
{
return getRoot().createDirectory(name);
}
-
+
/**
* Write the filesystem out
*
* @return the root entry
*/
- public DirectoryEntry getRoot()
+ public DirectoryNode getRoot()
{
if (_root == null)
{
final String documentName)
throws IOException
{
- Entry document = getRoot().getEntry(documentName);
-
- if (!document.isDocumentEntry())
- {
- throw new IOException("Entry '" + documentName
- + "' is not a DocumentEntry");
- }
- return new DocumentInputStream(( DocumentEntry ) document);
+ return getRoot().createDocumentInputStream(documentName);
}
/**
private PointerFactory ptrFactory;
public HDGFDiagram(POIFSFileSystem fs) throws IOException {
- filesystem = fs;
+ super(fs);
DocumentEntry docProps =
(DocumentEntry)filesystem.getRoot().getEntry("VisioDocument");
*/
public HSLFSlideShow(POIFSFileSystem filesystem) throws IOException
{
- this.filesystem = filesystem;
+ super(filesystem);
// First up, grab the "Current User" stream
// We need this before we can detect Encrypted Documents
import java.util.Iterator;
import org.apache.poi.POIDocument;
+import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.poifs.filesystem.DocumentEntry;
import org.apache.poi.poifs.common.POIFSConstants;
protected HWPFDocument()
{
-
+ super(null, null);
}
/**
//do Ole stuff
this( verifyAndBuildPOIFS(istream) );
}
-
+
/**
* This constructor loads a Word document from a POIFSFileSystem
*
* in POIFSFileSystem.
*/
public HWPFDocument(POIFSFileSystem pfilesystem) throws IOException
+ {
+ this(pfilesystem.getRoot(), pfilesystem);
+ }
+
+ /**
+ * This constructor loads a Word document from a specific point
+ * in a POIFSFileSystem, probably not the default.
+ * Used typically to open embeded documents.
+ *
+ * @param pfilesystem The POIFSFileSystem that contains the Word document.
+ * @throws IOException If there is an unexpected IOException from the passed
+ * in POIFSFileSystem.
+ */
+ public HWPFDocument(DirectoryNode directory, POIFSFileSystem pfilesystem) throws IOException
{
// Sort out the hpsf properties
- filesystem = pfilesystem;
+ super(directory, pfilesystem);
readProperties();
// read in the main stream.
- DocumentEntry documentProps =
- (DocumentEntry)filesystem.getRoot().getEntry("WordDocument");
+ DocumentEntry documentProps = (DocumentEntry)
+ directory.getEntry("WordDocument");
_mainStream = new byte[documentProps.getSize()];
- filesystem.createDocumentInputStream("WordDocument").read(_mainStream);
+
+ directory.createDocumentInputStream("WordDocument").read(_mainStream);
// use the fib to determine the name of the table stream.
_fib = new FileInformationBlock(_mainStream);
DocumentEntry tableProps;
try {
tableProps =
- (DocumentEntry)filesystem.getRoot().getEntry(name);
+ (DocumentEntry)directory.getEntry(name);
} catch(FileNotFoundException fnfe) {
throw new IllegalStateException("Table Stream '" + name + "' wasn't found - Either the document is corrupt, or is Word95 (or earlier)");
}
// read in the table stream.
_tableStream = new byte[tableProps.getSize()];
- filesystem.createDocumentInputStream(name).read(_tableStream);
+ directory.createDocumentInputStream(name).read(_tableStream);
_fib.fillVariableFields(_mainStream, _tableStream);
try
{
DocumentEntry dataProps =
- (DocumentEntry) filesystem.getRoot().getEntry("Data");
+ (DocumentEntry)directory.getEntry("Data");
_dataStream = new byte[dataProps.getSize()];
filesystem.createDocumentInputStream("Data").read(_dataStream);
}
import org.apache.poi.hwpf.model.TextPiece;
import org.apache.poi.hwpf.usermodel.Paragraph;
import org.apache.poi.hwpf.usermodel.Range;
+import org.apache.poi.poifs.filesystem.DirectoryNode;
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import junit.framework.TestCase;
private WordExtractor extractor;
// Corrupted document - can't do paragraph based stuff
private WordExtractor extractor2;
+ // A word doc embeded in an excel file
+ private String filename3;
protected void setUp() throws Exception {
String dirname = System.getProperty("HWPF.testdata.path");
+ String pdirname = System.getProperty("POIFS.testdata.path");
String filename = dirname + "/test2.doc";
String filename2 = dirname + "/test.doc";
+ filename3 = pdirname + "/excel_with_embeded.xls";
extractor = new WordExtractor(new FileInputStream(filename));
extractor2 = new WordExtractor(new FileInputStream(filename2));
String text = extractor.getTextFromPieces();
assertEquals(p_text1_block, text);
}
+
+
+ /**
+ * Test that we can get data from an
+ * embeded word document
+ * @throws Exception
+ */
+ public void testExtractFromEmbeded() throws Exception {
+ POIFSFileSystem fs = new POIFSFileSystem(new FileInputStream(filename3));
+ DirectoryNode dir = (DirectoryNode)
+ fs.getRoot().getEntry("MBD03F25D8D");
+ // Should have WordDocument and 1Table
+ assertNotNull(dir.getEntry("1Table"));
+ assertNotNull(dir.getEntry("WordDocument"));
+
+ HWPFDocument doc = new HWPFDocument(dir, fs);
+ WordExtractor extractor3 = new WordExtractor(doc);
+
+ assertNotNull(extractor3.getText());
+ assertTrue(extractor3.getText().length() > 20);
+ }
}