git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@959346 13f79535-47bb-0310-9956-ffa450edef68tags/REL_3_7_BETA2
@@ -17,26 +17,43 @@ | |||
package org.apache.poi.hwpf; | |||
import java.io.InputStream; | |||
import java.io.ByteArrayInputStream; | |||
import java.io.FileInputStream; | |||
import java.io.FileNotFoundException; | |||
import java.io.PushbackInputStream; | |||
import java.io.IOException; | |||
import java.io.InputStream; | |||
import java.io.OutputStream; | |||
import java.io.ByteArrayInputStream; | |||
import java.util.Iterator; | |||
import org.apache.poi.EncryptedDocumentException; | |||
import org.apache.poi.POIDocument; | |||
import org.apache.poi.hwpf.model.CHPBinTable; | |||
import org.apache.poi.hwpf.model.CPSplitCalculator; | |||
import org.apache.poi.hwpf.model.ComplexFileTable; | |||
import org.apache.poi.hwpf.model.DocumentProperties; | |||
import org.apache.poi.hwpf.model.EscherRecordHolder; | |||
import org.apache.poi.hwpf.model.FSPATable; | |||
import org.apache.poi.hwpf.model.FileInformationBlock; | |||
import org.apache.poi.hwpf.model.FontTable; | |||
import org.apache.poi.hwpf.model.GenericPropertyNode; | |||
import org.apache.poi.hwpf.model.ListTables; | |||
import org.apache.poi.hwpf.model.PAPBinTable; | |||
import org.apache.poi.hwpf.model.PicturesTable; | |||
import org.apache.poi.hwpf.model.PlexOfCps; | |||
import org.apache.poi.hwpf.model.PropertyNode; | |||
import org.apache.poi.hwpf.model.RevisionMarkAuthorTable; | |||
import org.apache.poi.hwpf.model.SavedByTable; | |||
import org.apache.poi.hwpf.model.SectionTable; | |||
import org.apache.poi.hwpf.model.ShapesTable; | |||
import org.apache.poi.hwpf.model.StyleSheet; | |||
import org.apache.poi.hwpf.model.TextPiece; | |||
import org.apache.poi.hwpf.model.TextPieceTable; | |||
import org.apache.poi.hwpf.model.io.HWPFFileSystem; | |||
import org.apache.poi.hwpf.model.io.HWPFOutputStream; | |||
import org.apache.poi.hwpf.usermodel.HWPFList; | |||
import org.apache.poi.hwpf.usermodel.Range; | |||
import org.apache.poi.poifs.common.POIFSConstants; | |||
import org.apache.poi.poifs.filesystem.DirectoryNode; | |||
import org.apache.poi.poifs.filesystem.POIFSFileSystem; | |||
import org.apache.poi.poifs.filesystem.DocumentEntry; | |||
import org.apache.poi.poifs.common.POIFSConstants; | |||
import org.apache.poi.hwpf.model.*; | |||
import org.apache.poi.hwpf.model.io.*; | |||
import org.apache.poi.hwpf.usermodel.*; | |||
import org.apache.poi.poifs.filesystem.POIFSFileSystem; | |||
/** | |||
@@ -46,17 +63,11 @@ import org.apache.poi.hwpf.usermodel.*; | |||
* | |||
* @author Ryan Ackley | |||
*/ | |||
public final class HWPFDocument extends POIDocument | |||
// implements Cloneable | |||
public final class HWPFDocument extends HWPFDocumentCore | |||
{ | |||
/** The FIB */ | |||
protected FileInformationBlock _fib; | |||
/** And for making sense of CP lengths in the FIB */ | |||
protected CPSplitCalculator _cpSplit; | |||
/** main document stream buffer*/ | |||
protected byte[] _mainStream; | |||
/** table stream buffer*/ | |||
protected byte[] _tableStream; | |||
@@ -110,29 +121,7 @@ public final class HWPFDocument extends POIDocument | |||
protected HWPFDocument() | |||
{ | |||
super(null, null); | |||
} | |||
/** | |||
* Takes an InputStream, verifies that it's not RTF, builds a | |||
* POIFSFileSystem from it, and returns that. | |||
*/ | |||
public static POIFSFileSystem verifyAndBuildPOIFS(InputStream istream) throws IOException { | |||
// Open a PushbackInputStream, so we can peek at the first few bytes | |||
PushbackInputStream pis = new PushbackInputStream(istream,6); | |||
byte[] first6 = new byte[6]; | |||
pis.read(first6); | |||
// Does it start with {\rtf ? If so, it's really RTF | |||
if(first6[0] == '{' && first6[1] == '\\' && first6[2] == 'r' | |||
&& first6[3] == 't' && first6[4] == 'f') { | |||
throw new IllegalArgumentException("The document is really a RTF file"); | |||
} | |||
// OK, so it's not RTF | |||
// Open a POIFSFileSystem on the (pushed back) stream | |||
pis.unread(first6); | |||
return new POIFSFileSystem(pis); | |||
super(); | |||
} | |||
/** | |||
@@ -171,21 +160,16 @@ public final class HWPFDocument extends POIDocument | |||
*/ | |||
public HWPFDocument(DirectoryNode directory, POIFSFileSystem pfilesystem) throws IOException | |||
{ | |||
// Sort out the hpsf properties | |||
// Load the main stream and FIB | |||
// Also handles HPSF bits | |||
super(directory, pfilesystem); | |||
// read in the main stream. | |||
DocumentEntry documentProps = (DocumentEntry) | |||
directory.getEntry("WordDocument"); | |||
_mainStream = new byte[documentProps.getSize()]; | |||
directory.createDocumentInputStream("WordDocument").read(_mainStream); | |||
// Create our FIB, and check for the doc being encrypted | |||
_fib = new FileInformationBlock(_mainStream); | |||
// Do the CP Split | |||
_cpSplit = new CPSplitCalculator(_fib); | |||
if(_fib.isFEncrypted()) { | |||
throw new EncryptedDocumentException("Cannot process encrypted word files!"); | |||
// Is this document too old for us? | |||
if(_fib.getNFib() < 106) { | |||
throw new OldWordFileFormatException("The document is too old (Word 95 or older) "); | |||
} | |||
// use the fib to determine the name of the table stream. | |||
@@ -691,17 +675,4 @@ public final class HWPFDocument extends POIDocument | |||
t.printStackTrace(); | |||
} | |||
} | |||
// public Object clone() | |||
// throws CloneNotSupportedException | |||
// { | |||
// _tpt; | |||
// | |||
// _cbt; | |||
// | |||
// _pbt; | |||
// | |||
// _st; | |||
// | |||
// } | |||
} |
@@ -0,0 +1,130 @@ | |||
/* ==================================================================== | |||
Licensed to the Apache Software Foundation (ASF) under one or more | |||
contributor license agreements. See the NOTICE file distributed with | |||
this work for additional information regarding copyright ownership. | |||
The ASF licenses this file to You under the Apache License, Version 2.0 | |||
(the "License"); you may not use this file except in compliance with | |||
the License. You may obtain a copy of the License at | |||
http://www.apache.org/licenses/LICENSE-2.0 | |||
Unless required by applicable law or agreed to in writing, software | |||
distributed under the License is distributed on an "AS IS" BASIS, | |||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
See the License for the specific language governing permissions and | |||
limitations under the License. | |||
==================================================================== */ | |||
package org.apache.poi.hwpf; | |||
import java.io.IOException; | |||
import java.io.InputStream; | |||
import java.io.PushbackInputStream; | |||
import org.apache.poi.EncryptedDocumentException; | |||
import org.apache.poi.POIDocument; | |||
import org.apache.poi.hwpf.model.FileInformationBlock; | |||
import org.apache.poi.poifs.filesystem.DirectoryNode; | |||
import org.apache.poi.poifs.filesystem.DocumentEntry; | |||
import org.apache.poi.poifs.filesystem.POIFSFileSystem; | |||
/** | |||
* This class holds much of the core of a Word document, but | |||
* without some of the table structure information. | |||
* You generally want to work with one of | |||
* {@link HWPFDocument} or {@link HWPFOldDocument} | |||
*/ | |||
public abstract class HWPFDocumentCore extends POIDocument | |||
{ | |||
/** The FIB */ | |||
protected FileInformationBlock _fib; | |||
/** main document stream buffer*/ | |||
protected byte[] _mainStream; | |||
protected HWPFDocumentCore() | |||
{ | |||
super(null, null); | |||
} | |||
/** | |||
* Takens an InputStream, verifies that it's not RTF, builds a | |||
* POIFSFileSystem from it, and returns that. | |||
*/ | |||
public static POIFSFileSystem verifyAndBuildPOIFS(InputStream istream) throws IOException { | |||
// Open a PushbackInputStream, so we can peek at the first few bytes | |||
PushbackInputStream pis = new PushbackInputStream(istream,6); | |||
byte[] first6 = new byte[6]; | |||
pis.read(first6); | |||
// Does it start with {\rtf ? If so, it's really RTF | |||
if(first6[0] == '{' && first6[1] == '\\' && first6[2] == 'r' | |||
&& first6[3] == 't' && first6[4] == 'f') { | |||
throw new IllegalArgumentException("The document is really a RTF file"); | |||
} | |||
// OK, so it's not RTF | |||
// Open a POIFSFileSystem on the (pushed back) stream | |||
pis.unread(first6); | |||
return new POIFSFileSystem(pis); | |||
} | |||
/** | |||
* This constructor loads a Word document from an InputStream. | |||
* | |||
* @param istream The InputStream that contains the Word document. | |||
* @throws IOException If there is an unexpected IOException from the passed | |||
* in InputStream. | |||
*/ | |||
public HWPFDocumentCore(InputStream istream) throws IOException | |||
{ | |||
//do Ole stuff | |||
this( verifyAndBuildPOIFS(istream) ); | |||
} | |||
/** | |||
* This constructor loads a Word document from a POIFSFileSystem | |||
* | |||
* @param pfilesystem The POIFSFileSystem that contains the Word document. | |||
* @throws IOException If there is an unexpected IOException from the passed | |||
* in POIFSFileSystem. | |||
*/ | |||
public HWPFDocumentCore(POIFSFileSystem pfilesystem) throws IOException | |||
{ | |||
this(pfilesystem.getRoot(), pfilesystem); | |||
} | |||
/** | |||
* This constructor loads a Word document from a specific point | |||
* in a POIFSFileSystem, probably not the default. | |||
* Used typically to open embeded documents. | |||
* | |||
* @param pfilesystem The POIFSFileSystem that contains the Word document. | |||
* @throws IOException If there is an unexpected IOException from the passed | |||
* in POIFSFileSystem. | |||
*/ | |||
public HWPFDocumentCore(DirectoryNode directory, POIFSFileSystem pfilesystem) throws IOException | |||
{ | |||
// Sort out the hpsf properties | |||
super(directory, pfilesystem); | |||
// read in the main stream. | |||
DocumentEntry documentProps = (DocumentEntry) | |||
directory.getEntry("WordDocument"); | |||
_mainStream = new byte[documentProps.getSize()]; | |||
directory.createDocumentInputStream("WordDocument").read(_mainStream); | |||
// Create our FIB, and check for the doc being encrypted | |||
_fib = new FileInformationBlock(_mainStream); | |||
if(_fib.isFEncrypted()) { | |||
throw new EncryptedDocumentException("Cannot process encrypted word files!"); | |||
} | |||
} | |||
public FileInformationBlock getFileInformationBlock() | |||
{ | |||
return _fib; | |||
} | |||
} |
@@ -0,0 +1,135 @@ | |||
/* ==================================================================== | |||
Licensed to the Apache Software Foundation (ASF) under one or more | |||
contributor license agreements. See the NOTICE file distributed with | |||
this work for additional information regarding copyright ownership. | |||
The ASF licenses this file to You under the Apache License, Version 2.0 | |||
(the "License"); you may not use this file except in compliance with | |||
the License. You may obtain a copy of the License at | |||
http://www.apache.org/licenses/LICENSE-2.0 | |||
Unless required by applicable law or agreed to in writing, software | |||
distributed under the License is distributed on an "AS IS" BASIS, | |||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
See the License for the specific language governing permissions and | |||
limitations under the License. | |||
==================================================================== */ | |||
package org.apache.poi.hwpf; | |||
import java.io.IOException; | |||
import java.io.OutputStream; | |||
import java.util.ArrayList; | |||
import java.util.List; | |||
import org.apache.poi.hwpf.model.CHPX; | |||
import org.apache.poi.hwpf.model.ComplexFileTable; | |||
import org.apache.poi.hwpf.model.OldCHPBinTable; | |||
import org.apache.poi.hwpf.model.PieceDescriptor; | |||
import org.apache.poi.hwpf.model.TextPiece; | |||
import org.apache.poi.hwpf.model.TextPieceTable; | |||
import org.apache.poi.poifs.filesystem.DirectoryNode; | |||
import org.apache.poi.poifs.filesystem.POIFSFileSystem; | |||
import org.apache.poi.util.LittleEndian; | |||
/** | |||
* Provides very simple support for old (Word 6 / Word 95) | |||
* files. | |||
* TODO Provide a way to get at the properties associated | |||
* with each block of text | |||
*/ | |||
public class HWPFOldDocument extends HWPFDocumentCore { | |||
private List<TextAndCHPX> contents = new ArrayList<TextAndCHPX>(); | |||
public HWPFOldDocument(POIFSFileSystem fs) throws IOException { | |||
this(fs.getRoot(), fs); | |||
} | |||
public HWPFOldDocument(DirectoryNode directory, POIFSFileSystem fs) | |||
throws IOException { | |||
super(directory, fs); | |||
// Where are things? | |||
int chpTableOffset = LittleEndian.getInt(_mainStream, 0xb8); | |||
int chpTableSize = LittleEndian.getInt(_mainStream, 0xbc); | |||
int complexTableOffset = LittleEndian.getInt(_mainStream, 0x160); | |||
// We need to get hold of the text that makes up the | |||
// document, which might be regular or fast-saved | |||
StringBuffer text = new StringBuffer(); | |||
TextPieceTable tpt; | |||
if(_fib.isFComplex()) { | |||
ComplexFileTable cft = new ComplexFileTable( | |||
_mainStream, _mainStream, | |||
complexTableOffset, _fib.getFcMin() | |||
); | |||
tpt = cft.getTextPieceTable(); | |||
for(TextPiece tp : tpt.getTextPieces()) { | |||
text.append( tp.getStringBuffer() ); | |||
} | |||
} else { | |||
// TODO Build the Piece Descriptor properly | |||
// TODO Can these old documents ever contain Unicode strings? | |||
PieceDescriptor pd = new PieceDescriptor(new byte[] {0,0, 0,0,0,127, 0,0}, 0); | |||
pd.setFilePosition(_fib.getFcMin()); | |||
tpt = new TextPieceTable(); | |||
byte[] textData = new byte[_fib.getFcMac()-_fib.getFcMin()]; | |||
System.arraycopy(_mainStream, _fib.getFcMin(), textData, 0, textData.length); | |||
TextPiece tp = new TextPiece( | |||
0, textData.length, textData, pd, 0 | |||
); | |||
tpt.getTextPieces().add(tp); | |||
text.append(tp.getStringBuffer()); | |||
} | |||
// Now we can fetch the character and paragraph properties | |||
OldCHPBinTable chpTable = new OldCHPBinTable( | |||
_mainStream, chpTableOffset, chpTableSize, | |||
_fib.getFcMin(), tpt | |||
); | |||
// Finally build up runs | |||
for(CHPX chpx : chpTable.getTextRuns()) { | |||
String str = text.substring(chpx.getStart(), chpx.getEnd()); | |||
contents.add(new TextAndCHPX(str,chpx)); | |||
} | |||
} | |||
@Override | |||
public void write(OutputStream out) throws IOException { | |||
throw new IllegalStateException("Writing is not available for the older file formats"); | |||
} | |||
/** | |||
* Retrieves all our text, in order, along with the | |||
* CHPX information on each bit. | |||
* Every entry has the same formatting, but as yet | |||
* we've no way to tell what the formatting is... | |||
* Warnings - this will change as soon as we support | |||
* text formatting! | |||
*/ | |||
public List<TextAndCHPX> getContents() { | |||
return contents; | |||
} | |||
/** | |||
* Warnings - this will change as soon as we support | |||
* text formatting! | |||
*/ | |||
public static class TextAndCHPX { | |||
private String text; | |||
private CHPX chpx; | |||
private TextAndCHPX(String text, CHPX chpx) { | |||
this.text = text; | |||
this.chpx = chpx; | |||
} | |||
public String getText() { | |||
return text; | |||
} | |||
public CHPX getChpx() { | |||
return chpx; | |||
} | |||
} | |||
} |
@@ -0,0 +1,25 @@ | |||
/* ==================================================================== | |||
Licensed to the Apache Software Foundation (ASF) under one or more | |||
contributor license agreements. See the NOTICE file distributed with | |||
this work for additional information regarding copyright ownership. | |||
The ASF licenses this file to You under the Apache License, Version 2.0 | |||
(the "License"); you may not use this file except in compliance with | |||
the License. You may obtain a copy of the License at | |||
http://www.apache.org/licenses/LICENSE-2.0 | |||
Unless required by applicable law or agreed to in writing, software | |||
distributed under the License is distributed on an "AS IS" BASIS, | |||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
See the License for the specific language governing permissions and | |||
limitations under the License. | |||
==================================================================== */ | |||
package org.apache.poi.hwpf; | |||
import org.apache.poi.OldFileFormatException; | |||
public class OldWordFileFormatException extends OldFileFormatException { | |||
public OldWordFileFormatException(String s) { | |||
super(s); | |||
} | |||
} |
@@ -0,0 +1,79 @@ | |||
/* ==================================================================== | |||
Licensed to the Apache Software Foundation (ASF) under one or more | |||
contributor license agreements. See the NOTICE file distributed with | |||
this work for additional information regarding copyright ownership. | |||
The ASF licenses this file to You under the Apache License, Version 2.0 | |||
(the "License"); you may not use this file except in compliance with | |||
the License. You may obtain a copy of the License at | |||
http://www.apache.org/licenses/LICENSE-2.0 | |||
Unless required by applicable law or agreed to in writing, software | |||
distributed under the License is distributed on an "AS IS" BASIS, | |||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
See the License for the specific language governing permissions and | |||
limitations under the License. | |||
==================================================================== */ | |||
package org.apache.poi.hwpf.extractor; | |||
import java.io.IOException; | |||
import java.io.InputStream; | |||
import org.apache.poi.POIOLE2TextExtractor; | |||
import org.apache.poi.hwpf.HWPFOldDocument; | |||
import org.apache.poi.hwpf.HWPFOldDocument.TextAndCHPX; | |||
import org.apache.poi.hwpf.usermodel.Range; | |||
import org.apache.poi.poifs.filesystem.DirectoryNode; | |||
import org.apache.poi.poifs.filesystem.POIFSFileSystem; | |||
/** | |||
* Class to extract the text from old (Word 6 / Word 95) Word Documents. | |||
* | |||
* This should only be used on the older files, for most uses you | |||
* should call {@link WordExtractor} which deals properly | |||
* with HWPF. | |||
* | |||
* @author Nick Burch | |||
*/ | |||
public final class Word6Extractor extends POIOLE2TextExtractor { | |||
private POIFSFileSystem fs; | |||
private HWPFOldDocument doc; | |||
/** | |||
* Create a new Word Extractor | |||
* @param is InputStream containing the word file | |||
*/ | |||
public Word6Extractor(InputStream is) throws IOException { | |||
this( new POIFSFileSystem(is) ); | |||
} | |||
/** | |||
* Create a new Word Extractor | |||
* @param fs POIFSFileSystem containing the word file | |||
*/ | |||
public Word6Extractor(POIFSFileSystem fs) throws IOException { | |||
this(fs.getRoot(), fs); | |||
} | |||
public Word6Extractor(DirectoryNode dir, POIFSFileSystem fs) throws IOException { | |||
this(new HWPFOldDocument(dir,fs)); | |||
} | |||
/** | |||
* Create a new Word Extractor | |||
* @param doc The HWPFOldDocument to extract from | |||
*/ | |||
public Word6Extractor(HWPFOldDocument doc) { | |||
super(doc); | |||
this.doc = doc; | |||
} | |||
@Override | |||
public String getText() { | |||
StringBuffer text = new StringBuffer(); | |||
for(TextAndCHPX tchpx : doc.getContents()) { | |||
text.append( Range.stripFields(tchpx.getText()) ); | |||
} | |||
return text.toString(); | |||
} | |||
} |
@@ -40,7 +40,7 @@ import org.apache.poi.poifs.filesystem.POIFSFileSystem; | |||
* You should use either getParagraphText() or getText() unless | |||
* you have a strong reason otherwise. | |||
* | |||
* @author Nick Burch (nick at torchbox dot com) | |||
* @author Nick Burch | |||
*/ | |||
public final class WordExtractor extends POIOLE2TextExtractor { | |||
private POIFSFileSystem fs; |
@@ -61,4 +61,9 @@ public final class CHPX extends BytePropertyNode | |||
CharacterProperties props = CharacterSprmUncompressor.uncompressCHP(baseStyle, getGrpprl(), 0); | |||
return props; | |||
} | |||
public String toString() { | |||
return "CHPX from " + getStart() + " to " + getEnd() + | |||
" (in bytes " + getStartBytes() + " to " + getEndBytes() + ")"; | |||
} | |||
} |
@@ -0,0 +1,77 @@ | |||
/* ==================================================================== | |||
Licensed to the Apache Software Foundation (ASF) under one or more | |||
contributor license agreements. See the NOTICE file distributed with | |||
this work for additional information regarding copyright ownership. | |||
The ASF licenses this file to You under the Apache License, Version 2.0 | |||
(the "License"); you may not use this file except in compliance with | |||
the License. You may obtain a copy of the License at | |||
http://www.apache.org/licenses/LICENSE-2.0 | |||
Unless required by applicable law or agreed to in writing, software | |||
distributed under the License is distributed on an "AS IS" BASIS, | |||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
See the License for the specific language governing permissions and | |||
limitations under the License. | |||
==================================================================== */ | |||
package org.apache.poi.hwpf.model; | |||
import java.util.ArrayList; | |||
import java.util.List; | |||
import org.apache.poi.poifs.common.POIFSConstants; | |||
import org.apache.poi.util.LittleEndian; | |||
/** | |||
* This class holds all of the character formatting | |||
* properties from Old (Word 6 / Word 95) documents. | |||
* Unlike with Word 97+, it all gets held in the | |||
* same stream. | |||
* In common with the rest of the old support, it | |||
* is read only | |||
*/ | |||
public final class OldCHPBinTable | |||
{ | |||
/** List of character properties.*/ | |||
protected ArrayList<CHPX> _textRuns = new ArrayList<CHPX>(); | |||
/** | |||
* Constructor used to read an old-style binTable | |||
* in from a Word document. | |||
* | |||
* @param documentStream | |||
* @param offset | |||
* @param size | |||
* @param fcMin | |||
*/ | |||
public OldCHPBinTable(byte[] documentStream, int offset, | |||
int size, int fcMin, TextPieceTable tpt) | |||
{ | |||
PlexOfCps binTable = new PlexOfCps(documentStream, offset, size, 2); | |||
int length = binTable.length(); | |||
for (int x = 0; x < length; x++) | |||
{ | |||
GenericPropertyNode node = binTable.getProperty(x); | |||
int pageNum = LittleEndian.getShort(node.getBytes()); | |||
int pageOffset = POIFSConstants.SMALLER_BIG_BLOCK_SIZE * pageNum; | |||
CHPFormattedDiskPage cfkp = new CHPFormattedDiskPage(documentStream, | |||
pageOffset, fcMin, tpt); | |||
int fkpSize = cfkp.size(); | |||
for (int y = 0; y < fkpSize; y++) | |||
{ | |||
_textRuns.add(cfkp.getCHPX(y)); | |||
} | |||
} | |||
} | |||
public List<CHPX> getTextRuns() | |||
{ | |||
return _textRuns; | |||
} | |||
} |
@@ -19,13 +19,12 @@ package org.apache.poi.hwpf.extractor; | |||
import junit.framework.TestCase; | |||
import org.apache.poi.POIDataSamples; | |||
import org.apache.poi.hwpf.HWPFDocument; | |||
import org.apache.poi.hwpf.HWPFTestDataSamples; | |||
import org.apache.poi.hwpf.OldWordFileFormatException; | |||
import org.apache.poi.poifs.filesystem.DirectoryNode; | |||
import org.apache.poi.poifs.filesystem.POIFSFileSystem; | |||
import org.apache.poi.POIDataSamples; | |||
import java.io.FileInputStream; | |||
/** | |||
* Test the different routes to extracting text | |||
@@ -237,4 +236,42 @@ public final class TestWordExtractor extends TestCase { | |||
assertTrue(b.toString().contains("TestComment")); | |||
} | |||
public void testWord95() throws Exception { | |||
// Too old for the default | |||
try { | |||
extractor = new WordExtractor( | |||
POIDataSamples.getDocumentInstance().openResourceAsStream("Word95.doc") | |||
); | |||
fail(); | |||
} catch(OldWordFileFormatException e) {} | |||
// Can work with the special one | |||
Word6Extractor w6e = new Word6Extractor( | |||
POIDataSamples.getDocumentInstance().openResourceAsStream("Word95.doc") | |||
); | |||
String text = w6e.getText(); | |||
assertTrue(text.contains("The quick brown fox jumps over the lazy dog")); | |||
assertTrue(text.contains("Paragraph 2")); | |||
assertTrue(text.contains("Paragraph 3. Has some RED text and some BLUE BOLD text in it")); | |||
assertTrue(text.contains("Last (4th) paragraph")); | |||
} | |||
public void testWord6() throws Exception { | |||
// Too old for the default | |||
try { | |||
extractor = new WordExtractor( | |||
POIDataSamples.getDocumentInstance().openResourceAsStream("Word6.doc") | |||
); | |||
fail(); | |||
} catch(OldWordFileFormatException e) {} | |||
Word6Extractor w6e = new Word6Extractor( | |||
POIDataSamples.getDocumentInstance().openResourceAsStream("Word6.doc") | |||
); | |||
String text = w6e.getText(); | |||
assertTrue(text.contains("The quick brown fox jumps over the lazy dog")); | |||
} | |||
} |