package org.apache.poi.hwpf;
-import java.io.InputStream;
+import java.io.ByteArrayInputStream;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
-import java.io.PushbackInputStream;
import java.io.IOException;
+import java.io.InputStream;
import java.io.OutputStream;
-import java.io.ByteArrayInputStream;
-
import java.util.Iterator;
-import org.apache.poi.EncryptedDocumentException;
-import org.apache.poi.POIDocument;
+import org.apache.poi.hwpf.model.CHPBinTable;
+import org.apache.poi.hwpf.model.CPSplitCalculator;
+import org.apache.poi.hwpf.model.ComplexFileTable;
+import org.apache.poi.hwpf.model.DocumentProperties;
+import org.apache.poi.hwpf.model.EscherRecordHolder;
+import org.apache.poi.hwpf.model.FSPATable;
+import org.apache.poi.hwpf.model.FileInformationBlock;
+import org.apache.poi.hwpf.model.FontTable;
+import org.apache.poi.hwpf.model.GenericPropertyNode;
+import org.apache.poi.hwpf.model.ListTables;
+import org.apache.poi.hwpf.model.PAPBinTable;
+import org.apache.poi.hwpf.model.PicturesTable;
+import org.apache.poi.hwpf.model.PlexOfCps;
+import org.apache.poi.hwpf.model.PropertyNode;
+import org.apache.poi.hwpf.model.RevisionMarkAuthorTable;
+import org.apache.poi.hwpf.model.SavedByTable;
+import org.apache.poi.hwpf.model.SectionTable;
+import org.apache.poi.hwpf.model.ShapesTable;
+import org.apache.poi.hwpf.model.StyleSheet;
+import org.apache.poi.hwpf.model.TextPiece;
+import org.apache.poi.hwpf.model.TextPieceTable;
+import org.apache.poi.hwpf.model.io.HWPFFileSystem;
+import org.apache.poi.hwpf.model.io.HWPFOutputStream;
+import org.apache.poi.hwpf.usermodel.HWPFList;
+import org.apache.poi.hwpf.usermodel.Range;
+import org.apache.poi.poifs.common.POIFSConstants;
import org.apache.poi.poifs.filesystem.DirectoryNode;
-import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.poifs.filesystem.DocumentEntry;
-import org.apache.poi.poifs.common.POIFSConstants;
-
-import org.apache.poi.hwpf.model.*;
-import org.apache.poi.hwpf.model.io.*;
-import org.apache.poi.hwpf.usermodel.*;
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;
/**
*
* @author Ryan Ackley
*/
-public final class HWPFDocument extends POIDocument
-// implements Cloneable
+public final class HWPFDocument extends HWPFDocumentCore
{
- /** The FIB */
- protected FileInformationBlock _fib;
/** And for making sense of CP lengths in the FIB */
protected CPSplitCalculator _cpSplit;
- /** main document stream buffer*/
- protected byte[] _mainStream;
-
/** table stream buffer*/
protected byte[] _tableStream;
protected HWPFDocument()
{
- super(null, null);
- }
-
- /**
- * Takens an InputStream, verifies that it's not RTF, builds a
- * POIFSFileSystem from it, and returns that.
- */
- public static POIFSFileSystem verifyAndBuildPOIFS(InputStream istream) throws IOException {
- // Open a PushbackInputStream, so we can peek at the first few bytes
- PushbackInputStream pis = new PushbackInputStream(istream,6);
- byte[] first6 = new byte[6];
- pis.read(first6);
-
- // Does it start with {\rtf ? If so, it's really RTF
- if(first6[0] == '{' && first6[1] == '\\' && first6[2] == 'r'
- && first6[3] == 't' && first6[4] == 'f') {
- throw new IllegalArgumentException("The document is really a RTF file");
- }
-
- // OK, so it's not RTF
- // Open a POIFSFileSystem on the (pushed back) stream
- pis.unread(first6);
- return new POIFSFileSystem(pis);
+ super();
}
/**
*/
public HWPFDocument(DirectoryNode directory, POIFSFileSystem pfilesystem) throws IOException
{
- // Sort out the hpsf properties
+ // Load the main stream and FIB
+ // Also handles HPSF bits
super(directory, pfilesystem);
- // read in the main stream.
- DocumentEntry documentProps = (DocumentEntry)
- directory.getEntry("WordDocument");
- _mainStream = new byte[documentProps.getSize()];
-
- directory.createDocumentInputStream("WordDocument").read(_mainStream);
-
- // Create our FIB, and check for the doc being encrypted
- _fib = new FileInformationBlock(_mainStream);
+ // Do the CP Split
_cpSplit = new CPSplitCalculator(_fib);
- if(_fib.isFEncrypted()) {
- throw new EncryptedDocumentException("Cannot process encrypted word files!");
+
+ // Is this document too old for us?
+ if(_fib.getNFib() < 106) {
+ throw new OldWordFileFormatException("The document is too old (Word 95 or older) ");
}
// use the fib to determine the name of the table stream.
t.printStackTrace();
}
}
-
-// public Object clone()
-// throws CloneNotSupportedException
-// {
-// _tpt;
-//
-// _cbt;
-//
-// _pbt;
-//
-// _st;
-//
-// }
}
--- /dev/null
+/* ====================================================================
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+==================================================================== */
+
+package org.apache.poi.hwpf;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.PushbackInputStream;
+
+import org.apache.poi.EncryptedDocumentException;
+import org.apache.poi.POIDocument;
+import org.apache.poi.hwpf.model.FileInformationBlock;
+import org.apache.poi.poifs.filesystem.DirectoryNode;
+import org.apache.poi.poifs.filesystem.DocumentEntry;
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;
+
+
+/**
+ * This class holds much of the core of a Word document, but
+ * without some of the table structure information.
+ * You generally want to work with one of
+ * {@link HWPFDocument} or {@link HWPFOldDocument}
+ */
+public abstract class HWPFDocumentCore extends POIDocument
+{
+ /** The FIB */
+ protected FileInformationBlock _fib;
+
+ /** main document stream buffer*/
+ protected byte[] _mainStream;
+
+ protected HWPFDocumentCore()
+ {
+ super(null, null);
+ }
+
+ /**
+ * Takens an InputStream, verifies that it's not RTF, builds a
+ * POIFSFileSystem from it, and returns that.
+ */
+ public static POIFSFileSystem verifyAndBuildPOIFS(InputStream istream) throws IOException {
+ // Open a PushbackInputStream, so we can peek at the first few bytes
+ PushbackInputStream pis = new PushbackInputStream(istream,6);
+ byte[] first6 = new byte[6];
+ pis.read(first6);
+
+ // Does it start with {\rtf ? If so, it's really RTF
+ if(first6[0] == '{' && first6[1] == '\\' && first6[2] == 'r'
+ && first6[3] == 't' && first6[4] == 'f') {
+ throw new IllegalArgumentException("The document is really a RTF file");
+ }
+
+ // OK, so it's not RTF
+ // Open a POIFSFileSystem on the (pushed back) stream
+ pis.unread(first6);
+ return new POIFSFileSystem(pis);
+ }
+
+ /**
+ * This constructor loads a Word document from an InputStream.
+ *
+ * @param istream The InputStream that contains the Word document.
+ * @throws IOException If there is an unexpected IOException from the passed
+ * in InputStream.
+ */
+ public HWPFDocumentCore(InputStream istream) throws IOException
+ {
+ //do Ole stuff
+ this( verifyAndBuildPOIFS(istream) );
+ }
+
+ /**
+ * This constructor loads a Word document from a POIFSFileSystem
+ *
+ * @param pfilesystem The POIFSFileSystem that contains the Word document.
+ * @throws IOException If there is an unexpected IOException from the passed
+ * in POIFSFileSystem.
+ */
+ public HWPFDocumentCore(POIFSFileSystem pfilesystem) throws IOException
+ {
+ this(pfilesystem.getRoot(), pfilesystem);
+ }
+
+ /**
+ * This constructor loads a Word document from a specific point
+ * in a POIFSFileSystem, probably not the default.
+ * Used typically to open embeded documents.
+ *
+ * @param pfilesystem The POIFSFileSystem that contains the Word document.
+ * @throws IOException If there is an unexpected IOException from the passed
+ * in POIFSFileSystem.
+ */
+ public HWPFDocumentCore(DirectoryNode directory, POIFSFileSystem pfilesystem) throws IOException
+ {
+ // Sort out the hpsf properties
+ super(directory, pfilesystem);
+
+ // read in the main stream.
+ DocumentEntry documentProps = (DocumentEntry)
+ directory.getEntry("WordDocument");
+ _mainStream = new byte[documentProps.getSize()];
+
+ directory.createDocumentInputStream("WordDocument").read(_mainStream);
+
+ // Create our FIB, and check for the doc being encrypted
+ _fib = new FileInformationBlock(_mainStream);
+ if(_fib.isFEncrypted()) {
+ throw new EncryptedDocumentException("Cannot process encrypted word files!");
+ }
+ }
+
+ public FileInformationBlock getFileInformationBlock()
+ {
+ return _fib;
+ }
+}
--- /dev/null
+/* ====================================================================
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+==================================================================== */
+package org.apache.poi.hwpf;
+
+import java.io.IOException;
+import java.io.OutputStream;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.poi.hwpf.model.CHPX;
+import org.apache.poi.hwpf.model.ComplexFileTable;
+import org.apache.poi.hwpf.model.OldCHPBinTable;
+import org.apache.poi.hwpf.model.PieceDescriptor;
+import org.apache.poi.hwpf.model.TextPiece;
+import org.apache.poi.hwpf.model.TextPieceTable;
+import org.apache.poi.poifs.filesystem.DirectoryNode;
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;
+import org.apache.poi.util.LittleEndian;
+
+/**
+ * Provides very simple support for old (Word 6 / Word 95)
+ * files.
+ * TODO Provide a way to get at the properties associated
+ * with each block of text
+ */
+public class HWPFOldDocument extends HWPFDocumentCore {
+ private List<TextAndCHPX> contents = new ArrayList<TextAndCHPX>();
+
+ public HWPFOldDocument(POIFSFileSystem fs) throws IOException {
+ this(fs.getRoot(), fs);
+ }
+
+ public HWPFOldDocument(DirectoryNode directory, POIFSFileSystem fs)
+ throws IOException {
+ super(directory, fs);
+
+ // Where are things?
+ int chpTableOffset = LittleEndian.getInt(_mainStream, 0xb8);
+ int chpTableSize = LittleEndian.getInt(_mainStream, 0xbc);
+ int complexTableOffset = LittleEndian.getInt(_mainStream, 0x160);
+
+ // We need to get hold of the text that makes up the
+ // document, which might be regular or fast-saved
+ StringBuffer text = new StringBuffer();
+ TextPieceTable tpt;
+ if(_fib.isFComplex()) {
+ ComplexFileTable cft = new ComplexFileTable(
+ _mainStream, _mainStream,
+ complexTableOffset, _fib.getFcMin()
+ );
+ tpt = cft.getTextPieceTable();
+
+ for(TextPiece tp : tpt.getTextPieces()) {
+ text.append( tp.getStringBuffer() );
+ }
+ } else {
+ // TODO Build the Piece Descriptor properly
+ // TODO Can these old documents ever contain Unicode strings?
+ PieceDescriptor pd = new PieceDescriptor(new byte[] {0,0, 0,0,0,127, 0,0}, 0);
+ pd.setFilePosition(_fib.getFcMin());
+
+ tpt = new TextPieceTable();
+ byte[] textData = new byte[_fib.getFcMac()-_fib.getFcMin()];
+ System.arraycopy(_mainStream, _fib.getFcMin(), textData, 0, textData.length);
+ TextPiece tp = new TextPiece(
+ 0, textData.length, textData, pd, 0
+ );
+ tpt.getTextPieces().add(tp);
+
+ text.append(tp.getStringBuffer());
+ }
+
+ // Now we can fetch the character and paragraph properties
+ OldCHPBinTable chpTable = new OldCHPBinTable(
+ _mainStream, chpTableOffset, chpTableSize,
+ _fib.getFcMin(), tpt
+ );
+
+ // Finally build up runs
+ for(CHPX chpx : chpTable.getTextRuns()) {
+ String str = text.substring(chpx.getStart(), chpx.getEnd());
+ contents.add(new TextAndCHPX(str,chpx));
+ }
+ }
+
+ @Override
+ public void write(OutputStream out) throws IOException {
+ throw new IllegalStateException("Writing is not available for the older file formats");
+ }
+
+ /**
+ * Retrieves all our text, in order, along with the
+ * CHPX information on each bit.
+ * Every entry has the same formatting, but as yet
+ * we've no way to tell what the formatting is...
+ * Warnings - this will change as soon as we support
+ * text formatting!
+ */
+ public List<TextAndCHPX> getContents() {
+ return contents;
+ }
+
+ /**
+ * Warnings - this will change as soon as we support
+ * text formatting!
+ */
+ public static class TextAndCHPX {
+ private String text;
+ private CHPX chpx;
+ private TextAndCHPX(String text, CHPX chpx) {
+ this.text = text;
+ this.chpx = chpx;
+ }
+ public String getText() {
+ return text;
+ }
+ public CHPX getChpx() {
+ return chpx;
+ }
+ }
+}
--- /dev/null
+/* ====================================================================
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+==================================================================== */
+package org.apache.poi.hwpf;
+
+import org.apache.poi.OldFileFormatException;
+
+/**
+ * The {@link OldFileFormatException} used for Word documents
+ *  that are in a format too old to be processed.
+ */
+public class OldWordFileFormatException extends OldFileFormatException {
+    /**
+     * @param s The message describing the problem
+     */
+    public OldWordFileFormatException(final String s) {
+        super(s);
+    }
+}
--- /dev/null
+/* ====================================================================
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+==================================================================== */
+
+package org.apache.poi.hwpf.extractor;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.apache.poi.POIOLE2TextExtractor;
+import org.apache.poi.hwpf.HWPFOldDocument;
+import org.apache.poi.hwpf.HWPFOldDocument.TextAndCHPX;
+import org.apache.poi.hwpf.usermodel.Range;
+import org.apache.poi.poifs.filesystem.DirectoryNode;
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;
+
+/**
+ * Class to extract the text from old (Word 6 / Word 95) Word Documents.
+ *
+ * This should only be used on the older files, for most uses you
+ *  should call {@link WordExtractor} which deals properly
+ *  with HWPF.
+ *
+ * @author Nick Burch
+ */
+public final class Word6Extractor extends POIOLE2TextExtractor {
+    /** The document the text gets pulled from */
+    private final HWPFOldDocument doc;
+
+    /**
+     * Create a new Word Extractor
+     * @param is InputStream containing the word file
+     * @throws IllegalArgumentException If the stream really holds a RTF file
+     * @throws IOException If the stream can't be read
+     */
+    public Word6Extractor(InputStream is) throws IOException {
+        // Go via the shared helper, so RTF files pretending to be
+        //  Word files are reported the same way as by the documents
+        this( HWPFOldDocument.verifyAndBuildPOIFS(is) );
+    }
+
+    /**
+     * Create a new Word Extractor
+     * @param fs POIFSFileSystem containing the word file
+     * @throws IOException If the document can't be read
+     */
+    public Word6Extractor(POIFSFileSystem fs) throws IOException {
+        this(fs.getRoot(), fs);
+    }
+
+    /**
+     * Create a new Word Extractor
+     * @param dir The directory holding the word file
+     * @param fs POIFSFileSystem containing the word file
+     * @throws IOException If the document can't be read
+     */
+    public Word6Extractor(DirectoryNode dir, POIFSFileSystem fs) throws IOException {
+        this(new HWPFOldDocument(dir,fs));
+    }
+
+    /**
+     * Create a new Word Extractor
+     * @param doc The HWPFOldDocument to extract from
+     */
+    public Word6Extractor(HWPFOldDocument doc) {
+        super(doc);
+        this.doc = doc;
+    }
+
+    /**
+     * @return the text of every entry of the document's contents,
+     *         joined in order, after passing each one through
+     *         {@link Range#stripFields(String)}
+     */
+    @Override
+    public String getText() {
+        StringBuilder text = new StringBuilder();
+        for(TextAndCHPX tchpx : doc.getContents()) {
+            text.append( Range.stripFields(tchpx.getText()) );
+        }
+        return text.toString();
+    }
+}
* You should use either getParagraphText() or getText() unless
* you have a strong reason otherwise.
*
- * @author Nick Burch (nick at torchbox dot com)
+ * @author Nick Burch
*/
public final class WordExtractor extends POIOLE2TextExtractor {
private POIFSFileSystem fs;
CharacterProperties props = CharacterSprmUncompressor.uncompressCHP(baseStyle, getGrpprl(), 0);
return props;
}
+
+ public String toString() {
+ return "CHPX from " + getStart() + " to " + getEnd() +
+ " (in bytes " + getStartBytes() + " to " + getEndBytes() + ")";
+ }
}
--- /dev/null
+/* ====================================================================
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+==================================================================== */
+
+package org.apache.poi.hwpf.model;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.poi.poifs.common.POIFSConstants;
+import org.apache.poi.util.LittleEndian;
+
+/**
+ * This class holds all of the character formatting
+ *  properties from Old (Word 6 / Word 95) documents.
+ * Unlike with Word 97+, it all gets held in the
+ *  same stream.
+ * In common with the rest of the old support, it
+ *  is read only
+ */
+public final class OldCHPBinTable
+{
+  /** List of character properties.*/
+  protected ArrayList<CHPX> _textRuns = new ArrayList<CHPX>();
+
+  /**
+   * Constructor used to read an old-style binTable
+   *  in from a Word document.
+   *
+   * @param documentStream The stream holding both the bin table
+   *         and the formatted disk pages it points to
+   * @param offset Where in the stream the bin table starts
+   * @param size The size of the bin table
+   * @param fcMin Passed on to each {@link CHPFormattedDiskPage}
+   * @param tpt The text pieces, passed on to each {@link CHPFormattedDiskPage}
+   */
+  public OldCHPBinTable(byte[] documentStream, int offset,
+                     int size, int fcMin, TextPieceTable tpt)
+  {
+    // Each entry of the bin table is a 2 byte page number
+    PlexOfCps binTable = new PlexOfCps(documentStream, offset, size, 2);
+
+    int length = binTable.length();
+    for (int x = 0; x < length; x++)
+    {
+      GenericPropertyNode node = binTable.getProperty(x);
+
+      // Read the page number as unsigned - a signed read would turn
+      //  pages from 32768 onwards into a negative stream offset
+      int pageNum = LittleEndian.getUShort(node.getBytes());
+      int pageOffset = POIFSConstants.SMALLER_BIG_BLOCK_SIZE * pageNum;
+
+      CHPFormattedDiskPage cfkp = new CHPFormattedDiskPage(documentStream,
+        pageOffset, fcMin, tpt);
+
+      // Collect every run held on this page
+      int fkpSize = cfkp.size();
+
+      for (int y = 0; y < fkpSize; y++)
+      {
+        _textRuns.add(cfkp.getCHPX(y));
+      }
+    }
+  }
+
+  /**
+   * @return the character runs, in the order they were read
+   *         from the bin table's pages
+   */
+  public List<CHPX> getTextRuns()
+  {
+    return _textRuns;
+  }
+}
import junit.framework.TestCase;
+import org.apache.poi.POIDataSamples;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.HWPFTestDataSamples;
+import org.apache.poi.hwpf.OldWordFileFormatException;
import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
-import org.apache.poi.POIDataSamples;
-
-import java.io.FileInputStream;
/**
* Test the different routes to extracting text
assertTrue(b.toString().contains("TestComment"));
}
+
+    /**
+     * Word 95 files should be rejected by {@link WordExtractor},
+     *  but give their text through {@link Word6Extractor}
+     */
+    public void testWord95() throws Exception {
+        final POIDataSamples samples = POIDataSamples.getDocumentInstance();
+
+        // Too old for the default
+        try {
+            extractor = new WordExtractor(samples.openResourceAsStream("Word95.doc"));
+            fail();
+        } catch(OldWordFileFormatException expected) {
+            // This is what should happen for the old format
+        }
+
+        // Can work with the special one
+        Word6Extractor oldExtractor =
+            new Word6Extractor(samples.openResourceAsStream("Word95.doc"));
+        final String extracted = oldExtractor.getText();
+
+        assertTrue(extracted.contains("The quick brown fox jumps over the lazy dog"));
+        assertTrue(extracted.contains("Paragraph 2"));
+        assertTrue(extracted.contains("Paragraph 3. Has some RED text and some BLUE BOLD text in it"));
+        assertTrue(extracted.contains("Last (4th) paragraph"));
+    }
+
+    /**
+     * Word 6 files should be rejected by {@link WordExtractor},
+     *  but give their text through {@link Word6Extractor}
+     */
+    public void testWord6() throws Exception {
+        final POIDataSamples samples = POIDataSamples.getDocumentInstance();
+
+        // Too old for the default
+        try {
+            extractor = new WordExtractor(samples.openResourceAsStream("Word6.doc"));
+            fail();
+        } catch(OldWordFileFormatException expected) {
+            // This is what should happen for the old format
+        }
+
+        // The special extractor copes with it
+        Word6Extractor oldExtractor =
+            new Word6Extractor(samples.openResourceAsStream("Word6.doc"));
+        final String extracted = oldExtractor.getText();
+
+        assertTrue(extracted.contains("The quick brown fox jumps over the lazy dog"));
+    }
}