From 999aecbaa1e9b04cc596e3d96ba6849cba85df6c Mon Sep 17 00:00:00 2001 From: Nick Burch Date: Wed, 30 Jun 2010 15:13:10 +0000 Subject: [PATCH] Basic text extraction support for old Word 6 and Word 95 documents via some HWPF extensions git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@959346 13f79535-47bb-0310-9956-ffa450edef68 --- .../src/org/apache/poi/hwpf/HWPFDocument.java | 105 +++++--------- .../org/apache/poi/hwpf/HWPFDocumentCore.java | 130 +++++++++++++++++ .../org/apache/poi/hwpf/HWPFOldDocument.java | 135 ++++++++++++++++++ .../poi/hwpf/OldWordFileFormatException.java | 25 ++++ .../poi/hwpf/extractor/Word6Extractor.java | 79 ++++++++++ .../poi/hwpf/extractor/WordExtractor.java | 2 +- .../src/org/apache/poi/hwpf/model/CHPX.java | 5 + .../apache/poi/hwpf/model/OldCHPBinTable.java | 77 ++++++++++ .../poi/hwpf/extractor/TestWordExtractor.java | 43 +++++- test-data/document/Word6.doc | Bin 0 -> 6656 bytes test-data/document/Word95.doc | Bin 0 -> 102400 bytes 11 files changed, 530 insertions(+), 71 deletions(-) create mode 100644 src/scratchpad/src/org/apache/poi/hwpf/HWPFDocumentCore.java create mode 100644 src/scratchpad/src/org/apache/poi/hwpf/HWPFOldDocument.java create mode 100644 src/scratchpad/src/org/apache/poi/hwpf/OldWordFileFormatException.java create mode 100644 src/scratchpad/src/org/apache/poi/hwpf/extractor/Word6Extractor.java create mode 100644 src/scratchpad/src/org/apache/poi/hwpf/model/OldCHPBinTable.java create mode 100644 test-data/document/Word6.doc create mode 100644 test-data/document/Word95.doc diff --git a/src/scratchpad/src/org/apache/poi/hwpf/HWPFDocument.java b/src/scratchpad/src/org/apache/poi/hwpf/HWPFDocument.java index f0d8b1d8ee..bd31f6253d 100644 --- a/src/scratchpad/src/org/apache/poi/hwpf/HWPFDocument.java +++ b/src/scratchpad/src/org/apache/poi/hwpf/HWPFDocument.java @@ -17,26 +17,43 @@ package org.apache.poi.hwpf; -import java.io.InputStream; +import java.io.ByteArrayInputStream; import java.io.FileInputStream; import 
java.io.FileNotFoundException; -import java.io.PushbackInputStream; import java.io.IOException; +import java.io.InputStream; import java.io.OutputStream; -import java.io.ByteArrayInputStream; - import java.util.Iterator; -import org.apache.poi.EncryptedDocumentException; -import org.apache.poi.POIDocument; +import org.apache.poi.hwpf.model.CHPBinTable; +import org.apache.poi.hwpf.model.CPSplitCalculator; +import org.apache.poi.hwpf.model.ComplexFileTable; +import org.apache.poi.hwpf.model.DocumentProperties; +import org.apache.poi.hwpf.model.EscherRecordHolder; +import org.apache.poi.hwpf.model.FSPATable; +import org.apache.poi.hwpf.model.FileInformationBlock; +import org.apache.poi.hwpf.model.FontTable; +import org.apache.poi.hwpf.model.GenericPropertyNode; +import org.apache.poi.hwpf.model.ListTables; +import org.apache.poi.hwpf.model.PAPBinTable; +import org.apache.poi.hwpf.model.PicturesTable; +import org.apache.poi.hwpf.model.PlexOfCps; +import org.apache.poi.hwpf.model.PropertyNode; +import org.apache.poi.hwpf.model.RevisionMarkAuthorTable; +import org.apache.poi.hwpf.model.SavedByTable; +import org.apache.poi.hwpf.model.SectionTable; +import org.apache.poi.hwpf.model.ShapesTable; +import org.apache.poi.hwpf.model.StyleSheet; +import org.apache.poi.hwpf.model.TextPiece; +import org.apache.poi.hwpf.model.TextPieceTable; +import org.apache.poi.hwpf.model.io.HWPFFileSystem; +import org.apache.poi.hwpf.model.io.HWPFOutputStream; +import org.apache.poi.hwpf.usermodel.HWPFList; +import org.apache.poi.hwpf.usermodel.Range; +import org.apache.poi.poifs.common.POIFSConstants; import org.apache.poi.poifs.filesystem.DirectoryNode; -import org.apache.poi.poifs.filesystem.POIFSFileSystem; import org.apache.poi.poifs.filesystem.DocumentEntry; -import org.apache.poi.poifs.common.POIFSConstants; - -import org.apache.poi.hwpf.model.*; -import org.apache.poi.hwpf.model.io.*; -import org.apache.poi.hwpf.usermodel.*; +import org.apache.poi.poifs.filesystem.POIFSFileSystem; /** 
@@ -46,17 +63,11 @@ import org.apache.poi.hwpf.usermodel.*; * * @author Ryan Ackley */ -public final class HWPFDocument extends POIDocument -// implements Cloneable +public final class HWPFDocument extends HWPFDocumentCore { - /** The FIB */ - protected FileInformationBlock _fib; /** And for making sense of CP lengths in the FIB */ protected CPSplitCalculator _cpSplit; - /** main document stream buffer*/ - protected byte[] _mainStream; - /** table stream buffer*/ protected byte[] _tableStream; @@ -110,29 +121,7 @@ public final class HWPFDocument extends POIDocument protected HWPFDocument() { - super(null, null); - } - - /** - * Takens an InputStream, verifies that it's not RTF, builds a - * POIFSFileSystem from it, and returns that. - */ - public static POIFSFileSystem verifyAndBuildPOIFS(InputStream istream) throws IOException { - // Open a PushbackInputStream, so we can peek at the first few bytes - PushbackInputStream pis = new PushbackInputStream(istream,6); - byte[] first6 = new byte[6]; - pis.read(first6); - - // Does it start with {\rtf ? If so, it's really RTF - if(first6[0] == '{' && first6[1] == '\\' && first6[2] == 'r' - && first6[3] == 't' && first6[4] == 'f') { - throw new IllegalArgumentException("The document is really a RTF file"); - } - - // OK, so it's not RTF - // Open a POIFSFileSystem on the (pushed back) stream - pis.unread(first6); - return new POIFSFileSystem(pis); + super(); } /** @@ -171,21 +160,16 @@ public final class HWPFDocument extends POIDocument */ public HWPFDocument(DirectoryNode directory, POIFSFileSystem pfilesystem) throws IOException { - // Sort out the hpsf properties + // Load the main stream and FIB + // Also handles HPSF bits super(directory, pfilesystem); - // read in the main stream. 
- DocumentEntry documentProps = (DocumentEntry) - directory.getEntry("WordDocument"); - _mainStream = new byte[documentProps.getSize()]; - - directory.createDocumentInputStream("WordDocument").read(_mainStream); - - // Create our FIB, and check for the doc being encrypted - _fib = new FileInformationBlock(_mainStream); + // Do the CP Split _cpSplit = new CPSplitCalculator(_fib); - if(_fib.isFEncrypted()) { - throw new EncryptedDocumentException("Cannot process encrypted word files!"); + + // Is this document too old for us? + if(_fib.getNFib() < 106) { + throw new OldWordFileFormatException("The document is too old (Word 95 or older) "); } // use the fib to determine the name of the table stream. @@ -691,17 +675,4 @@ public final class HWPFDocument extends POIDocument t.printStackTrace(); } } - -// public Object clone() -// throws CloneNotSupportedException -// { -// _tpt; -// -// _cbt; -// -// _pbt; -// -// _st; -// -// } } diff --git a/src/scratchpad/src/org/apache/poi/hwpf/HWPFDocumentCore.java b/src/scratchpad/src/org/apache/poi/hwpf/HWPFDocumentCore.java new file mode 100644 index 0000000000..af17cc2ed2 --- /dev/null +++ b/src/scratchpad/src/org/apache/poi/hwpf/HWPFDocumentCore.java @@ -0,0 +1,130 @@ +/* ==================================================================== + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+   See the License for the specific language governing permissions and
+   limitations under the License.
+==================================================================== */
+
+package org.apache.poi.hwpf;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.PushbackInputStream;
+
+import org.apache.poi.EncryptedDocumentException;
+import org.apache.poi.POIDocument;
+import org.apache.poi.hwpf.model.FileInformationBlock;
+import org.apache.poi.poifs.filesystem.DirectoryNode;
+import org.apache.poi.poifs.filesystem.DocumentEntry;
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;
+
+
+/**
+ * This class holds much of the core of a Word document, but
+ *  without some of the table structure information.
+ * It loads the "WordDocument" stream into memory, builds the
+ *  {@link FileInformationBlock} from it and rejects encrypted files.
+ * You generally want to work with one of
+ *  {@link HWPFDocument} or {@link HWPFOldDocument}
+ */
+public abstract class HWPFDocumentCore extends POIDocument
+{
+  /** Name of the POIFS entry holding the main document stream */
+  private static final String MAIN_STREAM_NAME = "WordDocument";
+
+  /** How many leading bytes are peeked at when checking for RTF */
+  private static final int SNIFF_LENGTH = 6;
+
+  /** The FIB */
+  protected FileInformationBlock _fib;
+
+  /** main document stream buffer*/
+  protected byte[] _mainStream;
+
+  protected HWPFDocumentCore()
+  {
+     super(null, null);
+  }
+
+  /**
+   * Takes an InputStream, verifies that it's not RTF, builds a
+   *  POIFSFileSystem from it, and returns that.
+   * Up to six bytes are peeked at, and exactly the bytes that were
+   *  read are pushed back before the POIFSFileSystem is built.
+   *
+   * @param istream The InputStream to check and then load from
+   * @return the POIFSFileSystem built from the stream
+   * @throws IllegalArgumentException if the stream starts with {\rtf
+   * @throws IOException If there is an unexpected IOException from the passed
+   *         in InputStream.
+   */
+  public static POIFSFileSystem verifyAndBuildPOIFS(InputStream istream) throws IOException {
+    // Open a PushbackInputStream, so we can peek at the first few bytes
+    PushbackInputStream pis = new PushbackInputStream(istream, SNIFF_LENGTH);
+    byte[] header = new byte[SNIFF_LENGTH];
+    // A single read() may return fewer bytes than asked for, so
+    //  keep track of how many we really got
+    int headerLength = fillBuffer(pis, header);
+
+    // Does it start with {\rtf ? If so, it's really RTF
+    if(headerLength >= 5
+        && header[0] == '{' && header[1] == '\\' && header[2] == 'r'
+        && header[3] == 't' && header[4] == 'f') {
+      throw new IllegalArgumentException("The document is really a RTF file");
+    }
+
+    // OK, so it's not RTF
+    // Push back only what was actually read, so that a short stream
+    //  doesn't gain padding bytes it never contained
+    pis.unread(header, 0, headerLength);
+    return new POIFSFileSystem(pis);
+  }
+
+  /**
+   * Reads from the stream until the buffer is full or the stream ends.
+   *
+   * @return the number of bytes placed into the buffer
+   */
+  private static int fillBuffer(InputStream in, byte[] buffer) throws IOException {
+    int total = 0;
+    while(total < buffer.length) {
+      int count = in.read(buffer, total, buffer.length - total);
+      if(count < 0) {
+        break;
+      }
+      total += count;
+    }
+    return total;
+  }
+
+  /**
+   * This constructor loads a Word document from an InputStream.
+   *
+   * @param istream The InputStream that contains the Word document.
+   * @throws IOException If there is an unexpected IOException from the passed
+   *         in InputStream.
+   */
+  public HWPFDocumentCore(InputStream istream) throws IOException
+  {
+    //do Ole stuff
+    this( verifyAndBuildPOIFS(istream) );
+  }
+
+  /**
+   * This constructor loads a Word document from a POIFSFileSystem
+   *
+   * @param pfilesystem The POIFSFileSystem that contains the Word document.
+   * @throws IOException If there is an unexpected IOException from the passed
+   *         in POIFSFileSystem.
+   */
+  public HWPFDocumentCore(POIFSFileSystem pfilesystem) throws IOException
+  {
+    this(pfilesystem.getRoot(), pfilesystem);
+  }
+
+  /**
+   * This constructor loads a Word document from a specific point
+   *  in a POIFSFileSystem, probably not the default.
+   * Used typically to open embedded documents.
+   *
+   * @param directory The directory that holds the "WordDocument" entry.
+   * @param pfilesystem The POIFSFileSystem that contains the Word document.
+   * @throws EncryptedDocumentException if the FIB marks the document as encrypted
+   * @throws IOException If there is an unexpected IOException from the passed
+   *         in POIFSFileSystem, or the main stream ends early.
+   */
+  public HWPFDocumentCore(DirectoryNode directory, POIFSFileSystem pfilesystem) throws IOException
+  {
+    // Sort out the hpsf properties
+    super(directory, pfilesystem);
+
+    // read in the main stream.
+    DocumentEntry documentProps = (DocumentEntry)
+       directory.getEntry(MAIN_STREAM_NAME);
+    _mainStream = new byte[documentProps.getSize()];
+
+    InputStream mainIn = directory.createDocumentInputStream(MAIN_STREAM_NAME);
+    try {
+      int loaded = fillBuffer(mainIn, _mainStream);
+      if(loaded != _mainStream.length) {
+        // Fail here, rather than parsing a half-empty buffer later on
+        throw new IOException("WordDocument stream ended after " + loaded
+            + " of " + _mainStream.length + " bytes");
+      }
+    } finally {
+      mainIn.close();
+    }
+
+    // Create our FIB, and check for the doc being encrypted
+    _fib = new FileInformationBlock(_mainStream);
+    if(_fib.isFEncrypted()) {
+      throw new EncryptedDocumentException("Cannot process encrypted word files!");
+    }
+  }
+
+  /**
+   * @return the FIB built from the main stream, or null if this
+   *         document was created through the no-arg constructor
+   */
+  public FileInformationBlock getFileInformationBlock()
+  {
+    return _fib;
+  }
+}
diff --git a/src/scratchpad/src/org/apache/poi/hwpf/HWPFOldDocument.java b/src/scratchpad/src/org/apache/poi/hwpf/HWPFOldDocument.java
new file mode 100644
index 0000000000..42cff2ace8
--- /dev/null
+++ b/src/scratchpad/src/org/apache/poi/hwpf/HWPFOldDocument.java
@@ -0,0 +1,135 @@
+/* ====================================================================
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements. See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License. You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+==================================================================== */
+package org.apache.poi.hwpf;
+
+import java.io.IOException;
+import java.io.OutputStream;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.poi.hwpf.model.CHPX;
+import org.apache.poi.hwpf.model.ComplexFileTable;
+import org.apache.poi.hwpf.model.OldCHPBinTable;
+import org.apache.poi.hwpf.model.PieceDescriptor;
+import org.apache.poi.hwpf.model.TextPiece;
+import org.apache.poi.hwpf.model.TextPieceTable;
+import org.apache.poi.poifs.filesystem.DirectoryNode;
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;
+import org.apache.poi.util.LittleEndian;
+
+/**
+ * Provides very simple support for old (Word 6 / Word 95)
+ *  files.
+ * The text is read from the main stream, then cut up into
+ *  {@link TextAndCHPX} entries, one for each CHPX found.
+ * TODO Provide a way to get at the properties associated
+ *  with each block of text
+ */
+public class HWPFOldDocument extends HWPFDocumentCore {
+    /** Where in the main stream the offset of the CHP bin table is stored */
+    private static final int CHP_TABLE_OFFSET_POS = 0xb8;
+    /** Where in the main stream the size of the CHP bin table is stored */
+    private static final int CHP_TABLE_SIZE_POS = 0xbc;
+    /** Where in the main stream the offset of the complex table is stored */
+    private static final int COMPLEX_TABLE_OFFSET_POS = 0x160;
+
+    /** The text of the document, one entry for each CHPX, in order */
+    private final List<TextAndCHPX> contents = new ArrayList<TextAndCHPX>();
+
+    /**
+     * Loads an old Word document from the root of the given filesystem.
+     *
+     * @param fs The POIFSFileSystem that contains the Word document.
+     * @throws IOException If the document can't be read
+     */
+    public HWPFOldDocument(POIFSFileSystem fs) throws IOException {
+        this(fs.getRoot(), fs);
+    }
+
+    /**
+     * Loads an old Word document from a specific directory
+     *  of the given filesystem.
+     *
+     * @param directory The directory that holds the "WordDocument" entry.
+     * @param fs The POIFSFileSystem that contains the Word document.
+     * @throws IOException If the document can't be read
+     */
+    public HWPFOldDocument(DirectoryNode directory, POIFSFileSystem fs)
+            throws IOException {
+        super(directory, fs);
+
+        // Where are things?
+        int chpTableOffset = LittleEndian.getInt(_mainStream, CHP_TABLE_OFFSET_POS);
+        int chpTableSize = LittleEndian.getInt(_mainStream, CHP_TABLE_SIZE_POS);
+        int complexTableOffset = LittleEndian.getInt(_mainStream, COMPLEX_TABLE_OFFSET_POS);
+
+        // We need to get hold of the text that makes up the
+        //  document, which might be regular or fast-saved
+        // (Only used locally, so no need for a synchronised buffer)
+        StringBuilder text = new StringBuilder();
+        TextPieceTable tpt;
+        if(_fib.isFComplex()) {
+            // The old formats hold everything in the one stream, so
+            //  the main stream is passed in as the table stream too
+            ComplexFileTable cft = new ComplexFileTable(
+                    _mainStream, _mainStream,
+                    complexTableOffset, _fib.getFcMin()
+            );
+            tpt = cft.getTextPieceTable();
+
+            for(TextPiece tp : tpt.getTextPieces()) {
+                text.append( tp.getStringBuffer() );
+            }
+        } else {
+            // TODO Build the Piece Descriptor properly
+            // TODO Can these old documents ever contain Unicode strings?
+            PieceDescriptor pd = new PieceDescriptor(new byte[] {0,0, 0,0,0,127, 0,0}, 0);
+            pd.setFilePosition(_fib.getFcMin());
+
+            // Treat everything between fcMin and fcMac as a single piece
+            tpt = new TextPieceTable();
+            byte[] textData = new byte[_fib.getFcMac()-_fib.getFcMin()];
+            System.arraycopy(_mainStream, _fib.getFcMin(), textData, 0, textData.length);
+            TextPiece tp = new TextPiece(
+                    0, textData.length, textData, pd, 0
+            );
+            tpt.getTextPieces().add(tp);
+
+            text.append(tp.getStringBuffer());
+        }
+
+        // Now we can fetch the character and paragraph properties
+        OldCHPBinTable chpTable = new OldCHPBinTable(
+                _mainStream, chpTableOffset, chpTableSize,
+                _fib.getFcMin(), tpt
+        );
+
+        // Finally build up runs
+        for(CHPX chpx : chpTable.getTextRuns()) {
+            String str = text.substring(chpx.getStart(), chpx.getEnd());
+            contents.add(new TextAndCHPX(str,chpx));
+        }
+    }
+
+    /**
+     * Not supported, the old formats are read only.
+     *
+     * @throws IllegalStateException always
+     */
+    @Override
+    public void write(OutputStream out) throws IOException {
+        throw new IllegalStateException("Writing is not available for the older file formats");
+    }
+
+    /**
+     * Retrieves all our text, in order, along with the
+     *  CHPX information on each bit.
+     * Every entry has the same formatting, but as yet
+     *  we've no way to tell what the formatting is...
+     * Warning - this will change as soon as we support
+     *  text formatting!
+     *
+     * @return the list held by this document, not a copy
+     */
+    public List<TextAndCHPX> getContents() {
+        return contents;
+    }
+
+    /**
+     * A piece of text, along with the CHPX it was cut out with.
+     * Warning - this will change as soon as we support
+     *  text formatting!
+     */
+    public static class TextAndCHPX {
+        private final String text;
+        private final CHPX chpx;
+        private TextAndCHPX(String text, CHPX chpx) {
+            this.text = text;
+            this.chpx = chpx;
+        }
+        public String getText() {
+            return text;
+        }
+        public CHPX getChpx() {
+            return chpx;
+        }
+    }
+}
diff --git a/src/scratchpad/src/org/apache/poi/hwpf/OldWordFileFormatException.java b/src/scratchpad/src/org/apache/poi/hwpf/OldWordFileFormatException.java
new file mode 100644
index 0000000000..cfa97bc70e
--- /dev/null
+++ b/src/scratchpad/src/org/apache/poi/hwpf/OldWordFileFormatException.java
@@ -0,0 +1,25 @@
+/* ====================================================================
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements. See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License. You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+==================================================================== */
+package org.apache.poi.hwpf;
+
+import org.apache.poi.OldFileFormatException;
+
+/**
+ * The {@link OldFileFormatException} flavour used for
+ *  Word documents.
+ */
+public class OldWordFileFormatException
+        extends OldFileFormatException
+{
+    /**
+     * @param s the message, handed on unchanged to the parent exception
+     */
+    public OldWordFileFormatException(String s)
+    {
+        super(s);
+    }
+}
diff --git a/src/scratchpad/src/org/apache/poi/hwpf/extractor/Word6Extractor.java b/src/scratchpad/src/org/apache/poi/hwpf/extractor/Word6Extractor.java
new file mode 100644
index 0000000000..3ea6d42d46
--- /dev/null
+++ b/src/scratchpad/src/org/apache/poi/hwpf/extractor/Word6Extractor.java
@@ -0,0 +1,79 @@
+/* ====================================================================
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements. See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License. You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+==================================================================== */
+
+package org.apache.poi.hwpf.extractor;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.apache.poi.POIOLE2TextExtractor;
+import org.apache.poi.hwpf.HWPFOldDocument;
+import org.apache.poi.hwpf.HWPFOldDocument.TextAndCHPX;
+import org.apache.poi.hwpf.usermodel.Range;
+import org.apache.poi.poifs.filesystem.DirectoryNode;
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;
+
+/**
+ * Class to extract the text from old (Word 6 / Word 95) Word Documents.
+ *
+ * This should only be used on the older files, for most uses you
+ *  should call {@link WordExtractor} which deals properly
+ *  with HWPF.
+ *
+ * @author Nick Burch
+ */
+public final class Word6Extractor extends POIOLE2TextExtractor {
+    /** The document that the text is pulled from */
+    private final HWPFOldDocument doc;
+
+    /**
+     * Create a new Word Extractor
+     * @param is InputStream containing the word file
+     * @throws IOException If the document can't be read
+     */
+    public Word6Extractor(InputStream is) throws IOException {
+        this( new POIFSFileSystem(is) );
+    }
+
+    /**
+     * Create a new Word Extractor
+     * @param fs POIFSFileSystem containing the word file
+     * @throws IOException If the document can't be read
+     */
+    public Word6Extractor(POIFSFileSystem fs) throws IOException {
+        this(fs.getRoot(), fs);
+    }
+
+    /**
+     * Create a new Word Extractor, for a document held in a
+     *  specific directory of the filesystem
+     * @param dir The directory that holds the word file
+     * @param fs POIFSFileSystem containing the word file
+     * @throws IOException If the document can't be read
+     */
+    public Word6Extractor(DirectoryNode dir, POIFSFileSystem fs) throws IOException {
+        this(new HWPFOldDocument(dir,fs));
+    }
+
+    /**
+     * Create a new Word Extractor
+     * @param doc The HWPFOldDocument to extract from
+     */
+    public Word6Extractor(HWPFOldDocument doc) {
+        super(doc);
+        this.doc = doc;
+    }
+
+    /**
+     * Joins up the text of every entry in the document, in order,
+     *  passing each one through {@link Range#stripFields(String)} first.
+     */
+    @Override
+    public String getText() {
+        // Only used locally, so no need for a synchronised buffer
+        StringBuilder text = new StringBuilder();
+        for(TextAndCHPX tchpx : doc.getContents()) {
+            text.append( Range.stripFields(tchpx.getText()) );
+        }
+        return text.toString();
+    }
+}
diff --git a/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java b/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java
index 1be78ab1a6..fe57f7c474 100644
--- 
a/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java
@@ -40,7 +40,7 @@ import org.apache.poi.poifs.filesystem.POIFSFileSystem;
  * You should use either getParagraphText() or getText() unless
  * you have a strong reason otherwise.
  *
- * @author Nick Burch (nick at torchbox dot com)
+ * @author Nick Burch
  */
 public final class WordExtractor extends POIOLE2TextExtractor {
     private POIFSFileSystem fs;
diff --git a/src/scratchpad/src/org/apache/poi/hwpf/model/CHPX.java b/src/scratchpad/src/org/apache/poi/hwpf/model/CHPX.java
index b78cdffc57..f56621afad 100644
--- a/src/scratchpad/src/org/apache/poi/hwpf/model/CHPX.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/model/CHPX.java
@@ -61,4 +61,9 @@ public final class CHPX extends BytePropertyNode
     CharacterProperties props = CharacterSprmUncompressor.uncompressCHP(baseStyle, getGrpprl(), 0);
     return props;
   }
+
+  public String toString() {
+      return "CHPX from " + getStart() + " to " + getEnd() +
+         " (in bytes " + getStartBytes() + " to " + getEndBytes() + ")";
+  }
 }
diff --git a/src/scratchpad/src/org/apache/poi/hwpf/model/OldCHPBinTable.java b/src/scratchpad/src/org/apache/poi/hwpf/model/OldCHPBinTable.java
new file mode 100644
index 0000000000..3c97652e55
--- /dev/null
+++ b/src/scratchpad/src/org/apache/poi/hwpf/model/OldCHPBinTable.java
@@ -0,0 +1,77 @@
+/* ====================================================================
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements. See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License. You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+==================================================================== */
+
+package org.apache.poi.hwpf.model;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.poi.poifs.common.POIFSConstants;
+import org.apache.poi.util.LittleEndian;
+
+/**
+ * This class holds all of the character formatting
+ *  properties from Old (Word 6 / Word 95) documents.
+ * Unlike with Word 97+, it all gets held in the
+ *  same stream.
+ * In common with the rest of the old support, it
+ *  is read only
+ */
+public final class OldCHPBinTable
+{
+  /** List of character properties.*/
+  protected ArrayList<CHPX> _textRuns = new ArrayList<CHPX>();
+
+  /**
+   * Constructor used to read an old-style binTable
+   * in from a Word document.
+   *
+   * @param documentStream the stream that holds both the bin table
+   *         and the pages that it points to
+   * @param offset where in the stream the bin table starts
+   * @param size the size of the bin table
+   * @param fcMin passed on to each {@link CHPFormattedDiskPage}
+   * @param tpt passed on to each {@link CHPFormattedDiskPage}
+   */
+  public OldCHPBinTable(byte[] documentStream, int offset,
+                        int size, int fcMin, TextPieceTable tpt)
+  {
+    // Each entry in the bin table is a 2 byte page number
+    PlexOfCps binTable = new PlexOfCps(documentStream, offset, size, 2);
+
+    int length = binTable.length();
+    for (int x = 0; x < length; x++)
+    {
+      GenericPropertyNode node = binTable.getProperty(x);
+
+      // A page number can't be negative, so treat the 2 bytes as
+      //  unsigned, otherwise a large one turns into a negative offset
+      int pageNum = LittleEndian.getShort(node.getBytes()) & 0xFFFF;
+      int pageOffset = POIFSConstants.SMALLER_BIG_BLOCK_SIZE * pageNum;
+
+      CHPFormattedDiskPage cfkp = new CHPFormattedDiskPage(documentStream,
+        pageOffset, fcMin, tpt);
+
+      int fkpSize = cfkp.size();
+
+      for (int y = 0; y < fkpSize; y++)
+      {
+        _textRuns.add(cfkp.getCHPX(y));
+      }
+    }
+  }
+
+  /**
+   * @return every CHPX that was read, in the order the pages
+   *         listed them. This is the internal list, not a copy
+   */
+  public List<CHPX> getTextRuns()
+  {
+    return _textRuns;
+  }
+}
diff --git a/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordExtractor.java b/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordExtractor.java
index 47e0b431f6..9634ab9d40 100644
--- a/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordExtractor.java
+++ b/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordExtractor.java
@@ -19,13 +19,12 @@ package org.apache.poi.hwpf.extractor;
 
 import junit.framework.TestCase;
 
+import org.apache.poi.POIDataSamples;
 import org.apache.poi.hwpf.HWPFDocument;
 import org.apache.poi.hwpf.HWPFTestDataSamples;
+import org.apache.poi.hwpf.OldWordFileFormatException;
 import org.apache.poi.poifs.filesystem.DirectoryNode;
 import org.apache.poi.poifs.filesystem.POIFSFileSystem;
-import org.apache.poi.POIDataSamples;
-
-import java.io.FileInputStream;
 
 /**
  * Test the different routes to extracting text
@@ -237,4 +236,42 @@ public final class TestWordExtractor extends TestCase {
 
         assertTrue(b.toString().contains("TestComment"));
     }
+
+    public void testWord95() throws Exception {
+        // Too old for the default
+        try {
+            extractor = new WordExtractor(
POIDataSamples.getDocumentInstance().openResourceAsStream("Word95.doc") + ); + fail(); + } catch(OldWordFileFormatException e) {} + + // Can work with the special one + Word6Extractor w6e = new Word6Extractor( + POIDataSamples.getDocumentInstance().openResourceAsStream("Word95.doc") + ); + String text = w6e.getText(); + + assertTrue(text.contains("The quick brown fox jumps over the lazy dog")); + assertTrue(text.contains("Paragraph 2")); + assertTrue(text.contains("Paragraph 3. Has some RED text and some BLUE BOLD text in it")); + assertTrue(text.contains("Last (4th) paragraph")); + } + + public void testWord6() throws Exception { + // Too old for the default + try { + extractor = new WordExtractor( + POIDataSamples.getDocumentInstance().openResourceAsStream("Word6.doc") + ); + fail(); + } catch(OldWordFileFormatException e) {} + + Word6Extractor w6e = new Word6Extractor( + POIDataSamples.getDocumentInstance().openResourceAsStream("Word6.doc") + ); + String text = w6e.getText(); + + assertTrue(text.contains("The quick brown fox jumps over the lazy dog")); + } } diff --git a/test-data/document/Word6.doc b/test-data/document/Word6.doc new file mode 100644 index 0000000000000000000000000000000000000000..a614a0783f049de3e5ce5e61d6dedaa70ce3bcc9 GIT binary patch literal 6656 zcmeHLO>7%Q6n^X2wcWI-^JmhukRg<$(8LKXst`X?lA1IXG)0X?IZ%nUy^h_iy=J{` z@cJ(@QEvIgquirXvoUpl`V+5cP6cPr2ZYy)rwP&d#6^a6cAKi~%*0S18Wz#y;# z*aLEGI`Z<3PIO)V(=6NTa3W$F86#$iV}=@g;h^PYGSIq!=(L0*S)NE(kh36 zLB@dIza7qUBkQ$pg5iQJSt+m6AqWHW`PFE81N1Cf*v?D+bbK$AKK_IJd+-mn#Mgm6 zpvZmEQulVJmNd}B`LMN*F0V{nS$*jYMMf0T*Z18jINhM|E2XoOr zWlZU$FdZj~43JuM8LH+qLeo=gg-eJawoivMs&#sN!GgD>n$b)oZYE-uYGS-{8m9cv zTHzKV5Wme$yr!D!qN%3iV%C6<%h5=FNTD8OgxqtQ8ig~)H@}bi|0Mr1d6Z3b+)0t0 zBxuIRbb`DgZP^lGBMQ&Tg9r&R5h&TenDBKg!%81HUrl5z_*)z~702Jp;~!AdR>DZ} z_`TG>%)Po47f<(f1<@4CF{2ES>rg7HVL&HF?6SQt_KBQwC)vI{_7$J0@agOw)&aky z-~S=#Ly4r85%bz9anwkvDcXWs2QB21VMC|R5hj^76RIxB&)h@K6lZLT88B5mA0X0O 
z0jIbWCim0FXU|_gKMRR$cLF|%4B(3Hg_GW47Sg5Wu&!7uyW(pU;x>_L`TvlH95;Ws zc_FaFckUDT|6{kW22tm#s(TZ49H%mS2FJAAmwotJ^=SUzq<{1MN1CuIhFnikY~d*m zSdbJtfcK{)rRfOv_K9L;6aPK17Tb%*lre0BeZw}L0>1j)kw=3g6v;M&du5E>NN1L} z&n9$XS7tH)B&qW4?GRq!V$g^wYb;8jj02%&+7133Jn_42qw{B7y=(8cW0F^tg2*FM z5>37X)?`@KFxbNUIc_S&b*_6jH>Z#1THwDA{O`I3s^-+}XS4RNe1B_}*Lw8-g#TY% C!;k#{ literal 0 HcmV?d00001 diff --git a/test-data/document/Word95.doc b/test-data/document/Word95.doc new file mode 100644 index 0000000000000000000000000000000000000000..a214f292585a7f1c7a8358627186df8b77af0eb8 GIT binary patch literal 102400 zcmeI&32+s69S88==4cWqG+Z%-YPYCBIYNb6fm$UrP_Ph?qlkjK$$Lp2y!SSJ2jpRq z*xG^!1f+<_eT2495d=9zAqZtk2N8y;jHM1pIfQl?2*(hn$@=~8dpUebnsAh%_WNhP z+1>yC_rL%B?7rRY>Y=~4II&`5+kZ<`c~WAf!b+0+YShPI{;kU8WD&KjAr+QoRaO^b z28%485rOi^IMq2U!MTe^)HzQ;5|JdN3DOj~A8Cd(M;<_M2}ld1CDICMjkG~hkS`#q z$b(2*q#g1Q@4@=Jjy!>U1L=kIM*1L6BHu)wLcWE38|jOrBmI!4kqo3iG5~o78Hfx*zJolA3`T|^ znaEIN81h|YI5GkmiHt&?Lq;RtL&hNAM}B~eMaCgNM1F+)7%`A6BpY!cP9z7(Me-0A zG9Hg0ek6bdk%`D8qyPyalaZewFCbHppCVI{pCQB1lZh>L9Jv=$ zpS=ZEG_s`Nim7?2H)gY%_lw4~Z>SCTUoFs(m0DlZUyCfOx;V_=>%V_BPWAg&Q$I$8@%dCl42k$Qeyv=0qTA|#Fgzc{bo);A8q;^`V(o7b;~d(1U*h~zA=;~<&4^4Zw|f=K>OOt9N`P|y95c~1TLpeuWVp5-$qdG#E#Kp!9U zrP3Q&Lo6rcbFC_n)UP=Epypa2CZ zKmiI+fC3bt00k&O0SZun0u-PC1t>rP3Q&Lo6rcbFC_n)UP=Epypa2CZKmiI+fC3bt z00k&O0SZun0u-PC1t>rP3Q&Lo6rcbFC_n)UP=Epypa2CZKmiI+fC3bt00k&O0SZun z0u-PC1t>rP3Q&Lo6rcbFC_n)UP=Epypa2CZKmiI+fC3bt00k&O0SZun0u-PC1t>rP z3Q&Lo6rcbFC_n)UP=Epypa2CZKmiI+fC3bt00k&O0SZun0u;DSf%E6jqlePc(koZ4 z+-6tYyiS1|H*Rd+ycw(Z?AcRTSh!=yj(^#uq@?8Z>C<_6dFRfZ+qrY+iWMu+0E_F^ zt;4GH^z6pw2#7c~b&71)tEq+%9N<=|>-Z^a(AwiUJg%00r){Kv`MYwr$(6 zQ#>f0PN&D?Iez^3j2SbI9zB|qlYsZ ztXaBr>Bfy4_wCzfUB7OHLe`iuR#}%V)6}Wf-o2_vth#p1nl{aP=N)u~ zP0%f7u3x`Cb?Q`fh*6<`wad>=xpe7L)nKchlBkJ#P}cX*#IaXxe)VeAtabVFv6~Y8Ulw$a)G;fC3btz#SK8%zMkhg9p!?IdjK% z{dw-Nk?$>^XCfXu1t>rP3Q(X_v+fZQs6q)v8sf zS-Em0zKx*G^y$;_0)zQw%a&mgUtO+Vy^6OQw8STx_3PK;f7B={D#AkJ zJc4fsQ>IM8XPdoy_Z~ic7+-Ag;RT;}!uy&zb7m+MI(6z)on!kyt_*)#iTe8S@y8$I 
z^GMWJmzzQecZeR&o;`~Vae^*hyjZh`@D~^~K&e^Jc?wW~0u-PC1t>rP3Q&Lo6rcbF zC_n)UP=Epypuimz5EcqhfC3bt00k&O0SZun0u;FC1=Rn4=tYe3CFJMG%g8IpG-NvR z3uFfJOJpW83z>}+A#;$q$ghxj$g9YF4Tfk>4X5kd4S6kWI*DrP3Q&Lo6rcbF zC_n)UP=Epypa2CZKmiI+fC3bt00k&O0SZun0u-PC1t>rP3Q&Lo6rcbFC_n)UP=Epy zpa2CZKmiI+fC3bt00k&O0SZun0u-PC1t>rP3Q&Lo6rcbFC_n)UP=Epypa2CZKmiI+ zfC3bt00k&O0SZun0u-PC1t>rP3Q&Lo6rcbFC_n)UP=Epypa2CZK!G|0lADN(tfRi+ zsu+nC4gXl~rohNNr~Z7C*#P?KUQdbcdPStl2Qb zXXN^f{5-u!&4r$6`ar|4`%RBiAKpJh4>$_~y5V(%Yx)fy)nD&7ba15J<<(t*v}Vl) z8~%Xa>G43`qk4X2w`t8<%j!eji%&eYCNbf)#Dv$E$G@>We%`0*5i8Oa>4x-1`XIxQ z5l9v?0VzO2NFnkfQiK#EF{9KpKI=aG zjYXt6ZaTHjf93gB=eZ)h-^NdFd7RwvZVTX9lh9mJBoR-YXXOEV_nk$OFIh3tSrRgw zIY!VO5Uj^FBR#dQT2slAadDEJDn?RV9mDM)>3vJB71BW-OwaNgfyv!Q1aKoXd=7th zp3jvNFns8*x^$3=hfA$X4YqG@x6`!tk`ft}UVSefV0u-jsxM92YaJwEpwn=;ytyJx zq=mG(C@BS!sNE;y;wAeb(WPIqYNt!mNOhymG94jxj4h>&sFotFrF-%tS`V$Y#0_@& z0}>;xs`~Gz`hVKU54cRP>c6$LDNtKA*U)*YJf}&jJ&xg8s>BcSI-KY*4m~%IJlKoS zQ=EMV6|qp%?@!%JWAK9v-&tej1!71kwH$8?|?niBmj0kzMOt&=c zpqixnT!vdZsphRRU7oC`{k~ z%ztI!>V*R^^vL^3JIqbDm(*1dCoj!W)96}Mp>)J?)n6(s)hBipRa8yPyMKFkaaz0N zc?%MyOV_gX8K{eosvC`5sp-OzM mJ>Q&1uUpuEH|+O+ZsYbyZv9W?xo@n0MI+BsV@Gs5`~NS-d6QxQ literal 0 HcmV?d00001 -- 2.39.5