<changes>
<release version="3.7-beta2" date="2010-??-??">
+ <action dev="POI-DEVELOPERS" type="add">Paragraph level as well as whole-file text extraction for Word 6/95 files through HWPF</action>
<action dev="POI-DEVELOPERS" type="add">Text Extraction support for older Word 6 and Word 95 files via HWPF</action>
<action dev="POI-DEVELOPERS" type="add">49508 - Allow the addition of paragraphs to XWPF Table Cells</action>
<action dev="POI-DEVELOPERS" type="fix">49446 - Don't consider 17.16.23 field codes as properly part of the paragraph's text</action>
import org.apache.poi.hwpf.model.DocumentProperties;
import org.apache.poi.hwpf.model.EscherRecordHolder;
import org.apache.poi.hwpf.model.FSPATable;
-import org.apache.poi.hwpf.model.FileInformationBlock;
import org.apache.poi.hwpf.model.FontTable;
import org.apache.poi.hwpf.model.GenericPropertyNode;
import org.apache.poi.hwpf.model.ListTables;
protected TextPieceTable _tpt;
- /** Contains formatting properties for text*/
- protected CHPBinTable _cbt;
-
- /** Contains formatting properties for paragraphs*/
- protected PAPBinTable _pbt;
-
- /** Contains formatting properties for sections.*/
- protected SectionTable _st;
-
- /** Holds styles for this document.*/
- protected StyleSheet _ss;
-
- /** Holds fonts for this document.*/
- protected FontTable _ft;
-
- /** Hold list tables */
- protected ListTables _lt;
-
/** Holds the save history for this document. */
protected SavedByTable _sbt;
}
}
- public StyleSheet getStyleSheet()
+ public TextPieceTable getTextTable()
{
- return _ss;
+ return _cft.getTextPieceTable();
}
- public FileInformationBlock getFileInformationBlock()
- {
- return _fib;
- }
public CPSplitCalculator getCPSplitCalculator()
{
return _cpSplit;
return length;
}
- public ListTables getListTables()
- {
- return _lt;
- }
-
/**
* Gets a reference to the saved-by table, which holds the save history for the document.
*
pfs.writeFilesystem(out);
}
- public CHPBinTable getCharacterTable()
- {
- return _cbt;
- }
-
- public PAPBinTable getParagraphTable()
- {
- return _pbt;
- }
-
- public SectionTable getSectionTable()
- {
- return _st;
- }
-
- public TextPieceTable getTextTable()
- {
- return _cft.getTextPieceTable();
- }
-
public byte[] getDataStream()
{
return _dataStream;
return _lt.addList(list.getListData(), list.getOverride());
}
- public FontTable getFontTable()
- {
- return _ft;
- }
-
public void delete(int start, int length)
{
Range r = new Range(start, start + length, this);
import org.apache.poi.EncryptedDocumentException;
import org.apache.poi.POIDocument;
+import org.apache.poi.hwpf.model.CHPBinTable;
import org.apache.poi.hwpf.model.FileInformationBlock;
+import org.apache.poi.hwpf.model.FontTable;
+import org.apache.poi.hwpf.model.ListTables;
+import org.apache.poi.hwpf.model.PAPBinTable;
+import org.apache.poi.hwpf.model.SectionTable;
+import org.apache.poi.hwpf.model.StyleSheet;
+import org.apache.poi.hwpf.model.TextPieceTable;
+import org.apache.poi.hwpf.usermodel.Range;
import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.DocumentEntry;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
/** The FIB */
protected FileInformationBlock _fib;
+ /** Holds styles for this document.*/
+ protected StyleSheet _ss;
+
+ /** Contains formatting properties for text*/
+ protected CHPBinTable _cbt;
+
+ /** Contains formatting properties for paragraphs*/
+ protected PAPBinTable _pbt;
+
+ /** Contains formatting properties for sections.*/
+ protected SectionTable _st;
+
+ /** Holds fonts for this document.*/
+ protected FontTable _ft;
+
+ /** Holds list tables */
+ protected ListTables _lt;
+
/** main document stream buffer*/
protected byte[] _mainStream;
}
}
+ /**
+ * Returns the range which covers the whole of the
+ * document, but excludes any headers and footers.
+ */
+ public abstract Range getRange();
+
+ public abstract TextPieceTable getTextTable();
+
+ public CHPBinTable getCharacterTable()
+ {
+ return _cbt;
+ }
+
+ public PAPBinTable getParagraphTable()
+ {
+ return _pbt;
+ }
+
+ public SectionTable getSectionTable()
+ {
+ return _st;
+ }
+
+ public StyleSheet getStyleSheet()
+ {
+ return _ss;
+ }
+
+ public ListTables getListTables()
+ {
+ return _lt;
+ }
+
+ public FontTable getFontTable()
+ {
+ return _ft;
+ }
+
public FileInformationBlock getFileInformationBlock()
{
return _fib;
import java.io.IOException;
import java.io.OutputStream;
-import java.util.ArrayList;
-import java.util.List;
-import org.apache.poi.hwpf.model.CHPX;
import org.apache.poi.hwpf.model.ComplexFileTable;
import org.apache.poi.hwpf.model.OldCHPBinTable;
+import org.apache.poi.hwpf.model.OldPAPBinTable;
+import org.apache.poi.hwpf.model.OldSectionTable;
import org.apache.poi.hwpf.model.PieceDescriptor;
import org.apache.poi.hwpf.model.TextPiece;
import org.apache.poi.hwpf.model.TextPieceTable;
+import org.apache.poi.hwpf.usermodel.Range;
import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.util.LittleEndian;
/**
* Provides very simple support for old (Word 6 / Word 95)
* files.
- * TODO Provide a way to get at the properties associated
- * with each block of text
*/
public class HWPFOldDocument extends HWPFDocumentCore {
- private List<TextAndCHPX> contents = new ArrayList<TextAndCHPX>();
+ private TextPieceTable tpt;
public HWPFOldDocument(POIFSFileSystem fs) throws IOException {
this(fs.getRoot(), fs);
super(directory, fs);
// Where are things?
+ int sedTableOffset = LittleEndian.getInt(_mainStream, 0x88);
+ int sedTableSize = LittleEndian.getInt(_mainStream, 0x8c);
int chpTableOffset = LittleEndian.getInt(_mainStream, 0xb8);
- int chpTableSize = LittleEndian.getInt(_mainStream, 0xbc);
+ int chpTableSize = LittleEndian.getInt(_mainStream, 0xbc);
+ int papTableOffset = LittleEndian.getInt(_mainStream, 0xc0);
+ int papTableSize = LittleEndian.getInt(_mainStream, 0xc4);
+ //int shfTableOffset = LittleEndian.getInt(_mainStream, 0x60);
+ //int shfTableSize = LittleEndian.getInt(_mainStream, 0x64);
int complexTableOffset = LittleEndian.getInt(_mainStream, 0x160);
// We need to get hold of the text that makes up the
// document, which might be regular or fast-saved
StringBuffer text = new StringBuffer();
- TextPieceTable tpt;
if(_fib.isFComplex()) {
ComplexFileTable cft = new ComplexFileTable(
_mainStream, _mainStream,
text.append( tp.getStringBuffer() );
}
} else {
+ // TODO Discover if these older documents can ever hold Unicode Strings?
+ // (We think not, because they seem to lack a Piece table)
// TODO Build the Piece Descriptor properly
- // TODO Can these old documents ever contain Unicode strings?
+ // (We have to fake it, as they don't seem to have a proper Piece table)
PieceDescriptor pd = new PieceDescriptor(new byte[] {0,0, 0,0,0,127, 0,0}, 0);
pd.setFilePosition(_fib.getFcMin());
+ // Generate a single Text Piece Table, with a single Text Piece
+ // which covers all the (8 bit only) text in the file
tpt = new TextPieceTable();
byte[] textData = new byte[_fib.getFcMac()-_fib.getFcMin()];
System.arraycopy(_mainStream, _fib.getFcMin(), textData, 0, textData.length);
}
// Now we can fetch the character and paragraph properties
- OldCHPBinTable chpTable = new OldCHPBinTable(
+ _cbt = new OldCHPBinTable(
_mainStream, chpTableOffset, chpTableSize,
_fib.getFcMin(), tpt
);
-
- // Finally build up runs
- for(CHPX chpx : chpTable.getTextRuns()) {
- String str = text.substring(chpx.getStart(), chpx.getEnd());
- contents.add(new TextAndCHPX(str,chpx));
- }
+ _pbt = new OldPAPBinTable(
+ _mainStream, papTableOffset, papTableSize,
+ _fib.getFcMin(), tpt
+ );
+ _st = new OldSectionTable(
+ _mainStream, sedTableOffset, sedTableSize,
+ _fib.getFcMin(), tpt
+ );
+ }
+
+ public Range getRange() {
+ // Life is easy when we have no footers, headers or unicode!
+ return new Range(
+ 0, _fib.getFcMac() - _fib.getFcMin(), this
+ );
+ }
+
+ public TextPieceTable getTextTable()
+ {
+ return tpt;
}
@Override
public void write(OutputStream out) throws IOException {
throw new IllegalStateException("Writing is not available for the older file formats");
}
-
- /**
- * Retrieves all our text, in order, along with the
- * CHPX information on each bit.
- * Every entry has the same formatting, but as yet
- * we've no way to tell what the formatting is...
- * Warnings - this will change as soon as we support
- * text formatting!
- */
- public List<TextAndCHPX> getContents() {
- return contents;
- }
-
- /**
- * Warnings - this will change as soon as we support
- * text formatting!
- */
- public static class TextAndCHPX {
- private String text;
- private CHPX chpx;
- private TextAndCHPX(String text, CHPX chpx) {
- this.text = text;
- this.chpx = chpx;
- }
- public String getText() {
- return text;
- }
- public CHPX getChpx() {
- return chpx;
- }
- }
}
import org.apache.poi.POIOLE2TextExtractor;
import org.apache.poi.hwpf.HWPFOldDocument;
-import org.apache.poi.hwpf.HWPFOldDocument.TextAndCHPX;
import org.apache.poi.hwpf.usermodel.Range;
import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
this.doc = doc;
}
- @Override
+ /**
+ * Get the text from the word file, as an array with one String
+ * per paragraph.
+ * <p>
+ * If building paragraphs from the document model fails, this falls
+ * back to returning one String per text piece, with every carriage
+ * return expanded to a carriage return plus line feed.
+ *
+ * @return the document text, one entry per paragraph (or one entry
+ * per text piece when the fallback is used)
+ */
+ public String[] getParagraphText() {
+ String[] ret;
+
+ // Extract using the model code
+ try {
+ Range r = doc.getRange();
+
+ ret = WordExtractor.getParagraphText(r);
+ } catch (Exception e) {
+ // Something's up with turning the text pieces into paragraphs
+ // Fall back to ripping out the text pieces
+ ret = new String[doc.getTextTable().getTextPieces().size()];
+ for(int i=0; i<ret.length; i++) {
+ ret[i] = doc.getTextTable().getTextPieces().get(i).getStringBuffer().toString();
+
+ // Fix the line endings. Strings are immutable, so the result of
+ // replaceAll has to be stored back or the fix-up is silently lost.
+ ret[i] = ret[i].replaceAll("\r", "\ufffe");
+ ret[i] = ret[i].replaceAll("\ufffe","\r\n");
+ }
+ }
+
+ return ret;
+ }
+
public String getText() {
StringBuffer text = new StringBuffer();
- for(TextAndCHPX tchpx : doc.getContents()) {
- text.append( Range.stripFields(tchpx.getText()) );
+
+ for(String t : getParagraphText()) {
+ text.append(t);
}
+
return text.toString();
}
}
package org.apache.poi.hwpf.extractor;
+import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
-import java.io.FileInputStream;
import java.io.UnsupportedEncodingException;
-import java.util.Iterator;
-import java.util.Arrays;
import java.util.ArrayList;
+import java.util.Arrays;
import org.apache.poi.POIOLE2TextExtractor;
import org.apache.poi.hwpf.HWPFDocument;
return getParagraphText(r);
}
- private String[] getParagraphText(Range r) {
+ protected static String[] getParagraphText(Range r) {
String[] ret;
ret = new String[r.numParagraphs()];
for (int i = 0; i < ret.length; i++) {
public String getTextFromPieces() {
StringBuffer textBuf = new StringBuffer();
- Iterator textPieces = doc.getTextTable().getTextPieces().iterator();
- while (textPieces.hasNext()) {
- TextPiece piece = (TextPiece) textPieces.next();
-
+ for(TextPiece piece : doc.getTextTable().getTextPieces()) {
String encoding = "Cp1252";
if (piece.isUnicode()) {
encoding = "UTF-16LE";
*
* @author Ryan Ackley
*/
-public final class CHPBinTable
+public class CHPBinTable
{
/** List of character properties.*/
protected ArrayList<CHPX> _textRuns = new ArrayList<CHPX>();
package org.apache.poi.hwpf.model;
-import java.util.ArrayList;
-import java.util.List;
-
import org.apache.poi.poifs.common.POIFSConstants;
import org.apache.poi.util.LittleEndian;
* In common with the rest of the old support, it
* is read only
*/
-public final class OldCHPBinTable
+public final class OldCHPBinTable extends CHPBinTable
{
- /** List of character properties.*/
- protected ArrayList<CHPX> _textRuns = new ArrayList<CHPX>();
-
/**
* Constructor used to read an old-style binTable
* in from a Word document.
}
}
}
-
- public List<CHPX> getTextRuns()
- {
- return _textRuns;
- }
}
--- /dev/null
+/* ====================================================================
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+==================================================================== */
+
+package org.apache.poi.hwpf.model;
+
+import org.apache.poi.poifs.common.POIFSConstants;
+import org.apache.poi.util.LittleEndian;
+
+/**
+ * This class holds all of the paragraph formatting
+ * properties from Old (Word 6 / Word 95) documents.
+ * Unlike with Word 97+, it all gets held in the
+ * same stream.
+ * In common with the rest of the old support, it
+ * is read only
+ */
+public final class OldPAPBinTable extends PAPBinTable
+{
+ public OldPAPBinTable(byte[] documentStream, int offset,
+ int size, int fcMin, TextPieceTable tpt)
+ {
+ PlexOfCps binTable = new PlexOfCps(documentStream, offset, size, 2);
+
+ int length = binTable.length();
+ for (int x = 0; x < length; x++)
+ {
+ GenericPropertyNode node = binTable.getProperty(x);
+
+ int pageNum = LittleEndian.getShort(node.getBytes());
+ int pageOffset = POIFSConstants.SMALLER_BIG_BLOCK_SIZE * pageNum;
+
+ PAPFormattedDiskPage pfkp = new PAPFormattedDiskPage(documentStream,
+ documentStream, pageOffset, fcMin, tpt);
+
+ int fkpSize = pfkp.size();
+
+ for (int y = 0; y < fkpSize; y++)
+ {
+ PAPX papx = pfkp.getPAPX(y);
+ _paragraphs.add(papx);
+ }
+ }
+ }
+}
+
--- /dev/null
+/* ====================================================================
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+==================================================================== */
+
+package org.apache.poi.hwpf.model;
+
+import org.apache.poi.util.LittleEndian;
+
+/**
+ * This class holds all of the section formatting
+ * properties from Old (Word 6 / Word 95) documents.
+ * Unlike with Word 97+, it all gets held in the
+ * same stream.
+ * In common with the rest of the old support, it
+ * is read only
+ */
+public final class OldSectionTable extends SectionTable
+{
+ public OldSectionTable(byte[] documentStream, int offset,
+ int size, int fcMin,
+ TextPieceTable tpt)
+ {
+ PlexOfCps sedPlex = new PlexOfCps(documentStream, offset, size, 12);
+
+ int length = sedPlex.length();
+
+ for (int x = 0; x < length; x++)
+ {
+ GenericPropertyNode node = sedPlex.getProperty(x);
+ SectionDescriptor sed = new SectionDescriptor(node.getBytes(), 0);
+
+ int fileOffset = sed.getFc();
+ int startAt = node.getStart();
+ int endAt = node.getEnd();
+
+ // check for the optimization
+ if (fileOffset == 0xffffffff)
+ {
+ _sections.add(new SEPX(sed, startAt, endAt, tpt, new byte[0]));
+ }
+ else
+ {
+ // The first short at the offset is the size of the grpprl.
+ int sepxSize = LittleEndian.getShort(documentStream, fileOffset);
+ byte[] buf = new byte[sepxSize];
+ fileOffset += LittleEndian.SHORT_SIZE;
+ System.arraycopy(documentStream, fileOffset, buf, 0, buf.length);
+ _sections.add(new SEPX(sed, startAt, endAt, tpt, buf));
+ }
+ }
+ }
+}
*
* @author Ryan Ackley
*/
-public final class PAPBinTable
+public class PAPBinTable
{
protected ArrayList<PAPX> _paragraphs = new ArrayList<PAPX>();
byte[] _dataStream;
{
return 0;
}
- return LittleEndian.getShort(buf);
+ if (buf.length == 1)
+ {
+ return (short)LittleEndian.getUnsignedByte(buf, 0);
+ }
+ return LittleEndian.getShort(buf);
}
public SprmBuffer getSprmBuf()
public ParagraphProperties getParagraphProperties(StyleSheet ss)
{
+ if(ss == null) {
+ // TODO Fix up for Word 6/95
+ return new ParagraphProperties();
+ }
+
short istd = getIstd();
ParagraphProperties baseStyle = ss.getParagraphStyle(istd);
ParagraphProperties props = ParagraphSprmUncompressor.uncompressPAP(baseStyle, getGrpprl(), 2);
/**
* @author Ryan Ackley
*/
-public final class SectionTable
+public class SectionTable
{
private static final int SED_SIZE = 12;
- protected ArrayList _sections = new ArrayList();
- protected List _text;
+ protected ArrayList<SEPX> _sections = new ArrayList<SEPX>();
+ protected List<TextPiece> _text;
/** So we can know if things are unicode or not */
private TextPieceTable tpt;
boolean matchAt = false;
boolean matchHalf = false;
for(int i=0; i<_sections.size(); i++) {
- SEPX s = (SEPX)_sections.get(i);
+ SEPX s = _sections.get(i);
if(s.getEnd() == mainEndsAt) {
matchAt = true;
} else if(s.getEndBytes() == mainEndsAt || s.getEndBytes() == mainEndsAt-1) {
if(! matchAt && matchHalf) {
System.err.println("Your document seemed to be mostly unicode, but the section definition was in bytes! Trying anyway, but things may well go wrong!");
for(int i=0; i<_sections.size(); i++) {
- SEPX s = (SEPX)_sections.get(i);
+ SEPX s = _sections.get(i);
GenericPropertyNode node = sedPlex.getProperty(i);
s.setStart( CPtoFC(node.getStart()) );
public void adjustForInsert(int listIndex, int length)
{
int size = _sections.size();
- SEPX sepx = (SEPX)_sections.get(listIndex);
+ SEPX sepx = _sections.get(listIndex);
sepx.setEnd(sepx.getEnd() + length);
for (int x = listIndex + 1; x < size; x++)
{
- sepx = (SEPX)_sections.get(x);
+ sepx = _sections.get(x);
sepx.setStart(sepx.getStart() + length);
sepx.setEnd(sepx.getEnd() + length);
}
for(int i=_text.size()-1; i>-1; i--)
{
- TP = (TextPiece)_text.get(i);
+ TP = _text.get(i);
if(CP >= TP.getCP()) break;
}
return FC;
}
- public ArrayList getSections()
+ public ArrayList<SEPX> getSections()
{
return _sections;
}
for (int x = 0; x < len; x++)
{
- SEPX sepx = (SEPX)_sections.get(x);
+ SEPX sepx = _sections.get(x);
byte[] grpprl = sepx.getGrpprl();
// write the sepx to the document stream. starts with a 2 byte size
import org.apache.poi.util.LittleEndian;
import org.apache.poi.hwpf.HWPFDocument;
+import org.apache.poi.hwpf.HWPFDocumentCore;
import org.apache.poi.hwpf.usermodel.CharacterRun;
import org.apache.poi.hwpf.usermodel.Paragraph;
protected int _end;
/** The document this range belongs to. */
- protected HWPFDocument _doc;
+ protected HWPFDocumentCore _doc;
/** Have we loaded the section indexes yet */
boolean _sectionRangeFound;
* @param doc
* The HWPFDocumentCore the range is based on.
*/
- public Range(int start, int end, HWPFDocument doc) {
+ public Range(int start, int end, HWPFDocumentCore doc) {
_start = start;
_end = end;
_doc = doc;
* The (signed) value that should be added to the FIB CCP fields
*/
protected void adjustFIB(int adjustment) {
+ assert (_doc instanceof HWPFDocument);
+
// update the FIB.CCPText field (this should happen once per adjustment,
// so we don't want it in
// adjustForInsert() or it would get updated multiple times if the range
// without this, OpenOffice.org (v. 2.2.x) does not see all the text in
// the document
- CPSplitCalculator cpS = _doc.getCPSplitCalculator();
+ CPSplitCalculator cpS = ((HWPFDocument)_doc).getCPSplitCalculator();
FileInformationBlock fib = _doc.getFileInformationBlock();
// Do for each affected part
return _end;
}
- protected HWPFDocument getDocument() {
+ protected HWPFDocumentCore getDocument() {
return _doc;
}
assertTrue(text.contains("Paragraph 2"));
assertTrue(text.contains("Paragraph 3. Has some RED text and some BLUE BOLD text in it"));
assertTrue(text.contains("Last (4th) paragraph"));
+
+ String[] tp = w6e.getParagraphText();
+ assertEquals(7, tp.length);
+ assertEquals("The quick brown fox jumps over the lazy dog\r\n", tp[0]);
+ assertEquals("\r\n", tp[1]);
+ assertEquals("Paragraph 2\r\n", tp[2]);
+ assertEquals("\r\n", tp[3]);
+ assertEquals("Paragraph 3. Has some RED text and some BLUE BOLD text in it.\r\n", tp[4]);
+ assertEquals("\r\n", tp[5]);
+ assertEquals("Last (4th) paragraph.\r\n", tp[6]);
}
public void testWord6() throws Exception {
String text = w6e.getText();
assertTrue(text.contains("The quick brown fox jumps over the lazy dog"));
+
+ String[] tp = w6e.getParagraphText();
+ assertEquals(1, tp.length);
+ assertEquals("The quick brown fox jumps over the lazy dog\r\n", tp[0]);
}
}