<!-- Don't forget to update status.xml too! -->
<release version="3.1.1-alpha1" date="2008-??-??">
+ <action dev="POI-DEVELOPERS" type="fix">Big improvement in how HWPF handles unicode text, and more sanity checking of text ranges within HWPF</action>
<action dev="POI-DEVELOPERS" type="add">Include headers and footers in the extracted text from HWPF's WordExtractor</action>
<action dev="POI-DEVELOPERS" type="add">Added support to HWPF for headers and footers</action>
<action dev="POI-DEVELOPERS" type="fix">Improve how HWPF deals with unicode internally. Should avoid some odd behaviour when manipulating unicode text</action>
<!-- Don't forget to update changes.xml too! -->
<changes>
<release version="3.1.1-alpha1" date="2008-??-??">
+ <action dev="POI-DEVELOPERS" type="fix">Big improvement in how HWPF handles unicode text, and more sanity checking of text ranges within HWPF</action>
<action dev="POI-DEVELOPERS" type="add">Include headers and footers in the extracted text from HWPF's WordExtractor</action>
<action dev="POI-DEVELOPERS" type="add">Added support to HWPF for headers and footers</action>
<action dev="POI-DEVELOPERS" type="fix">Improve how HWPF deals with unicode internally. Should avoid some odd behaviour when manipulating unicode text</action>
// read in the pictures stream
_pictures = new PicturesTable(this, _dataStream, _mainStream, _fspa, _dgg);
- _st = new SectionTable(_mainStream, _tableStream, _fib.getFcPlcfsed(), _fib.getLcbPlcfsed(), fcMin, getTextTable().getTextPieces());
+ _st = new SectionTable(_mainStream, _tableStream, _fib.getFcPlcfsed(), _fib.getLcbPlcfsed(), fcMin, _tpt, _cpSplit);
_ss = new StyleSheet(_tableStream, _fib.getFcStshf());
_ft = new FontTable(_tableStream, _fib.getFcSttbfffn(), _fib.getLcbSttbfffn());
generateCp(fcEnd, isUnicode),
buf
);
+ this.isUnicode = isUnicode;
}
private static int generateCp(int val, boolean isUnicode) {
if(isUnicode)
public void insert(int listIndex, int cpStart, SprmBuffer buf)
{
- boolean needsToBeUnicode = tpt.isUnicodeAt(cpStart);
+ boolean needsToBeUnicode = tpt.isUnicodeAtCharOffset(cpStart);
CHPX insertChpx = new CHPX(0, 0, buf, needsToBeUnicode);
for (int x = 0; x < _crun; x++)
{
- boolean isUnicode = tpt.isUnicodeAt( getStart(x) );
+ boolean isUnicode = tpt.isUnicodeAtByteOffset( getStart(x) );
_chpxList.add(new CHPX(getStart(x) - fcMin, getEnd(x) - fcMin, getGrpprl(x), isUnicode));
}
}
public void insert(int listIndex, int cpStart, SprmBuffer buf)
{
- boolean needsToBeUnicode = tpt.isUnicodeAt(cpStart);
+ boolean needsToBeUnicode = tpt.isUnicodeAtCharOffset(cpStart);
PAPX forInsert = new PAPX(0, 0, buf, _dataStream, needsToBeUnicode);
for (int x = 0; x < _crun; x++) {
int startAt = getStart(x) - fcMin;
int endAt = getEnd(x) - fcMin;
- boolean isUnicode = tpt.isUnicodeAt(startAt);
+ boolean isUnicode = tpt.isUnicodeAtByteOffset(startAt);
+ //System.err.println(startAt + " -> " + endAt + " = " + isUnicode);
_papxList.add(new PAPX(startAt, endAt, getGrpprl(x), getParagraphHeight(x), dataStream, isUnicode));
}
_cpStart = fcStart;
_cpEnd = fcEnd;
_buf = buf;
+
+ if(_cpStart < 0) {
+ System.err.println("A property claimed to start before zero, at " + _cpStart + "! Resetting it to zero, and hoping for the best");
+ _cpStart = 0;
+ }
}
/**
import org.apache.poi.hwpf.usermodel.SectionProperties;
/**
- * TODO - figure out if this works in characters, like most
- * things do, or in bytes as PAPX / CHPX does.
*/
-public class SEPX extends PropertyNode
+public class SEPX extends BytePropertyNode
{
SectionDescriptor _sed;
- public SEPX(SectionDescriptor sed, int start, int end, byte[] grpprl)
+ public SEPX(SectionDescriptor sed, int start, int end, byte[] grpprl, boolean isUnicode)
{
- super(start, end, SectionSprmUncompressor.uncompressSEP(grpprl, 0));
+ super(start, end, SectionSprmUncompressor.uncompressSEP(grpprl, 0), isUnicode);
_sed = sed;
}
protected ArrayList _sections = new ArrayList();
protected List _text;
+ /** So we can know if things are unicode or not */
+ private TextPieceTable tpt;
+
public SectionTable()
{
}
public SectionTable(byte[] documentStream, byte[] tableStream, int offset,
int size, int fcMin,
- List tpt)
+ TextPieceTable tpt, CPSplitCalculator cps)
{
PlexOfCps sedPlex = new PlexOfCps(tableStream, offset, size, SED_SIZE);
- _text = tpt;
+ this.tpt = tpt;
+ this._text = tpt.getTextPieces();
int length = sedPlex.length();
SectionDescriptor sed = new SectionDescriptor(node.getBytes(), 0);
int fileOffset = sed.getFc();
+ int startAt = CPtoFC(node.getStart());
+ int endAt = CPtoFC(node.getEnd());
+
+ boolean isUnicodeAtStart = tpt.isUnicodeAtByteOffset( startAt );
+// System.err.println(startAt + " -> " + endAt + " = " + isUnicodeAtStart);
// check for the optimization
if (fileOffset == 0xffffffff)
{
- _sections.add(new SEPX(sed, CPtoFC(node.getStart()), CPtoFC(node.getEnd()), new byte[0]));
+ _sections.add(new SEPX(sed, startAt, endAt, new byte[0], isUnicodeAtStart));
}
else
{
byte[] buf = new byte[sepxSize];
fileOffset += LittleEndian.SHORT_SIZE;
System.arraycopy(documentStream, fileOffset, buf, 0, buf.length);
- _sections.add(new SEPX(sed, CPtoFC(node.getStart()), CPtoFC(node.getEnd()), buf));
+ _sections.add(new SEPX(sed, startAt, endAt, buf, isUnicodeAtStart));
}
}
+
+ // Some files seem to lie about their unicode status, which
+ // is very very pesky. Try to work around these, but this
+ // is getting on for black magic...
+ int mainEndsAt = cps.getMainDocumentEnd();
+ boolean matchAt = false;
+ boolean matchHalf = false;
+ for(int i=0; i<_sections.size(); i++) {
+ SEPX s = (SEPX)_sections.get(i);
+ if(s.getEnd() == mainEndsAt) {
+ matchAt = true;
+ } else if(s.getEndBytes() == mainEndsAt || s.getEndBytes() == mainEndsAt-1) {
+ matchHalf = true;
+ }
+ }
+ if(! matchAt && matchHalf) {
+ System.err.println("Your document seemed to be mostly unicode, but the section definition was in bytes! Trying anyway, but things may well go wrong!");
+ for(int i=0; i<_sections.size(); i++) {
+ SEPX s = (SEPX)_sections.get(i);
+ GenericPropertyNode node = sedPlex.getProperty(i);
+
+ s.setStart( CPtoFC(node.getStart()) );
+ s.setEnd( CPtoFC(node.getEnd()) );
+ }
+ }
}
public void adjustForInsert(int listIndex, int length)
// Line using Ryan's FCtoCP() conversion method -
// unable to observe any effect on our testcases when using this code - piers
- GenericPropertyNode property = new GenericPropertyNode(FCtoCP(sepx.getStart()), FCtoCP(sepx.getEnd()), sed.toByteArray());
+ GenericPropertyNode property = new GenericPropertyNode(FCtoCP(sepx.getStartBytes()), FCtoCP(sepx.getEndBytes()), sed.toByteArray());
plex.addProperty(property);
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Hashtable;
import java.util.Iterator;
import java.util.List;
// And now build the piece
_textPieces.add(new TextPiece(nodeStartChars, nodeEndChars, buf, pieces[x], node.getStart()));
}
+
+ // In the interest of our sanity, now sort the text pieces
+ // into order, if they're not already
+ TextPiece[] tp = (TextPiece[])
+ _textPieces.toArray(new TextPiece[_textPieces.size()]);
+ Arrays.sort(tp);
+ for(int i=0; i<tp.length; i++) {
+ _textPieces.set(i, tp[i]);
+ }
}
public int getCpMin()
* paragraph properties :(
* @param cp The character offset to check about
*/
- public boolean isUnicodeAt(int cp) {
+ public boolean isUnicodeAtCharOffset(int cp) {
boolean lastWas = false;
- int lastAt = 0;
Iterator it = _textPieces.iterator();
while(it.hasNext()) {
return tp.isUnicode();
}
// Otherwise keep track for the last one
- if(tp.getStart() > lastAt) {
- lastWas = tp.isUnicode();
+ lastWas = tp.isUnicode();
+ }
+
+ // If they ask off the end, just go with the last one...
+ return lastWas;
+ }
+ /**
+ * Is the text at the given byte offset
+ * unicode, or plain old ascii?
+ * In a very evil fashion, you have to actually
+ * know this to make sense of character and
+ * paragraph properties :(
+ * @param bytePos The byte offset to check about
+ */
+ public boolean isUnicodeAtByteOffset(int bytePos) {
+ boolean lastWas = false;
+ int curByte = 0;
+
+ Iterator it = _textPieces.iterator();
+ while(it.hasNext()) {
+ TextPiece tp = (TextPiece)it.next();
+ int nextByte = curByte + tp.bytesLength();
+
+ // If the text piece covers the character, all good
+ if(curByte <= bytePos && nextByte >= bytePos) {
+ return tp.isUnicode();
}
+ // Otherwise keep track for the last one
+ lastWas = tp.isUnicode();
+ // Move along
+ curByte = nextByte;
}
// If they ask off the end, just go with the last one...
_characters = _doc.getCharacterTable().getTextRuns();
_text = _doc.getTextTable().getTextPieces();
_parent = new WeakReference(null);
+
+ sanityCheckStartEnd();
}
_characters = parent._characters;
_text = parent._text;
_parent = new WeakReference(parent);
+
+ sanityCheckStartEnd();
}
/**
_textRangeFound = true;
break;
}
+
+ sanityCheckStartEnd();
+ }
+
+ /**
+ * Ensures that the start and end that were given
+ * are actually valid, to avoid issues later on
+ * if they're not
+ */
+ private void sanityCheckStartEnd() {
+ if(_start < 0) {
+ throw new IllegalArgumentException("Range start must not be negative. Given " + _start);
+ }
+ if(_end < _start) {
+ throw new IllegalArgumentException("The end (" + _end + ") must not be before the start ("+_start+")");
+ }
}
/**
for (int x = _parStart; x < numParagraphs; x++)
{
PAPX papx = (PAPX)_paragraphs.get(x);
+ //System.err.println("Paragraph " + x + " was " + papx.getStart() + " -> " + papx.getEnd());
papx.adjustForDelete(_start, _end - _start);
+ //System.err.println("Paragraph " + x + " is now " + papx.getStart() + " -> " + papx.getEnd());
}
for (int x = _sectionStart; x < numSections; x++)
{
SEPX sepx = (SEPX)_sections.get(x);
+ //System.err.println("Section " + x + " was " + sepx.getStart() + " -> " + sepx.getEnd());
sepx.adjustForDelete(_start, _end - _start);
+ //System.err.println("Section " + x + " is now " + sepx.getStart() + " -> " + sepx.getEnd());
}
for (int x = _textStart; x < numTextPieces; x++)
{
throw new ArrayIndexOutOfBoundsException("The table's bounds fall outside of this Range");
}
+ if (tableEnd < 0)
+ {
+ throw new ArrayIndexOutOfBoundsException("The table's end is negative, which isn't allowed!");
+ }
return new Table(r._parStart, tableEnd, r._doc.getRange(), paragraph.getTableLevel());
}
byte[] tableStream = _hWPFDocFixture._tableStream;
int fcMin = fib.getFcMin();
+ CPSplitCalculator cps = new CPSplitCalculator(fib);
+
ComplexFileTable cft = new ComplexFileTable(mainStream, tableStream, fib.getFcClx(), fcMin);
TextPieceTable tpt = cft.getTextPieceTable();
SectionTable sectionTable = new SectionTable(mainStream, tableStream,
fib.getFcPlcfsed(),
fib.getLcbPlcfsed(),
- fcMin, tpt.getTextPieces());
+ fcMin, tpt, cps);
HWPFFileSystem fileSys = new HWPFFileSystem();
sectionTable.writeTo(fileSys, 0);
byte[] newTableStream = tableOut.toByteArray();
byte[] newMainStream = mainOut.toByteArray();
- SectionTable newSectionTable = new SectionTable(newMainStream, newTableStream, 0, newTableStream.length, 0, tpt.getTextPieces());
+ SectionTable newSectionTable = new SectionTable(
+ newMainStream, newTableStream, 0,
+ newTableStream.length, 0, tpt, cps);
ArrayList oldSections = sectionTable.getSections();
ArrayList newSections = newSectionTable.getSections();
HWPFDocument doc = new HWPFDocument(new FileInputStream(
new File(dirname, "Bug44292.doc")));
Range r = doc.getRange();
+ assertEquals(6, r.numParagraphs());
+ assertEquals(0, r.getStartOffset());
+ assertEquals(87, r.getEndOffset());
- //get the table
+ // Paragraph with table
Paragraph p = r.getParagraph(0);
+ assertEquals(0, p.getStartOffset());
+ assertEquals(20, p.getEndOffset());
+
+ // Get the table
Table t = r.getTable(p);
//get the only row
import junit.framework.TestCase;
import org.apache.poi.hwpf.HWPFDocument;
+import org.apache.poi.hwpf.model.PAPX;
/**
* Test to see if Range.delete() works even if the Range contains a
"${delete} This is an MS-Word 97 formatted document created using NeoOffice v. 2.2.4 Patch 0 (OpenOffice.org v. 2.2.1).\r";
private String originalText =
"It is used to confirm that text delete works even if Unicode characters (such as \u201c\u2014\u201d (U+2014), \u201c\u2e8e\u201d (U+2E8E), or \u201c\u2714\u201d (U+2714)) are present. Everybody should be thankful to the ${organization} ${delete} and all the POI contributors for their assistance in this matter.\r";
+ private String lastText =
+ "Thank you, ${organization} ${delete}!\r";
private String searchText = "${delete}";
private String expectedText1 = " This is an MS-Word 97 formatted document created using NeoOffice v. 2.2.4 Patch 0 (OpenOffice.org v. 2.2.1).\r";
private String expectedText2 =
Range range;
Section section;
Paragraph para;
+ PAPX paraDef;
// First, check overall
range = daDoc.getOverallRange();
assertEquals(1, range.numSections());
- assertEquals(4, range.numParagraphs());
+ assertEquals(5, range.numParagraphs());
// Now, onto just the doc bit
range = daDoc.getRange();
assertEquals(1, range.numSections());
+ assertEquals(1, daDoc.getSectionTable().getSections().size());
section = range.getSection(0);
-
- assertEquals(4, section.numParagraphs());
+
+ assertEquals(5, section.numParagraphs());
para = section.getParagraph(0);
assertEquals(1, para.numCharacterRuns());
assertEquals(introText, para.text());
para = section.getParagraph(1);
- assertEquals(2, para.numCharacterRuns());
+ assertEquals(5, para.numCharacterRuns());
assertEquals(fillerText, para.text());
+
+ paraDef = (PAPX)daDoc.getParagraphTable().getParagraphs().get(2);
+ assertEquals(132, paraDef.getStart());
+ assertEquals(400, paraDef.getEnd());
+
para = section.getParagraph(2);
- assertEquals(6, para.numCharacterRuns());
+ assertEquals(5, para.numCharacterRuns());
assertEquals(originalText, para.text());
+
+
+ paraDef = (PAPX)daDoc.getParagraphTable().getParagraphs().get(3);
+ assertEquals(400, paraDef.getStart());
+ assertEquals(438, paraDef.getEnd());
+
+ para = section.getParagraph(3);
+ assertEquals(1, para.numCharacterRuns());
+ assertEquals(lastText, para.text());
+
+
+ // Check things match on text length
+ assertEquals(439, range.text().length());
+ assertEquals(439, section.text().length());
+ assertEquals(439,
+ section.getParagraph(0).text().length() +
+ section.getParagraph(1).text().length() +
+ section.getParagraph(2).text().length() +
+ section.getParagraph(3).text().length() +
+ section.getParagraph(4).text().length()
+ );
}
/**
assertEquals(1, range.numSections());
Section section = range.getSection(0);
- assertEquals(4, section.numParagraphs());
+ assertEquals(5, section.numParagraphs());
Paragraph para = section.getParagraph(2);
assertEquals(1, range.numSections());
section = range.getSection(0);
- assertEquals(4, section.numParagraphs());
+ assertEquals(5, section.numParagraphs());
para = section.getParagraph(2);
text = para.text();
assertEquals(1, range.numSections());
Section section = range.getSection(0);
- assertEquals(4, section.numParagraphs());
+ assertEquals(5, section.numParagraphs());
Paragraph para = section.getParagraph(2);
assertEquals(1, range.numSections());
section = range.getSection(0);
- assertEquals(4, section.numParagraphs());
+ assertEquals(5, section.numParagraphs());
para = section.getParagraph(0);
text = para.text();
Paragraph para = section.getParagraph(2);
assertEquals(originalText, para.text());
- assertEquals(6, para.numCharacterRuns());
+ assertEquals(3, para.numCharacterRuns());
String text =
para.getCharacterRun(0).text() +
para.getCharacterRun(1).text() +
- para.getCharacterRun(2).text() +
- para.getCharacterRun(3).text() +
- para.getCharacterRun(4).text() +
- para.getCharacterRun(5).text()
+ para.getCharacterRun(2).text()
;
assertEquals(originalText, text);
Paragraph para = section.getParagraph(2);
assertEquals((textToInsert + originalText), para.text());
- assertEquals(6, para.numCharacterRuns());
+ assertEquals(3, para.numCharacterRuns());
String text =
para.getCharacterRun(0).text() +
para.getCharacterRun(1).text() +
- para.getCharacterRun(2).text() +
- para.getCharacterRun(3).text() +
- para.getCharacterRun(4).text() +
- para.getCharacterRun(5).text()
+ para.getCharacterRun(2).text()
;
// System.out.println(text);
r.text()
);
+ assertEquals(1, r.numSections());
+ assertEquals(1, a.getSectionTable().getSections().size());
+ Section s = r.getSection(0);
+ assertEquals(
+ a_page_1 +
+ page_break + "\r" +
+ a_page_2,
+ s.text()
+ );
+
assertEquals(
7,
r.numParagraphs()
assertEquals(
408, r.text().length()
);
+
+
+ assertEquals(1, r.numSections());
+ assertEquals(1, u.getSectionTable().getSections().size());
+ Section s = r.getSection(0);
+ assertEquals(
+ u_page_1 +
+ page_break + "\r" +
+ u_page_2,
+ s.text()
+ );
+ assertEquals(0, s.getStartOffset());
+ assertEquals(408, s.getEndOffset());
+
List pDefs = r._paragraphs;
assertEquals(35, pDefs.size());
HWPFDocument daDoc = new HWPFDocument(new FileInputStream(illustrativeDocFile));
Range range = daDoc.getRange();
+ assertEquals(414, range.text().length());
assertEquals(1, range.numSections());
Section section = range.getSection(0);
+ assertEquals(414, section.text().length());
- assertEquals(4, section.numParagraphs());
+ assertEquals(5, section.numParagraphs());
Paragraph para = section.getParagraph(2);
- assertEquals(6, para.numCharacterRuns());
+ assertEquals(5, para.numCharacterRuns());
String text =
para.getCharacterRun(0).text() +
para.getCharacterRun(1).text() +
para.getCharacterRun(2).text() +
para.getCharacterRun(3).text() +
- para.getCharacterRun(4).text() +
- para.getCharacterRun(5).text()
+ para.getCharacterRun(4).text()
;
assertEquals(originalText, text);
assertEquals(1, range.numSections());
Section section = range.getSection(0);
- assertEquals(4, section.numParagraphs());
+ assertEquals(5, section.numParagraphs());
Paragraph para = section.getParagraph(2);
assertEquals(1, range.numSections());
Section section = range.getSection(0);
- assertEquals(4, section.numParagraphs());
+ assertEquals(5, section.numParagraphs());
Paragraph para = section.getParagraph(2);
assertEquals(1, range.numSections());
section = range.getSection(0);
- assertEquals(4, section.numParagraphs());
+ assertEquals(5, section.numParagraphs());
para = section.getParagraph(2);
text = para.text();