/**
* Represents a lightweight node in the Trees used to store content
- * properties.
+ * properties. Works only in characters.
*
* @author Ryan Ackley
*/
public abstract class PropertyNode implements Comparable, Cloneable
{
protected Object _buf;
+ /** The start, in characters */
private int _cpStart;
+ /** The end, in characters */
private int _cpEnd;
/**
- * @param fcStart The start of the text for this property.
- * @param fcEnd The end of the text for this property.
+ * @param fcStart The start of the text for this property, in characters.
+ * @param fcEnd The end of the text for this property, in characters.
* @param buf FIXME: Old documentation is: "grpprl The property description in compressed form."
*/
protected PropertyNode(int fcStart, int fcEnd, Object buf)
_cpStart = fcStart;
_cpEnd = fcEnd;
_buf = buf;
-
}
/**
- * @return The offset of this property's text.
+ * @return The start offset of this property's text.
*/
public int getStart()
{
return 1;
}
}
-
-
-
-
-
}
import java.io.UnsupportedEncodingException;
/**
* Lightweight representation of a text piece.
+ * Works in the character domain, not the byte domain, so you
+ * need to have turned byte references into character
+ * references before getting here.
*
* @author Ryan Ackley
*/
private PieceDescriptor _pd;
- private int _cpStart;
-
/**
- * @param start Offset in main document stream.
+ * @param start Beginning offset in main document stream, in characters.
+ * @param end Ending offset in main document stream, in characters.
+ * @param text The raw bytes of our text
*/
- public TextPiece(int start, int end, byte[] text, PieceDescriptor pd, int cpStart)
- throws UnsupportedEncodingException
- {
- /** start - end is length on file. This is double the expected when its
- * unicode.*/
- super(start, end, new StringBuffer(new String(text, pd.isUnicode() ? "UTF-16LE" : "Cp1252")));
- _usesUnicode = pd.isUnicode();
- _pd = pd;
- _cpStart = cpStart;
+ public TextPiece(int start, int end, byte[] text, PieceDescriptor pd, int cpStart) {
+ super(start, end, buildInitSB(text, pd));
+ _usesUnicode = pd.isUnicode();
+ _pd = pd;
+
+ // Validate
+ int textLength = ((StringBuffer)_buf).length();
+ if(end-start != textLength) {
+ throw new IllegalStateException("Told we're for characters " + start + " -> " + end + ", but actually covers " + textLength + " characters!");
+ }
+ if(end < start) {
+ throw new IllegalStateException("Told we're of negative size! start="+start + " end="+end);
+ }
}
+
+ /**
+ * Create the StringBuffer from the text and unicode flag
+ */
+ private static StringBuffer buildInitSB(byte[] text, PieceDescriptor pd) {
+ String str;
+ try {
+ if(pd.isUnicode()) {
+ str = new String(text, "UTF-16LE");
+ } else {
+ str = new String(text, "Cp1252");
+ }
+ } catch(UnsupportedEncodingException e) {
+ throw new RuntimeException("Your Java is broken! It doesn't know about basic, required character encodings!");
+ }
+ return new StringBuffer(str);
+ }
+
/**
* @return If this text piece uses unicode
*/
public byte[] getRawBytes()
{
- try
- {
+ try {
return ((StringBuffer)_buf).toString().getBytes(_usesUnicode ?
"UTF-16LE" : "Cp1252");
+ } catch (UnsupportedEncodingException ignore) {
+ throw new RuntimeException("Your Java is broken! It doesn't know about basic, required character encodings!");
}
- catch (UnsupportedEncodingException ignore)
- {
- // shouldn't ever happen considering we wouldn't have been able to
- // create the StringBuffer w/o getting this exception
- return ((StringBuffer)_buf).toString().getBytes();
- }
-
}
+ /**
+ * Returns part of the string.
+ * Works only in characters, not in bytes!
+ * @param start Local start position, in characters
+ * @param end Local end position, in characters
+ * @return The requested part of the text, as a String
+ */
public String substring(int start, int end)
{
- int denominator = _usesUnicode ? 2 : 1;
-
- return ((StringBuffer)_buf).substring(start/denominator, end/denominator);
+ StringBuffer buf = (StringBuffer)_buf;
+
+ // Validate
+ if(start < 0) {
+ throw new StringIndexOutOfBoundsException("Can't request a substring before 0 - asked for " + start);
+ }
+ if(end > buf.length()) {
+ throw new StringIndexOutOfBoundsException("Index " + end + " out of range 0 -> " + buf.length());
+ }
+ return buf.substring(start, end);
}
- public void adjustForDelete(int start, int length)
- {
-
- // length is expected to be the number of code-points,
- // not the number of characters
+ /**
+ * Adjusts the internal string for deleting
+ * some characters within this.
+ * @param start The start position for the delete, in characters
+ * @param length The number of characters to delete
+ */
+ public void adjustForDelete(int start, int length) {
int numChars = length;
- if (usesUnicode()) {
-
- start /= 2;
- numChars = (length / 2);
- }
int myStart = getStart();
int myEnd = getEnd();
super.adjustForDelete(start, length);
}
+ /**
+ * Returns the length, in characters
+ */
public int characterLength()
{
- return (getEnd() - getStart()) / (_usesUnicode ? 2 : 1);
+ return (getEnd() - getStart());
+ }
+ /**
+ * Returns the length, in bytes
+ */
+ public int bytesLength() {
+ return (getEnd() - getStart()) * (_usesUnicode ? 2 : 1);
}
public boolean equals(Object o)
}
+ /**
+ * Returns the character position we start at.
+ */
public int getCP()
{
- return _cpStart;
+ return getStart();
}
-
}
import java.util.List;
/**
+ * The piece table for matching up character positions
+ * to bits of text.
+ * This mostly works in bytes, but the TextPieces
+ * themselves work in characters. This does the icky
+ * conversion.
* @author Ryan Ackley
*/
public class TextPieceTable
//int _multiple;
int _cpMin;
- public TextPieceTable()
- {
+ public TextPieceTable() {
}
public TextPieceTable(byte[] documentStream, byte[] tableStream, int offset,
// get our plex of PieceDescriptors
PlexOfCps pieceTable = new PlexOfCps(tableStream, offset, size, PieceDescriptor.getSizeInBytes());
- //_multiple = 2;
int length = pieceTable.length();
PieceDescriptor[] pieces = new PieceDescriptor[length];
{
GenericPropertyNode node = pieceTable.getProperty(x);
pieces[x] = new PieceDescriptor(node.getBytes(), 0);
-
-// if (!pieces[x].isUnicode())
-// {
-// _multiple = 1;
-// }
}
int firstPieceFilePosition = pieces[0].getFilePosition();
{
int start = pieces[x].getFilePosition();
PropertyNode node = pieceTable.getProperty(x);
- int nodeStart = node.getStart();
-
- // multiple will be 2 if there is only one piece and its unicode. Some
- // type of optimization.
+
+ // Grab the start and end, which are in characters
+ int nodeStartChars = node.getStart();
+ int nodeEndChars = node.getEnd();
+
+ // What's the relationship between bytes and characters?
boolean unicode = pieces[x].isUnicode();
-
int multiple = 1;
- if (unicode)
- {
+ if (unicode) {
multiple = 2;
}
- int nodeEnd = ((node.getEnd() - nodeStart) * multiple) + nodeStart;
- int textSize = nodeEnd - nodeStart;
+
+ // Figure out the length, in bytes and chars
+ int textSizeChars = (nodeEndChars - nodeStartChars);
+ int textSizeBytes = textSizeChars * multiple;
+ // Grab the data that makes up the piece
+ byte[] buf = new byte[textSizeBytes];
+ System.arraycopy(documentStream, start, buf, 0, textSizeBytes);
- byte[] buf = new byte[textSize];
- System.arraycopy(documentStream, start, buf, 0, textSize);
-
- int startFilePosition = start - firstPieceFilePosition;
- _textPieces.add(new TextPiece(startFilePosition, startFilePosition+textSize, buf, pieces[x], node.getStart()));
+ // And now build the piece
+ _textPieces.add(new TextPiece(nodeStartChars, nodeEndChars, buf, pieces[x], node.getStart()));
}
}
//int fcMin = docStream.getOffset();
int size = _textPieces.size();
- int bumpDown = 0;
for (int x = 0; x < size; x++)
{
TextPiece next = (TextPiece)_textPieces.get(x);
// write the text to the docstream and save the piece descriptor to the
// plex which will be written later to the tableStream.
- //if (_multiple == 1 && pd.isUnicode() &&
docStream.write(next.getRawBytes());
+ // The TextPiece is already in characters, which
+ // makes our life much easier
int nodeStart = next.getStart();
- int multiple = 1;
- if (pd.isUnicode())
- {
- multiple = 2;
- }
- textPlex.addProperty(new GenericPropertyNode(nodeStart - bumpDown,
- ((next.getEnd() - nodeStart)/multiple + nodeStart) - bumpDown,
+ int nodeEnd = next.getEnd();
+ textPlex.addProperty(new GenericPropertyNode(nodeStart, nodeEnd,
pd.toByteArray()));
-
- if (pd.isUnicode())
- {
- bumpDown += ((next.getEnd() - nodeStart)/multiple);
- }
-
-
}
return textPlex.toByteArray();
}
-
- public int adjustForInsert(int listIndex, int length)
- {
+ /**
+ * Adjust all the text pieces after inserting
+ * some text into one of them
+ * @param listIndex The index of the TextPiece that had characters inserted into it
+ * @param length The number of characters inserted
+ */
+ public int adjustForInsert(int listIndex, int length) {
int size = _textPieces.size();
TextPiece tp = (TextPiece)_textPieces.get(listIndex);
-
- //The text piece stores the length on file.
- length = length * (tp.usesUnicode() ? 2 : 1);
+
+ // Update with the new end
tp.setEnd(tp.getEnd() + length);
+
+ // Now change all subsequent ones
for (int x = listIndex + 1; x < size; x++)
{
tp = (TextPiece)_textPieces.get(x);
tp.setStart(tp.getStart() + length);
tp.setEnd(tp.getEnd() + length);
}
+
+ // All done
return length;
}
/**
* Used to construct a Range from a document. This is generally used to
- * create a Range that spans the whole document.
+ * create a Range that spans the whole document, or at least one
+ * whole part of the document (e.g. main text, header, comment)
*
* @param start Starting character offset of the range.
* @param end Ending character offset of the range.
for (int x = _textStart; x < _textEnd; x++)
{
TextPiece piece = (TextPiece)_text.get(x);
- int start = _start > piece.getStart() ? _start - piece.getStart() : 0;
- int end = _end <= piece.getEnd() ? _end - piece.getStart() : piece.getEnd() - piece.getStart();
-
- if(piece.usesUnicode()) // convert the byte pointers to char pointers
- {
- start/=2;
- end/=2;
+
+ // Figure out where in this piece the text
+ // we're after lives
+ int rStart = 0;
+ int rEnd = piece.characterLength();
+ if(_start > piece.getStart()) {
+ rStart = _start - piece.getStart();
}
- sb.append(piece.getStringBuffer().substring(start, end));
+ if(_end < piece.getEnd()) {
+ rEnd -= (piece.getEnd() - _end);
+ }
+
+ // Luckily TextPieces work in characters, so we don't
+ // need to worry about unicode here
+ sb.append(piece.substring(rStart, rEnd));
}
return sb.toString();
}
}
/**
- * Adjust the value of <code>FIB.CCPText</code> after an insert or a delete...
+ * Adjust the value of <code>FIB.CCPText</code> after an insert or a delete...
+ *
+ * TODO - handle other kinds of text, eg Headers
*
- * @param adjustment The (signed) value that should be added to <code>FIB.CCPText</code>
+ * @param adjustment The (signed) value that should be added to <code>FIB.CCPText</code>
*/
protected void adjustFIB(int adjustment) {
;
private static final String u_header =
+ "\r\r" +
"This is a simple header, with a \u20ac euro symbol in it.\r"
;
private static final String u_footer =
- "The footer, with Moli\u00e8re, has Unicode in it.\r"
+ "\r\r\r" +
+ "The footer, with Moli\u00e8re, has Unicode in it.\r" +
+ "\r\r\r\r"
;
/**
package org.apache.poi.hwpf.model;
-import junit.framework.*;
-
+import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
-import java.util.ArrayList;
+import java.io.File;
+import java.io.FileInputStream;
+
+import junit.framework.TestCase;
-import org.apache.poi.hwpf.*;
-import org.apache.poi.hwpf.model.io.*;
+import org.apache.poi.hwpf.HWPFDocFixture;
+import org.apache.poi.hwpf.HWPFDocument;
+import org.apache.poi.hwpf.model.io.HWPFFileSystem;
-public class TestTextPieceTable
- extends TestCase
-{
+public class TestTextPieceTable extends TestCase {
private HWPFDocFixture _hWPFDocFixture;
+ private String dirname;
public TestTextPieceTable(String name)
{
TextPieceTable newTextPieceTable = newCft.getTextPieceTable();
assertEquals(oldTextPieceTable, newTextPieceTable);
-
-
}
+
+ /**
+ * Check that we do the positions correctly when
+ * working with pure-ascii
+ */
+ public void testAsciiParts() throws Exception {
+ HWPFDocument doc = new HWPFDocument(
+ new FileInputStream(new File(dirname, "ThreeColHeadFoot.doc"))
+ );
+ TextPieceTable tbl = doc.getTextTable();
+
+ // All ascii, so stored in one big lump
+ assertEquals(1, tbl.getTextPieces().size());
+ TextPiece tp = (TextPiece)tbl.getTextPieces().get(0);
+
+ assertEquals(0, tp.getStart());
+ assertEquals(339, tp.getEnd());
+ assertEquals(339, tp.characterLength());
+ assertEquals(339, tp.bytesLength());
+ assertTrue(tp.getStringBuffer().toString().startsWith("This is a sample word document"));
+
+
+ // Save and re-load
+ HWPFDocument docB = saveAndReload(doc);
+ tbl = docB.getTextTable();
+
+ assertEquals(1, tbl.getTextPieces().size());
+ tp = (TextPiece)tbl.getTextPieces().get(0);
+
+ assertEquals(0, tp.getStart());
+ assertEquals(339, tp.getEnd());
+ assertEquals(339, tp.characterLength());
+ assertEquals(339, tp.bytesLength());
+ assertTrue(tp.getStringBuffer().toString().startsWith("This is a sample word document"));
+ }
+
+ /**
+ * Check that we do the positions correctly when
+ * working with a mix ascii, unicode file
+ */
+ public void testUnicodeParts() throws Exception {
+ HWPFDocument doc = new HWPFDocument(
+ new FileInputStream(new File(dirname, "HeaderFooterUnicode.doc"))
+ );
+ TextPieceTable tbl = doc.getTextTable();
+
+ // In three bits, split every 512 bytes
+ assertEquals(3, tbl.getTextPieces().size());
+ TextPiece tpA = (TextPiece)tbl.getTextPieces().get(0);
+ TextPiece tpB = (TextPiece)tbl.getTextPieces().get(1);
+ TextPiece tpC = (TextPiece)tbl.getTextPieces().get(2);
+
+ assertTrue(tpA.usesUnicode());
+ assertTrue(tpB.usesUnicode());
+ assertTrue(tpC.usesUnicode());
+
+ assertEquals(256, tpA.characterLength());
+ assertEquals(256, tpB.characterLength());
+ assertEquals(19, tpC.characterLength());
+
+ assertEquals(512, tpA.bytesLength());
+ assertEquals(512, tpB.bytesLength());
+ assertEquals(38, tpC.bytesLength());
+
+ assertEquals(0, tpA.getStart());
+ assertEquals(256, tpA.getEnd());
+ assertEquals(256, tpB.getStart());
+ assertEquals(512, tpB.getEnd());
+ assertEquals(512, tpC.getStart());
+ assertEquals(531, tpC.getEnd());
+
+
+ // Save and re-load
+ HWPFDocument docB = saveAndReload(doc);
+ tbl = docB.getTextTable();
+
+ assertEquals(3, tbl.getTextPieces().size());
+ tpA = (TextPiece)tbl.getTextPieces().get(0);
+ tpB = (TextPiece)tbl.getTextPieces().get(1);
+ tpC = (TextPiece)tbl.getTextPieces().get(2);
+
+ assertTrue(tpA.usesUnicode());
+ assertTrue(tpB.usesUnicode());
+ assertTrue(tpC.usesUnicode());
+
+ assertEquals(256, tpA.characterLength());
+ assertEquals(256, tpB.characterLength());
+ assertEquals(19, tpC.characterLength());
+
+ assertEquals(512, tpA.bytesLength());
+ assertEquals(512, tpB.bytesLength());
+ assertEquals(38, tpC.bytesLength());
+
+ assertEquals(0, tpA.getStart());
+ assertEquals(256, tpA.getEnd());
+ assertEquals(256, tpB.getStart());
+ assertEquals(512, tpB.getEnd());
+ assertEquals(512, tpC.getStart());
+ assertEquals(531, tpC.getEnd());
+ }
+
+ protected HWPFDocument saveAndReload(HWPFDocument doc) throws Exception {
+ ByteArrayOutputStream baos = new ByteArrayOutputStream();
+ doc.write(baos);
+
+ return new HWPFDocument(
+ new ByteArrayInputStream(baos.toByteArray())
+ );
+ }
+
protected void setUp()
throws Exception
{
_hWPFDocFixture = new HWPFDocFixture(this);
_hWPFDocFixture.setUp();
+
+ dirname = System.getProperty("HWPF.testdata.path");
}
protected void tearDown()