From c71c0851d58136c41192521d9aab966b19b0c683 Mon Sep 17 00:00:00 2001 From: Nick Burch Date: Sat, 9 Aug 2008 19:34:38 +0000 Subject: [PATCH] Big big unicode rationalisation in text piece code git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@684319 13f79535-47bb-0310-9956-ffa450edef68 --- .../apache/poi/hwpf/model/PropertyNode.java | 16 +-- .../org/apache/poi/hwpf/model/TextPiece.java | 115 ++++++++++----- .../apache/poi/hwpf/model/TextPieceTable.java | 83 ++++++----- .../org/apache/poi/hwpf/usermodel/Range.java | 31 ++-- .../apache/poi/hwpf/TestHWPFRangeParts.java | 5 +- .../poi/hwpf/model/TestTextPieceTable.java | 132 ++++++++++++++++-- 6 files changed, 269 insertions(+), 113 deletions(-) diff --git a/src/scratchpad/src/org/apache/poi/hwpf/model/PropertyNode.java b/src/scratchpad/src/org/apache/poi/hwpf/model/PropertyNode.java index 07cb18ae7a..42c5f5c27e 100644 --- a/src/scratchpad/src/org/apache/poi/hwpf/model/PropertyNode.java +++ b/src/scratchpad/src/org/apache/poi/hwpf/model/PropertyNode.java @@ -22,20 +22,22 @@ import java.util.Arrays; /** * Represents a lightweight node in the Trees used to store content - * properties. + * properties. Works only in characters. * * @author Ryan Ackley */ public abstract class PropertyNode implements Comparable, Cloneable { protected Object _buf; + /** The start, in characters */ private int _cpStart; + /** The end, in characters */ private int _cpEnd; /** - * @param fcStart The start of the text for this property. - * @param fcEnd The end of the text for this property. + * @param fcStart The start of the text for this property, in characters. + * @param fcEnd The end of the text for this property, in characters. * @param buf FIXME: Old documentation is: "grpprl The property description in compressed form." 
*/ protected PropertyNode(int fcStart, int fcEnd, Object buf) @@ -43,11 +45,10 @@ public abstract class PropertyNode implements Comparable, Cloneable _cpStart = fcStart; _cpEnd = fcEnd; _buf = buf; - } /** - * @return The offset of this property's text. + * @return The start offset of this property's text. */ public int getStart() { @@ -142,9 +143,4 @@ public abstract class PropertyNode implements Comparable, Cloneable return 1; } } - - - - - } diff --git a/src/scratchpad/src/org/apache/poi/hwpf/model/TextPiece.java b/src/scratchpad/src/org/apache/poi/hwpf/model/TextPiece.java index 227200ab5d..f326e59b88 100644 --- a/src/scratchpad/src/org/apache/poi/hwpf/model/TextPiece.java +++ b/src/scratchpad/src/org/apache/poi/hwpf/model/TextPiece.java @@ -22,6 +22,9 @@ package org.apache.poi.hwpf.model; import java.io.UnsupportedEncodingException; /** * Lightweight representation of a text piece. + * Works in the character domain, not the byte domain, so you + * need to have turned byte references into character + * references before getting here. * * @author Ryan Ackley */ @@ -32,21 +35,43 @@ public class TextPiece extends PropertyNode implements Comparable private PieceDescriptor _pd; - private int _cpStart; - /** - * @param start Offset in main document stream. + * @param start Beginning offset in main document stream, in characters. + * @param end Ending offset in main document stream, in characters. + * @param text The raw bytes of our text */ - public TextPiece(int start, int end, byte[] text, PieceDescriptor pd, int cpStart) - throws UnsupportedEncodingException - { - /** start - end is length on file. This is double the expected when its - * unicode.*/ - super(start, end, new StringBuffer(new String(text, pd.isUnicode() ? 
"UTF-16LE" : "Cp1252"))); - _usesUnicode = pd.isUnicode(); - _pd = pd; - _cpStart = cpStart; + public TextPiece(int start, int end, byte[] text, PieceDescriptor pd, int cpStart) { + super(start, end, buildInitSB(text, pd)); + _usesUnicode = pd.isUnicode(); + _pd = pd; + + // Validate + int textLength = ((StringBuffer)_buf).length(); + if(end-start != textLength) { + throw new IllegalStateException("Told we're for characters " + start + " -> " + end + ", but actually covers " + textLength + " characters!"); + } + if(end < start) { + throw new IllegalStateException("Told we're of negative size! start="+start + " end="+end); + } } + + /** + * Create the StringBuffer from the text and unicode flag + */ + private static StringBuffer buildInitSB(byte[] text, PieceDescriptor pd) { + String str; + try { + if(pd.isUnicode()) { + str = new String(text, "UTF-16LE"); + } else { + str = new String(text, "Cp1252"); + } + } catch(UnsupportedEncodingException e) { + throw new RuntimeException("Your Java is broken! It doesn't know about basic, required character encodings!"); + } + return new StringBuffer(str); + } + /** * @return If this text piece uses unicode */ @@ -67,38 +92,43 @@ public class TextPiece extends PropertyNode implements Comparable public byte[] getRawBytes() { - try - { + try { return ((StringBuffer)_buf).toString().getBytes(_usesUnicode ? "UTF-16LE" : "Cp1252"); + } catch (UnsupportedEncodingException ignore) { + throw new RuntimeException("Your Java is broken! It doesn't know about basic, required character encodings!"); } - catch (UnsupportedEncodingException ignore) - { - // shouldn't ever happen considering we wouldn't have been able to - // create the StringBuffer w/o getting this exception - return ((StringBuffer)_buf).toString().getBytes(); - } - } + /** + * Returns part of the string. + * Works only in characters, not in bytes! 
+ * @param start Local start position, in characters + * @param end Local end position, in characters + * @return The requested part of the string + */ public String substring(int start, int end) { - int denominator = _usesUnicode ? 2 : 1; - - return ((StringBuffer)_buf).substring(start/denominator, end/denominator); + StringBuffer buf = (StringBuffer)_buf; + + // Validate + if(start < 0) { + throw new StringIndexOutOfBoundsException("Can't request a substring before 0 - asked for " + start); + } + if(end > buf.length()) { + throw new StringIndexOutOfBoundsException("Index " + end + " out of range 0 -> " + buf.length()); + } + return buf.substring(start, end); } - public void adjustForDelete(int start, int length) - { - - // length is expected to be the number of code-points, - // not the number of characters + /** + * Adjusts the internal string for deleting + * some characters within this. + * @param start The start position for the delete, in characters + * @param length The number of characters to delete + */ + public void adjustForDelete(int start, int length) { int numChars = length; - if (usesUnicode()) { - - start /= 2; - numChars = (length / 2); - } int myStart = getStart(); int myEnd = getEnd(); @@ -121,9 +151,18 @@ public class TextPiece extends PropertyNode implements Comparable super.adjustForDelete(start, length); } + /** + * Returns the length, in characters + */ public int characterLength() { - return (getEnd() - getStart()) / (_usesUnicode ? 2 : 1); + return (getEnd() - getStart()); + } + /** + * Returns the length, in bytes + */ + public int bytesLength() { + return (getEnd() - getStart()) * (_usesUnicode ? 2 : 1); } public boolean equals(Object o) @@ -138,9 +177,11 @@ public class TextPiece extends PropertyNode implements Comparable } + /** + * Returns the character position we start at. 
+ */ public int getCP() { - return _cpStart; + return getStart(); } - } diff --git a/src/scratchpad/src/org/apache/poi/hwpf/model/TextPieceTable.java b/src/scratchpad/src/org/apache/poi/hwpf/model/TextPieceTable.java index 39f0007b67..5e903ecb8a 100644 --- a/src/scratchpad/src/org/apache/poi/hwpf/model/TextPieceTable.java +++ b/src/scratchpad/src/org/apache/poi/hwpf/model/TextPieceTable.java @@ -28,6 +28,11 @@ import java.util.ArrayList; import java.util.List; /** + * The piece table for matching up character positions + * to bits of text. + * This mostly works in bytes, but the TextPieces + * themselves work in characters. This does the icky + * conversion. * @author Ryan Ackley */ public class TextPieceTable @@ -36,8 +41,7 @@ public class TextPieceTable //int _multiple; int _cpMin; - public TextPieceTable() - { + public TextPieceTable() { } public TextPieceTable(byte[] documentStream, byte[] tableStream, int offset, @@ -47,7 +51,6 @@ public class TextPieceTable // get our plex of PieceDescriptors PlexOfCps pieceTable = new PlexOfCps(tableStream, offset, size, PieceDescriptor.getSizeInBytes()); - //_multiple = 2; int length = pieceTable.length(); PieceDescriptor[] pieces = new PieceDescriptor[length]; @@ -57,11 +60,6 @@ public class TextPieceTable { GenericPropertyNode node = pieceTable.getProperty(x); pieces[x] = new PieceDescriptor(node.getBytes(), 0); - -// if (!pieces[x].isUnicode()) -// { -// _multiple = 1; -// } } int firstPieceFilePosition = pieces[0].getFilePosition(); @@ -72,26 +70,28 @@ public class TextPieceTable { int start = pieces[x].getFilePosition(); PropertyNode node = pieceTable.getProperty(x); - int nodeStart = node.getStart(); - - // multiple will be 2 if there is only one piece and its unicode. Some - // type of optimization. + + // Grab the start and end, which are in characters + int nodeStartChars = node.getStart(); + int nodeEndChars = node.getEnd(); + + // What's the relationship between bytes and characters? 
boolean unicode = pieces[x].isUnicode(); - int multiple = 1; - if (unicode) - { + if (unicode) { multiple = 2; } - int nodeEnd = ((node.getEnd() - nodeStart) * multiple) + nodeStart; - int textSize = nodeEnd - nodeStart; + + // Figure out the length, in bytes and chars + int textSizeChars = (nodeEndChars - nodeStartChars); + int textSizeBytes = textSizeChars * multiple; + // Grab the data that makes up the piece + byte[] buf = new byte[textSizeBytes]; + System.arraycopy(documentStream, start, buf, 0, textSizeBytes); - byte[] buf = new byte[textSize]; - System.arraycopy(documentStream, start, buf, 0, textSize); - - int startFilePosition = start - firstPieceFilePosition; - _textPieces.add(new TextPiece(startFilePosition, startFilePosition+textSize, buf, pieces[x], node.getStart())); + // And now build the piece + _textPieces.add(new TextPiece(nodeStartChars, nodeEndChars, buf, pieces[x], node.getStart())); } } @@ -113,7 +113,6 @@ public class TextPieceTable //int fcMin = docStream.getOffset(); int size = _textPieces.size(); - int bumpDown = 0; for (int x = 0; x < size; x++) { TextPiece next = (TextPiece)_textPieces.get(x); @@ -134,47 +133,43 @@ public class TextPieceTable // write the text to the docstream and save the piece descriptor to the // plex which will be written later to the tableStream. 
- //if (_multiple == 1 && pd.isUnicode() && docStream.write(next.getRawBytes()); + // The TextPiece is already in characters, which + // makes our life much easier int nodeStart = next.getStart(); - int multiple = 1; - if (pd.isUnicode()) - { - multiple = 2; - } - textPlex.addProperty(new GenericPropertyNode(nodeStart - bumpDown, - ((next.getEnd() - nodeStart)/multiple + nodeStart) - bumpDown, + int nodeEnd = next.getEnd(); + textPlex.addProperty(new GenericPropertyNode(nodeStart, nodeEnd, pd.toByteArray())); - - if (pd.isUnicode()) - { - bumpDown += ((next.getEnd() - nodeStart)/multiple); - } - - } return textPlex.toByteArray(); } - - public int adjustForInsert(int listIndex, int length) - { + /** + * Adjust all the text pieces after inserting + * some text into one of them + * @param listIndex The TextPiece that had characters inserted into it + * @param length The number of characters inserted + */ + public int adjustForInsert(int listIndex, int length) { int size = _textPieces.size(); TextPiece tp = (TextPiece)_textPieces.get(listIndex); - - //The text piece stores the length on file. - length = length * (tp.usesUnicode() ? 2 : 1); + + // Update with the new end tp.setEnd(tp.getEnd() + length); + + // Now change all subsequent ones for (int x = listIndex + 1; x < size; x++) { tp = (TextPiece)_textPieces.get(x); tp.setStart(tp.getStart() + length); tp.setEnd(tp.getEnd() + length); } + + // All done return length; } diff --git a/src/scratchpad/src/org/apache/poi/hwpf/usermodel/Range.java b/src/scratchpad/src/org/apache/poi/hwpf/usermodel/Range.java index 0ef944f136..9838f9ef21 100644 --- a/src/scratchpad/src/org/apache/poi/hwpf/usermodel/Range.java +++ b/src/scratchpad/src/org/apache/poi/hwpf/usermodel/Range.java @@ -137,7 +137,8 @@ public class Range /** * Used to construct a Range from a document. This is generally used to - * create a Range that spans the whole document. 
+ * create a Range that spans the whole document, or at least one + * whole part of the document (eg main text, header, comment) * * @param start Starting character offset of the range. * @param end Ending character offset of the range. @@ -259,15 +260,21 @@ public class Range for (int x = _textStart; x < _textEnd; x++) { TextPiece piece = (TextPiece)_text.get(x); - int start = _start > piece.getStart() ? _start - piece.getStart() : 0; - int end = _end <= piece.getEnd() ? _end - piece.getStart() : piece.getEnd() - piece.getStart(); - - if(piece.usesUnicode()) // convert the byte pointers to char pointers - { - start/=2; - end/=2; + + // Figure out where in this piece the text + // we're after lives + int rStart = 0; + int rEnd = piece.characterLength(); + if(_start > piece.getStart()) { + rStart = _start - piece.getStart(); } - sb.append(piece.getStringBuffer().substring(start, end)); + if(_end < piece.getEnd()) { + rEnd -= (piece.getEnd() - _end); + } + + // Luckily TextPieces work in characters, so we don't + // need to worry about unicode here + sb.append(piece.substring(rStart, rEnd)); } return sb.toString(); } @@ -929,9 +936,11 @@ public class Range } /** - * Adjust the value of FIB.CCPText after an insert or a delete... + * Adjust the value of FIB.CCPText after an insert or a delete... 
+ * + * TODO - handle other kinds of text, eg Headers * - * @param adjustment The (signed) value that should be added to FIB.CCPText + * @param adjustment The (signed) value that should be added to FIB.CCPText */ protected void adjustFIB(int adjustment) { diff --git a/src/scratchpad/testcases/org/apache/poi/hwpf/TestHWPFRangeParts.java b/src/scratchpad/testcases/org/apache/poi/hwpf/TestHWPFRangeParts.java index c0aa62569a..af0faf96c9 100644 --- a/src/scratchpad/testcases/org/apache/poi/hwpf/TestHWPFRangeParts.java +++ b/src/scratchpad/testcases/org/apache/poi/hwpf/TestHWPFRangeParts.java @@ -78,10 +78,13 @@ public class TestHWPFRangeParts extends TestCase { ; private static final String u_header = + "\r\r" + "This is a simple header, with a \u20ac euro symbol in it.\r" ; private static final String u_footer = - "The footer, with Moli\u00e8re, has Unicode in it.\r" + "\r\r\r" + + "The footer, with Moli\u00e8re, has Unicode in it.\r" + + "\r\r\r\r" ; /** diff --git a/src/scratchpad/testcases/org/apache/poi/hwpf/model/TestTextPieceTable.java b/src/scratchpad/testcases/org/apache/poi/hwpf/model/TestTextPieceTable.java index 30e9ee138f..eef4f0d5eb 100644 --- a/src/scratchpad/testcases/org/apache/poi/hwpf/model/TestTextPieceTable.java +++ b/src/scratchpad/testcases/org/apache/poi/hwpf/model/TestTextPieceTable.java @@ -18,19 +18,21 @@ package org.apache.poi.hwpf.model; -import junit.framework.*; - +import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; -import java.util.ArrayList; +import java.io.File; +import java.io.FileInputStream; + +import junit.framework.TestCase; -import org.apache.poi.hwpf.*; -import org.apache.poi.hwpf.model.io.*; +import org.apache.poi.hwpf.HWPFDocFixture; +import org.apache.poi.hwpf.HWPFDocument; +import org.apache.poi.hwpf.model.io.HWPFFileSystem; -public class TestTextPieceTable - extends TestCase -{ +public class TestTextPieceTable extends TestCase { private HWPFDocFixture _hWPFDocFixture; + private String dirname; public 
TestTextPieceTable(String name) { @@ -63,9 +65,117 @@ public class TestTextPieceTable TextPieceTable newTextPieceTable = newCft.getTextPieceTable(); assertEquals(oldTextPieceTable, newTextPieceTable); - - } + + /** + * Check that we do the positions correctly when + * working with pure-ascii + */ + public void testAsciiParts() throws Exception { + HWPFDocument doc = new HWPFDocument( + new FileInputStream(new File(dirname, "ThreeColHeadFoot.doc")) + ); + TextPieceTable tbl = doc.getTextTable(); + + // All ascii, so stored in one big lump + assertEquals(1, tbl.getTextPieces().size()); + TextPiece tp = (TextPiece)tbl.getTextPieces().get(0); + + assertEquals(0, tp.getStart()); + assertEquals(339, tp.getEnd()); + assertEquals(339, tp.characterLength()); + assertEquals(339, tp.bytesLength()); + assertTrue(tp.getStringBuffer().toString().startsWith("This is a sample word document")); + + + // Save and re-load + HWPFDocument docB = saveAndReload(doc); + tbl = docB.getTextTable(); + + assertEquals(1, tbl.getTextPieces().size()); + tp = (TextPiece)tbl.getTextPieces().get(0); + + assertEquals(0, tp.getStart()); + assertEquals(339, tp.getEnd()); + assertEquals(339, tp.characterLength()); + assertEquals(339, tp.bytesLength()); + assertTrue(tp.getStringBuffer().toString().startsWith("This is a sample word document")); + } + + /** + * Check that we do the positions correctly when + * working with a mix ascii, unicode file + */ + public void testUnicodeParts() throws Exception { + HWPFDocument doc = new HWPFDocument( + new FileInputStream(new File(dirname, "HeaderFooterUnicode.doc")) + ); + TextPieceTable tbl = doc.getTextTable(); + + // In three bits, split every 512 bytes + assertEquals(3, tbl.getTextPieces().size()); + TextPiece tpA = (TextPiece)tbl.getTextPieces().get(0); + TextPiece tpB = (TextPiece)tbl.getTextPieces().get(1); + TextPiece tpC = (TextPiece)tbl.getTextPieces().get(2); + + assertTrue(tpA.usesUnicode()); + assertTrue(tpB.usesUnicode()); + 
assertTrue(tpC.usesUnicode()); + + assertEquals(256, tpA.characterLength()); + assertEquals(256, tpB.characterLength()); + assertEquals(19, tpC.characterLength()); + + assertEquals(512, tpA.bytesLength()); + assertEquals(512, tpB.bytesLength()); + assertEquals(38, tpC.bytesLength()); + + assertEquals(0, tpA.getStart()); + assertEquals(256, tpA.getEnd()); + assertEquals(256, tpB.getStart()); + assertEquals(512, tpB.getEnd()); + assertEquals(512, tpC.getStart()); + assertEquals(531, tpC.getEnd()); + + + // Save and re-load + HWPFDocument docB = saveAndReload(doc); + tbl = docB.getTextTable(); + + assertEquals(3, tbl.getTextPieces().size()); + tpA = (TextPiece)tbl.getTextPieces().get(0); + tpB = (TextPiece)tbl.getTextPieces().get(1); + tpC = (TextPiece)tbl.getTextPieces().get(2); + + assertTrue(tpA.usesUnicode()); + assertTrue(tpB.usesUnicode()); + assertTrue(tpC.usesUnicode()); + + assertEquals(256, tpA.characterLength()); + assertEquals(256, tpB.characterLength()); + assertEquals(19, tpC.characterLength()); + + assertEquals(512, tpA.bytesLength()); + assertEquals(512, tpB.bytesLength()); + assertEquals(38, tpC.bytesLength()); + + assertEquals(0, tpA.getStart()); + assertEquals(256, tpA.getEnd()); + assertEquals(256, tpB.getStart()); + assertEquals(512, tpB.getEnd()); + assertEquals(512, tpC.getStart()); + assertEquals(531, tpC.getEnd()); + } + + protected HWPFDocument saveAndReload(HWPFDocument doc) throws Exception { + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + doc.write(baos); + + return new HWPFDocument( + new ByteArrayInputStream(baos.toByteArray()) + ); + } + protected void setUp() throws Exception { @@ -73,6 +183,8 @@ public class TestTextPieceTable _hWPFDocFixture = new HWPFDocFixture(this); _hWPFDocFixture.setUp(); + + dirname = System.getProperty("HWPF.testdata.path"); } protected void tearDown() -- 2.39.5