Big big unicode rationalisation in text piece code

author Nick Burch <nick@apache.org>

Sat, 9 Aug 2008 19:34:38 +0000 (19:34 +0000)

committer Nick Burch <nick@apache.org>

Sat, 9 Aug 2008 19:34:38 +0000 (19:34 +0000)
author Nick Burch <nick@apache.org>
Sat, 9 Aug 2008 19:34:38 +0000 (19:34 +0000)
committer Nick Burch <nick@apache.org>
Sat, 9 Aug 2008 19:34:38 +0000 (19:34 +0000)
diff --git a/src/scratchpad/src/org/apache/poi/hwpf/model/PropertyNode.java b/src/scratchpad/src/org/apache/poi/hwpf/model/PropertyNode.java

index 07cb18ae7a65b1286876f8b0c6adaf1d703c7415..42c5f5c27eb02ade48c9506105365527e7179450 100644 (file)
--- a/src/scratchpad/src/org/apache/poi/hwpf/model/PropertyNode.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/model/PropertyNode.java
@@ -22,20 +22,22 @@ import java.util.Arrays;
  
  /**
   * Represents a lightweight node in the Trees used to store content
- * properties.
+ * properties. Works only in characters.
   *
   * @author Ryan Ackley
   */
  public abstract class PropertyNode implements Comparable, Cloneable
  {
    protected Object _buf;
+  /** The start, in characters */
    private int _cpStart;
+  /** The end, in characters */
    private int _cpEnd;
  
  
    /**
-   * @param fcStart The start of the text for this property.
-   * @param fcEnd The end of the text for this property.
+   * @param fcStart The start of the text for this property, in characters.
+   * @param fcEnd The end of the text for this property, in characters.
     * @param buf FIXME: Old documentation is: "grpprl The property description in compressed form."
     */
    protected PropertyNode(int fcStart, int fcEnd, Object buf)
@@ -43,11 +45,10 @@ public abstract class PropertyNode implements Comparable, Cloneable
        _cpStart = fcStart;
        _cpEnd = fcEnd;
        _buf = buf;
-
    }
  
    /**
-   * @return The offset of this property's text.
+   * @return The start offset of this property's text.
     */
    public int getStart()
    {
@@ -142,9 +143,4 @@ public abstract class PropertyNode implements Comparable, Cloneable
          return 1;
        }
    }
-
-
-
-
-
  }
diff --git a/src/scratchpad/src/org/apache/poi/hwpf/model/TextPiece.java b/src/scratchpad/src/org/apache/poi/hwpf/model/TextPiece.java

index 227200ab5d9b811a89ed8a4257a0fe9547c2a1fb..f326e59b88001be5237ed1b069216f61cbb4a863 100644 (file)
--- a/src/scratchpad/src/org/apache/poi/hwpf/model/TextPiece.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/model/TextPiece.java
@@ -22,6 +22,9 @@ package org.apache.poi.hwpf.model;
  import java.io.UnsupportedEncodingException;
  /**
   * Lightweight representation of a text piece.
+ * Works in the character domain, not the byte domain, so you
+ *  need to have turned byte references into character
+ *  references before getting here.
   *
   * @author Ryan Ackley
   */
@@ -32,21 +35,43 @@ public class TextPiece extends PropertyNode implements Comparable
  
    private PieceDescriptor _pd;
  
-  private int _cpStart;
-
    /**
-   * @param start Offset in main document stream.
+   * @param start Beginning offset in main document stream, in characters.
+   * @param end Ending offset in main document stream, in characters.
+   * @param text The raw bytes of our text
     */
-  public TextPiece(int start, int end, byte[] text, PieceDescriptor pd, int cpStart)
-    throws UnsupportedEncodingException
-  {
-     /** start - end is length on file. This is double the expected when its
-     * unicode.*/
-    super(start, end, new StringBuffer(new String(text, pd.isUnicode() ? "UTF-16LE" : "Cp1252")));
-    _usesUnicode = pd.isUnicode();
-    _pd = pd;
-    _cpStart = cpStart;
+  public TextPiece(int start, int end, byte[] text, PieceDescriptor pd, int cpStart) {
+         super(start, end, buildInitSB(text, pd));
+         _usesUnicode = pd.isUnicode();
+         _pd = pd;
+         
+         // Validate
+         int textLength = ((StringBuffer)_buf).length();
+         if(end-start != textLength) {
+                 throw new IllegalStateException("Told we're for characters " + start + " -> " + end + ", but actually covers " + textLength + " characters!");
+         }
+         if(end < start) {
+                 throw new IllegalStateException("Told we're of negative size! start="+start + " end="+end);
+         }
    }
+  
+  /**
+   * Create the StringBuffer from the text and unicode flag
+   */
+  private static StringBuffer buildInitSB(byte[] text, PieceDescriptor pd) {
+         String str;
+         try {
+                 if(pd.isUnicode()) {
+                         str = new String(text, "UTF-16LE");
+                 } else {
+                         str = new String(text, "Cp1252");
+                 }
+         } catch(UnsupportedEncodingException e) {
+                 throw new RuntimeException("Your Java is broken! It doesn't know about basic, required character encodings!");
+         }
+         return new StringBuffer(str);
+  }
+  
    /**
     * @return If this text piece uses unicode
     */
@@ -67,38 +92,43 @@ public class TextPiece extends PropertyNode implements Comparable
  
     public byte[] getRawBytes()
     {
-     try
-     {
+     try {
         return ((StringBuffer)_buf).toString().getBytes(_usesUnicode ?
             "UTF-16LE" : "Cp1252");
+     } catch (UnsupportedEncodingException ignore) {
+                 throw new RuntimeException("Your Java is broken! It doesn't know about basic, required character encodings!");
       }
-     catch (UnsupportedEncodingException ignore)
-     {
-       // shouldn't ever happen considering we wouldn't have been able to
-       // create the StringBuffer w/o getting this exception
-       return ((StringBuffer)_buf).toString().getBytes();
-     }
-
     }
  
+   /**
+    * Returns part of the string. 
+    * Works only in characters, not in bytes!
+    * @param start Local start position, in characters
+    * @param end Local end position, in characters
+    * @return The requested part of the string
+    */
     public String substring(int start, int end)
     {
-     int denominator = _usesUnicode ? 2 : 1;
-
-     return ((StringBuffer)_buf).substring(start/denominator, end/denominator);
+          StringBuffer buf = (StringBuffer)_buf;
+          
+          // Validate
+          if(start < 0) {
+                  throw new StringIndexOutOfBoundsException("Can't request a substring before 0 - asked for " + start);
+          }
+          if(end > buf.length()) {
+                  throw new StringIndexOutOfBoundsException("Index " + end + " out of range 0 -> " + buf.length());
+          }
+          return buf.substring(start, end);
     }
  
-   public void adjustForDelete(int start, int length)
-   {
-
-          // length is expected to be the number of code-points,
-          // not the number of characters
+   /**
+    * Adjusts the internal string for deleting
+    *  some characters within this.
+    * @param start The start position for the delete, in characters
+    * @param length The number of characters to delete
+    */
+   public void adjustForDelete(int start, int length) {
            int numChars = length;
-          if (usesUnicode()) {
-
-                  start /= 2;
-                  numChars = (length / 2);
-          }
  
            int myStart = getStart();
            int myEnd = getEnd();
@@ -121,9 +151,18 @@ public class TextPiece extends PropertyNode implements Comparable
            super.adjustForDelete(start, length);
     }
  
+   /**
+    * Returns the length, in characters
+    */
     public int characterLength()
     {
-     return (getEnd() - getStart()) / (_usesUnicode ? 2 : 1);
+     return (getEnd() - getStart());
+   }
+   /**
+    * Returns the length, in bytes
+    */
+   public int bytesLength() {
+          return (getEnd() - getStart()) * (_usesUnicode ? 2 : 1);
     }
  
     public boolean equals(Object o)
@@ -138,9 +177,11 @@ public class TextPiece extends PropertyNode implements Comparable
     }
  
  
+   /**
+    * Returns the character position we start at.
+    */
     public int getCP()
     {
-     return _cpStart;
+     return getStart();
     }
-
  }
diff --git a/src/scratchpad/src/org/apache/poi/hwpf/model/TextPieceTable.java b/src/scratchpad/src/org/apache/poi/hwpf/model/TextPieceTable.java

index 39f0007b67a58e5f8471e599ff5bacf6e6dfeb64..5e903ecb8ae70793dc2f71dd472a20ecdac82c39 100644 (file)
--- a/src/scratchpad/src/org/apache/poi/hwpf/model/TextPieceTable.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/model/TextPieceTable.java
@@ -28,6 +28,11 @@ import java.util.ArrayList;
  import java.util.List;
  
  /**
+ * The piece table for matching up character positions
+ *  to bits of text.
+ * This mostly works in bytes, but the TextPieces
+ *  themselves work in characters. This does the icky
+ *  conversion.
   * @author Ryan Ackley
   */
  public class TextPieceTable
@@ -36,8 +41,7 @@ public class TextPieceTable
    //int _multiple;
    int _cpMin;
  
-  public TextPieceTable()
-  {
+  public TextPieceTable() {
    }
  
    public TextPieceTable(byte[] documentStream, byte[] tableStream, int offset,
@@ -47,7 +51,6 @@ public class TextPieceTable
      // get our plex of PieceDescriptors
      PlexOfCps pieceTable = new PlexOfCps(tableStream, offset, size, PieceDescriptor.getSizeInBytes());
  
-    //_multiple = 2;
      int length = pieceTable.length();
      PieceDescriptor[] pieces = new PieceDescriptor[length];
  
@@ -57,11 +60,6 @@ public class TextPieceTable
      {
        GenericPropertyNode node = pieceTable.getProperty(x);
        pieces[x] = new PieceDescriptor(node.getBytes(), 0);
-
-//      if (!pieces[x].isUnicode())
-//      {
-//        _multiple = 1;
-//      }
      }
  
      int firstPieceFilePosition = pieces[0].getFilePosition();
@@ -72,26 +70,28 @@ public class TextPieceTable
      {
        int start = pieces[x].getFilePosition();
        PropertyNode node = pieceTable.getProperty(x);
-      int nodeStart = node.getStart();
-
-      // multiple will be 2 if there is only one piece and its unicode. Some
-      // type of optimization.
+      
+      // Grab the start and end, which are in characters
+      int nodeStartChars = node.getStart();
+      int nodeEndChars = node.getEnd();
+      
+      // What's the relationship between bytes and characters?
        boolean unicode = pieces[x].isUnicode();
-
        int multiple = 1;
-      if (unicode)
-      {
+      if (unicode) {
          multiple = 2;
        }
-      int nodeEnd = ((node.getEnd() - nodeStart) * multiple) + nodeStart;
-      int textSize = nodeEnd - nodeStart;
+      
+      // Figure out the length, in bytes and chars
+      int textSizeChars = (nodeEndChars - nodeStartChars);
+      int textSizeBytes = textSizeChars * multiple;
  
+      // Grab the data that makes up the piece
+      byte[] buf = new byte[textSizeBytes];
+      System.arraycopy(documentStream, start, buf, 0, textSizeBytes);
  
-      byte[] buf = new byte[textSize];
-      System.arraycopy(documentStream, start, buf, 0, textSize);
-
-      int startFilePosition = start - firstPieceFilePosition;
-      _textPieces.add(new TextPiece(startFilePosition, startFilePosition+textSize, buf, pieces[x], node.getStart()));
+      // And now build the piece
+      _textPieces.add(new TextPiece(nodeStartChars, nodeEndChars, buf, pieces[x], node.getStart()));
      }
    }
  
@@ -113,7 +113,6 @@ public class TextPieceTable
      //int fcMin = docStream.getOffset();
  
      int size = _textPieces.size();
-    int bumpDown = 0;
      for (int x = 0; x < size; x++)
      {
        TextPiece next = (TextPiece)_textPieces.get(x);
@@ -134,47 +133,43 @@ public class TextPieceTable
  
        // write the text to the docstream and save the piece descriptor to the
        // plex which will be written later to the tableStream.
-      //if (_multiple == 1 && pd.isUnicode() &&
        docStream.write(next.getRawBytes());
  
+      // The TextPiece is already in characters, which
+      //  makes our life much easier
        int nodeStart = next.getStart();
-      int multiple = 1;
-      if (pd.isUnicode())
-      {
-        multiple = 2;
-      }
-      textPlex.addProperty(new GenericPropertyNode(nodeStart - bumpDown,
-        ((next.getEnd() - nodeStart)/multiple + nodeStart) - bumpDown,
+      int nodeEnd = next.getEnd();
+      textPlex.addProperty(new GenericPropertyNode(nodeStart, nodeEnd,
          pd.toByteArray()));
-
-      if (pd.isUnicode())
-      {
-        bumpDown += ((next.getEnd() - nodeStart)/multiple);
-      }
-
-
      }
  
      return textPlex.toByteArray();
  
    }
  
-
-  public int adjustForInsert(int listIndex, int length)
-  {
+  /**
+   * Adjust all the text pieces after inserting
+   *  some text into one of them
+   * @param listIndex The index of the TextPiece that had characters inserted into it
+   * @param length The number of characters inserted
+   */
+  public int adjustForInsert(int listIndex, int length) {
      int size = _textPieces.size();
  
      TextPiece tp = (TextPiece)_textPieces.get(listIndex);
-
-    //The text piece stores the length on file.
-    length = length * (tp.usesUnicode() ? 2 : 1);
+    
+    // Update with the new end
      tp.setEnd(tp.getEnd() + length);
+    
+    // Now change all subsequent ones
      for (int x = listIndex + 1; x < size; x++)
      {
        tp = (TextPiece)_textPieces.get(x);
        tp.setStart(tp.getStart() + length);
        tp.setEnd(tp.getEnd() + length);
      }
+    
+    // All done
      return length;
    }
  
diff --git a/src/scratchpad/src/org/apache/poi/hwpf/usermodel/Range.java b/src/scratchpad/src/org/apache/poi/hwpf/usermodel/Range.java

index 0ef944f1366e3f1a4c3f0893ab4fd0a878c52be1..9838f9ef21102d16b3d7584bc409d2ca8f93f3bc 100644 (file)
--- a/src/scratchpad/src/org/apache/poi/hwpf/usermodel/Range.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/usermodel/Range.java
@@ -137,7 +137,8 @@ public class Range
  
    /**
     * Used to construct a Range from a document. This is generally used to
-   * create a Range that spans the whole document.
+   * create a Range that spans the whole document, or at least one
+   * whole part of the document (eg main text, header, comment)
     *
     * @param start Starting character offset of the range.
     * @param end Ending character offset of the range.
@@ -259,15 +260,21 @@ public class Range
      for (int x = _textStart; x < _textEnd; x++)
      {
        TextPiece piece = (TextPiece)_text.get(x);
-      int start = _start > piece.getStart() ? _start - piece.getStart() : 0;
-      int end = _end <= piece.getEnd() ? _end - piece.getStart() : piece.getEnd() - piece.getStart();
-
-      if(piece.usesUnicode()) // convert the byte pointers to char pointers
-      {
-        start/=2;
-        end/=2;
+      
+      // Figure out where in this piece the text
+      //  we're after lives
+      int rStart = 0;
+      int rEnd = piece.characterLength();
+      if(_start > piece.getStart()) {
+         rStart = _start - piece.getStart();
        }
-      sb.append(piece.getStringBuffer().substring(start, end));
+      if(_end < piece.getEnd()) {
+         rEnd -= (piece.getEnd() - _end);
+      }
+      
+      // Luckily TextPieces work in characters, so we don't
+      //  need to worry about unicode here
+      sb.append(piece.substring(rStart, rEnd));
      }
      return sb.toString();
    }
@@ -929,9 +936,11 @@ public class Range
    }
  
    /**
-   *   Adjust the value of <code>FIB.CCPText</code> after an insert or a delete...
+   * Adjust the value of <code>FIB.CCPText</code> after an insert or a delete...
+   *
+   * TODO - handle other kinds of text, eg Headers
     *
-   *   @param  adjustment      The (signed) value that should be added to <code>FIB.CCPText</code>
+   * @param    adjustment      The (signed) value that should be added to <code>FIB.CCPText</code>
     */
    protected void adjustFIB(int adjustment) {
  
diff --git a/src/scratchpad/testcases/org/apache/poi/hwpf/TestHWPFRangeParts.java b/src/scratchpad/testcases/org/apache/poi/hwpf/TestHWPFRangeParts.java

index c0aa62569a7e53e98fc2a5b9af94e2b6f03c0dee..af0faf96c9f417963102bc9dc548bec299d7c038 100644 (file)
--- a/src/scratchpad/testcases/org/apache/poi/hwpf/TestHWPFRangeParts.java
+++ b/src/scratchpad/testcases/org/apache/poi/hwpf/TestHWPFRangeParts.java
@@ -78,10 +78,13 @@ public class TestHWPFRangeParts extends TestCase {
         ;
         
         private static final String u_header =
+               "\r\r" +
                 "This is a simple header, with a \u20ac euro symbol in it.\r"
         ;
         private static final String u_footer =
-               "The footer, with Moli\u00e8re, has Unicode in it.\r"
+               "\r\r\r" +
+               "The footer, with Moli\u00e8re, has Unicode in it.\r" +
+               "\r\r\r\r"
         ;
         
         /**
diff --git a/src/scratchpad/testcases/org/apache/poi/hwpf/model/TestTextPieceTable.java b/src/scratchpad/testcases/org/apache/poi/hwpf/model/TestTextPieceTable.java

index 30e9ee138f670f6a416db5b6a03b46068becb9f3..eef4f0d5eb33866577a7a114a26e0c3339d71394 100644 (file)
--- a/src/scratchpad/testcases/org/apache/poi/hwpf/model/TestTextPieceTable.java
+++ b/src/scratchpad/testcases/org/apache/poi/hwpf/model/TestTextPieceTable.java
@@ -18,19 +18,21 @@
          
  package org.apache.poi.hwpf.model;
  
-import junit.framework.*;
-
+import java.io.ByteArrayInputStream;
  import java.io.ByteArrayOutputStream;
-import java.util.ArrayList;
+import java.io.File;
+import java.io.FileInputStream;
+
+import junit.framework.TestCase;
  
-import org.apache.poi.hwpf.*;
-import org.apache.poi.hwpf.model.io.*;
+import org.apache.poi.hwpf.HWPFDocFixture;
+import org.apache.poi.hwpf.HWPFDocument;
+import org.apache.poi.hwpf.model.io.HWPFFileSystem;
  
  
-public class TestTextPieceTable
-  extends TestCase
-{
+public class TestTextPieceTable extends TestCase {
    private HWPFDocFixture _hWPFDocFixture;
+  private String dirname;
  
    public TestTextPieceTable(String name)
    {
@@ -63,9 +65,117 @@ public class TestTextPieceTable
      TextPieceTable newTextPieceTable = newCft.getTextPieceTable();
  
      assertEquals(oldTextPieceTable, newTextPieceTable);
-
-
    }
+
+       /**
+        * Check that we do the positions correctly when
+        *  working with pure-ascii
+        */
+       public void testAsciiParts() throws Exception {
+               HWPFDocument doc = new HWPFDocument(
+                               new FileInputStream(new File(dirname, "ThreeColHeadFoot.doc"))
+               );
+               TextPieceTable tbl = doc.getTextTable();
+               
+               // All ascii, so stored in one big lump
+               assertEquals(1, tbl.getTextPieces().size());
+               TextPiece tp = (TextPiece)tbl.getTextPieces().get(0);
+               
+               assertEquals(0, tp.getStart());
+               assertEquals(339, tp.getEnd());
+               assertEquals(339, tp.characterLength());
+               assertEquals(339, tp.bytesLength());
+               assertTrue(tp.getStringBuffer().toString().startsWith("This is a sample word document"));
+
+               
+               // Save and re-load
+               HWPFDocument docB = saveAndReload(doc);
+               tbl = docB.getTextTable();
+               
+               assertEquals(1, tbl.getTextPieces().size());
+               tp = (TextPiece)tbl.getTextPieces().get(0);
+               
+               assertEquals(0, tp.getStart());
+               assertEquals(339, tp.getEnd());
+               assertEquals(339, tp.characterLength());
+               assertEquals(339, tp.bytesLength());
+               assertTrue(tp.getStringBuffer().toString().startsWith("This is a sample word document"));
+       }
+
+       /**
+        * Check that we do the positions correctly when
+        *  working with a mix ascii, unicode file
+        */
+       public void testUnicodeParts() throws Exception {
+               HWPFDocument doc = new HWPFDocument(
+                               new FileInputStream(new File(dirname, "HeaderFooterUnicode.doc"))
+               );
+               TextPieceTable tbl = doc.getTextTable();
+               
+               // In three bits, split every 512 bytes
+               assertEquals(3, tbl.getTextPieces().size());
+               TextPiece tpA = (TextPiece)tbl.getTextPieces().get(0);
+               TextPiece tpB = (TextPiece)tbl.getTextPieces().get(1);
+               TextPiece tpC = (TextPiece)tbl.getTextPieces().get(2);
+               
+               assertTrue(tpA.usesUnicode());
+               assertTrue(tpB.usesUnicode());
+               assertTrue(tpC.usesUnicode());
+               
+               assertEquals(256, tpA.characterLength());
+               assertEquals(256, tpB.characterLength());
+               assertEquals(19, tpC.characterLength());
+               
+               assertEquals(512, tpA.bytesLength());
+               assertEquals(512, tpB.bytesLength());
+               assertEquals(38, tpC.bytesLength());
+               
+               assertEquals(0, tpA.getStart());
+               assertEquals(256, tpA.getEnd());
+               assertEquals(256, tpB.getStart());
+               assertEquals(512, tpB.getEnd());
+               assertEquals(512, tpC.getStart());
+               assertEquals(531, tpC.getEnd());
+               
+               
+               // Save and re-load
+               HWPFDocument docB = saveAndReload(doc);
+               tbl = docB.getTextTable();
+               
+               assertEquals(3, tbl.getTextPieces().size());
+               tpA = (TextPiece)tbl.getTextPieces().get(0);
+               tpB = (TextPiece)tbl.getTextPieces().get(1);
+               tpC = (TextPiece)tbl.getTextPieces().get(2);
+               
+               assertTrue(tpA.usesUnicode());
+               assertTrue(tpB.usesUnicode());
+               assertTrue(tpC.usesUnicode());
+               
+               assertEquals(256, tpA.characterLength());
+               assertEquals(256, tpB.characterLength());
+               assertEquals(19, tpC.characterLength());
+               
+               assertEquals(512, tpA.bytesLength());
+               assertEquals(512, tpB.bytesLength());
+               assertEquals(38, tpC.bytesLength());
+               
+               assertEquals(0, tpA.getStart());
+               assertEquals(256, tpA.getEnd());
+               assertEquals(256, tpB.getStart());
+               assertEquals(512, tpB.getEnd());
+               assertEquals(512, tpC.getStart());
+               assertEquals(531, tpC.getEnd());
+       }
+
+       protected HWPFDocument saveAndReload(HWPFDocument doc) throws Exception {
+               ByteArrayOutputStream baos = new ByteArrayOutputStream();
+               doc.write(baos);
+               
+               return new HWPFDocument(
+                               new ByteArrayInputStream(baos.toByteArray())
+               );
+       }
+  
    protected void setUp()
      throws Exception
    {
@@ -73,6 +183,8 @@ public class TestTextPieceTable
  
      _hWPFDocFixture = new HWPFDocFixture(this);
      _hWPFDocFixture.setUp();
+    
+    dirname = System.getProperty("HWPF.testdata.path");
    }
  
    protected void tearDown()
author	Nick Burch <nick@apache.org>
	Sat, 9 Aug 2008 19:34:38 +0000 (19:34 +0000)
committer	Nick Burch <nick@apache.org>
	Sat, 9 Aug 2008 19:34:38 +0000 (19:34 +0000)
src/scratchpad/src/org/apache/poi/hwpf/model/PropertyNode.java		patch \| blob \| history
src/scratchpad/src/org/apache/poi/hwpf/model/TextPiece.java		patch \| blob \| history
src/scratchpad/src/org/apache/poi/hwpf/model/TextPieceTable.java		patch \| blob \| history
src/scratchpad/src/org/apache/poi/hwpf/usermodel/Range.java		patch \| blob \| history
src/scratchpad/testcases/org/apache/poi/hwpf/TestHWPFRangeParts.java		patch \| blob \| history
src/scratchpad/testcases/org/apache/poi/hwpf/model/TestTextPieceTable.java		patch \| blob \| history