package org.apache.poi.hwpf.model;
+import org.apache.poi.util.CodePageUtil;
import org.apache.poi.util.Internal;
import org.apache.poi.util.NotImplemented;
public OldTextPiece(int start, int end, byte[] text, PieceDescriptor pd) {
super(start, end, text, pd);
this.rawBytes = text;
- if (end < start) {
- throw new IllegalStateException("Told we're of negative size! start=" + start + " end=" + end);
- }
}
+ @Override
+ protected void validateLengths(int start, int end, int length, PieceDescriptor pd) {
+ //things are still wonky with Big5 char/byte length mapping
+ //sometimes working w/ Java 8 but not w/ Java 7!
+ //for now, if we're dealing w/ Big5 don't bother checking
+ if (pd.getCharset() != null &&
+ CodePageUtil.VARIABLE_BYTE_CHARSETS.contains(pd.getCharset())) {
+ return;
+ }
+ super.validateLengths(start, end, length, pd);
+ }
/**
* @return nothing, ever. Always throws an UnsupportedOperationException
* @throws UnsupportedOperationException
}
+ @Override
public StringBuilder getStringBuilder() {
return (StringBuilder) _buf;
}
// Validate
int textLength = ((CharSequence) _buf).length();
- if (end - start != textLength) {
- throw new IllegalStateException("Told we're for characters " + start + " -> " + end + ", but actually covers " + textLength + " characters!");
- }
+ validateLengths(start, end, textLength, pd);
if (end < start) {
throw new IllegalStateException("Told we're of negative size! start=" + start + " end=" + end);
}
}
+ protected void validateLengths(int start, int end, int textLength, PieceDescriptor pd) {
+ if (end - start != textLength) {
+ throw new IllegalStateException("Told we're for characters " + start + " -> " + end + ", but actually covers " + textLength + " characters!");
+ }
+ }
/**
* Create the StringBuilder from the text and unicode flag
*/
assertContains(txt, "also maintain");//this is at a critical juncture
assertContains(txt, "which are available for");//this too
+ /*
+ The bytes for the following test:
+ 170 : 78 : x
+ 171 : 0 :
+ 172 : d : <r>
+ 173 : 35 : 5
+ 174 : 39 : 9
+ 175 : 0 :
+ 176 : 2d : -
+ 177 : 0 :
+ 178 : 35 : 5
+ 179 : 0 :
+ 180 : 35 : 5
+
+ Note that we are skipping over the value "5" at offset 173.
+ This is an apparently invalid sequence in MS's encoding scheme
+
+ When I open the document in MSWord, I also see "\r9-55"
+ */
+ assertContains(txt, "\n9-55 xxxxx block5");
//TODO: figure out why these two aren't passing
// assertContains(txt, "\u2019\u0078 block2");//make sure smart quote is extracted correctly
// assertContains(txt, "We are able to");//not sure if we can get this easily?