<changes>
<release version="3.5-beta7" date="2009-??-??">
+ <action dev="POI-DEVELOPERS" type="fix">46610 - Improved HWPF to better handle unicode</action>
<action dev="POI-DEVELOPERS" type="fix">47261 - Fixed SlideShow#removeSlide to remove references to Notes</action>
<action dev="POI-DEVELOPERS" type="fix">47375 - Fixed HSSFHyperlink to correctly set inter-sheet and file links</action>
<action dev="POI-DEVELOPERS" type="fix">47384 - Fixed ExternalNameRecord to handle unicode names</action>
* and characters.
*/
public abstract class BytePropertyNode extends PropertyNode {
- private boolean isUnicode;
+ private final int startBytes;
+ private final int endBytes;
/**
* @param fcStart The start of the text for this property, in _bytes_
* @param fcEnd The end of the text for this property, in _bytes_
*/
- public BytePropertyNode(int fcStart, int fcEnd, Object buf, boolean isUnicode) {
+ public BytePropertyNode(int fcStart, int fcEnd, CharIndexTranslator translator, Object buf) {
super(
- generateCp(fcStart, isUnicode),
- generateCp(fcEnd, isUnicode),
+ translator.getCharIndex(fcStart),
+ translator.getCharIndex(fcEnd),
buf
);
- this.isUnicode = isUnicode;
- }
- private static int generateCp(int val, boolean isUnicode) {
- if(isUnicode)
- return val/2;
- return val;
+ this.startBytes = fcStart;
+ this.endBytes = fcEnd;
}
- public boolean isUnicode() {
- return isUnicode;
- }
public int getStartBytes() {
- if(isUnicode)
- return getStart()*2;
- return getStart();
+ return startBytes;
}
+
public int getEndBytes() {
- if(isUnicode)
- return getEnd()*2;
- return getEnd();
+ return endBytes;
}
}
public void insert(int listIndex, int cpStart, SprmBuffer buf)
{
- boolean needsToBeUnicode = tpt.isUnicodeAtCharOffset(cpStart);
- CHPX insertChpx = new CHPX(0, 0, buf, needsToBeUnicode);
+ CHPX insertChpx = new CHPX(0, 0, tpt,buf);
// Ensure character offsets are really characters
insertChpx.setStart(cpStart);
// Original, until insert at point
// New one
// Clone of original, on to the old end
- CHPX clone = new CHPX(0, 0, chpx.getSprmBuf(), needsToBeUnicode);
+ CHPX clone = new CHPX(0, 0, tpt,chpx.getSprmBuf());
// Again ensure contains character based offsets no matter what
clone.setStart(cpStart);
clone.setEnd(chpx.getEnd());
for (int x = 0; x < _crun; x++)
{
- boolean isUnicode = tpt.isUnicodeAtByteOffset( getStart(x) );
- _chpxList.add(new CHPX(getStart(x) - fcMin, getEnd(x) - fcMin, getGrpprl(x), isUnicode));
+ int startAt = getStart(x);
+ int endAt = getEnd(x);
+ _chpxList.add(new CHPX(startAt, endAt, tpt, getGrpprl(x)));
}
}
public final class CHPX extends BytePropertyNode
{
- public CHPX(int fcStart, int fcEnd, byte[] grpprl, boolean isUnicode)
+ public CHPX(int fcStart, int fcEnd, CharIndexTranslator translator, byte[] grpprl)
{
- super(fcStart, fcEnd, new SprmBuffer(grpprl), isUnicode);
+ super(fcStart, fcEnd, translator, new SprmBuffer(grpprl));
}
- public CHPX(int fcStart, int fcEnd, SprmBuffer buf, boolean isUnicode)
+ public CHPX(int fcStart, int fcEnd, CharIndexTranslator translator, SprmBuffer buf)
{
- super(fcStart, fcEnd, buf, isUnicode);
+ super(fcStart, fcEnd, translator ,buf);
}
--- /dev/null
+/* ====================================================================\r
+ Licensed to the Apache Software Foundation (ASF) under one or more\r
+ contributor license agreements. See the NOTICE file distributed with\r
+ this work for additional information regarding copyright ownership.\r
+ The ASF licenses this file to You under the Apache License, Version 2.0\r
+ (the "License"); you may not use this file except in compliance with\r
+ the License. You may obtain a copy of the License at\r
+\r
+ http://www.apache.org/licenses/LICENSE-2.0\r
+\r
+ Unless required by applicable law or agreed to in writing, software\r
+ distributed under the License is distributed on an "AS IS" BASIS,\r
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\r
+ See the License for the specific language governing permissions and\r
+ limitations under the License.\r
+==================================================================== */\r
+\r
+package org.apache.poi.hwpf.model;\r
+\r
+public interface CharIndexTranslator {\r
+\r
+ /**\r
+ * Calculates the char index of the given byte index.\r
+ *\r
+ * @param bytePos The byte offset to check\r
+ * @return the char index\r
+ */\r
+ int getCharIndex(int bytePos);\r
+\r
+ /**\r
+ * Is the text at the given byte offset unicode, or plain old ascii? In a\r
+ * very evil fashion, you have to actually know this to make sense of\r
+ * character and paragraph properties :(\r
+ *\r
+ * @param bytePos The byte offset to check about\r
+ * @return true if the text at the given byte offset is unicode\r
+ */\r
+ boolean isUnicodeAtByteOffset(int bytePos);\r
+\r
+}\r
public void insert(int listIndex, int cpStart, SprmBuffer buf)
{
- boolean needsToBeUnicode = tpt.isUnicodeAtCharOffset(cpStart);
- PAPX forInsert = new PAPX(0, 0, buf, _dataStream, needsToBeUnicode);
+ PAPX forInsert = new PAPX(0, 0, tpt, buf, _dataStream);
// Ensure character offsets are really characters
forInsert.setStart(cpStart);
// Original, until insert at point
// New one
// Clone of original, on to the old end
- PAPX clone = new PAPX(0, 0, clonedBuf, _dataStream, needsToBeUnicode);
+ PAPX clone = new PAPX(0, 0, tpt, clonedBuf, _dataStream);
// Again ensure contains character based offsets no matter what
clone.setStart(cpStart);
clone.setEnd(currentPap.getEnd());
public PAPFormattedDiskPage(byte[] documentStream, byte[] dataStream, int offset, int fcMin, TextPieceTable tpt)
{
super(documentStream, offset);
-
for (int x = 0; x < _crun; x++) {
- int startAt = getStart(x) - fcMin;
- int endAt = getEnd(x) - fcMin;
- boolean isUnicode = tpt.isUnicodeAtByteOffset(startAt);
- //System.err.println(startAt + " -> " + endAt + " = " + isUnicode);
-
- _papxList.add(new PAPX(startAt, endAt, getGrpprl(x), getParagraphHeight(x), dataStream, isUnicode));
+ int startAt = getStart(x);
+ int endAt = getEnd(x);
+ _papxList.add(new PAPX(startAt, endAt, tpt, getGrpprl(x), getParagraphHeight(x), dataStream));
}
_fkp = null;
_dataStream = dataStream;
private ParagraphHeight _phe;
private int _hugeGrpprlOffset = -1;
- public PAPX(int fcStart, int fcEnd, byte[] papx, ParagraphHeight phe, byte[] dataStream, boolean isUnicode)
+ public PAPX(int fcStart, int fcEnd, CharIndexTranslator translator, byte[] papx, ParagraphHeight phe, byte[] dataStream)
{
- super(fcStart, fcEnd, new SprmBuffer(papx), isUnicode);
+ super(fcStart, fcEnd, translator, new SprmBuffer(papx));
_phe = phe;
SprmBuffer buf = findHuge(new SprmBuffer(papx), dataStream);
if(buf != null)
_buf = buf;
}
- public PAPX(int fcStart, int fcEnd, SprmBuffer buf, byte[] dataStream, boolean isUnicode)
+ public PAPX(int fcStart, int fcEnd, CharIndexTranslator translator, SprmBuffer buf, byte[] dataStream)
{
- super(fcStart, fcEnd, buf, isUnicode);
+ super(fcStart, fcEnd, translator, buf);
_phe = new ParagraphHeight();
buf = findHuge(buf, dataStream);
if(buf != null)
SectionDescriptor _sed;
- public SEPX(SectionDescriptor sed, int start, int end, byte[] grpprl, boolean isUnicode)
+ public SEPX(SectionDescriptor sed, int start, int end, CharIndexTranslator translator, byte[] grpprl)
{
- super(start, end, SectionSprmUncompressor.uncompressSEP(grpprl, 0), isUnicode);
+ super(start, end, translator, SectionSprmUncompressor.uncompressSEP(grpprl, 0));
_sed = sed;
}
int startAt = CPtoFC(node.getStart());
int endAt = CPtoFC(node.getEnd());
- boolean isUnicodeAtStart = tpt.isUnicodeAtByteOffset( startAt );
-// System.err.println(startAt + " -> " + endAt + " = " + isUnicodeAtStart);
-
// check for the optimization
if (fileOffset == 0xffffffff)
{
- _sections.add(new SEPX(sed, startAt, endAt, new byte[0], isUnicodeAtStart));
+ _sections.add(new SEPX(sed, startAt, endAt, tpt, new byte[0]));
}
else
{
byte[] buf = new byte[sepxSize];
fileOffset += LittleEndian.SHORT_SIZE;
System.arraycopy(documentStream, fileOffset, buf, 0, buf.length);
- _sections.add(new SEPX(sed, startAt, endAt, buf, isUnicodeAtStart));
+ _sections.add(new SEPX(sed, startAt, endAt, tpt, buf));
}
}
}
int FC = TP.getPieceDescriptor().getFilePosition();
int offset = CP - TP.getCP();
- FC = FC+offset-((TextPiece)_text.get(0)).getPieceDescriptor().getFilePosition();
+ if (TP.isUnicode()) {
+ offset = offset*2;
+ }
+ FC = FC+offset;
return FC;
}
- // Ryans code
- private int FCtoCP(int fc)
- {
- int size = _text.size();
- int cp = 0;
- for (int x = 0; x < size; x++)
- {
- TextPiece piece = (TextPiece)_text.get(x);
-
- if (fc <= piece.getEnd())
- {
- cp += (fc - piece.getStart());
- break;
- }
- else
- {
- cp += (piece.getEnd() - piece.getStart());
- }
- }
- return cp;
- }
-
-
public ArrayList getSections()
{
return _sections;
// Line using Ryan's FCtoCP() conversion method -
// unable to observe any effect on our testcases when using this code - piers
- GenericPropertyNode property = new GenericPropertyNode(FCtoCP(sepx.getStartBytes()), FCtoCP(sepx.getEndBytes()), sed.toByteArray());
+ GenericPropertyNode property = new GenericPropertyNode(tpt.getCharIndex(sepx.getStartBytes()), tpt.getCharIndex(sepx.getEndBytes()), sed.toByteArray());
plex.addProperty(property);
* convertion.
* @author Ryan Ackley
*/
-public final class TextPieceTable
+public final class TextPieceTable implements CharIndexTranslator
{
protected ArrayList _textPieces = new ArrayList();
//int _multiple;
// If they ask off the end, just go with the last one...
return lastWas;
}
- /**
- * Is the text at the given byte offset
- * unicode, or plain old ascii?
- * In a very evil fashion, you have to actually
- * know this to make sense of character and
- * paragraph properties :(
- * @param bytePos The character offset to check about
- */
+
public boolean isUnicodeAtByteOffset(int bytePos) {
boolean lastWas = false;
- int curByte = 0;
+
Iterator it = _textPieces.iterator();
while(it.hasNext()) {
TextPiece tp = (TextPiece)it.next();
- int nextByte = curByte + tp.bytesLength();
+ int curByte = tp.getPieceDescriptor().getFilePosition();
+ int pieceEnd = curByte + tp.bytesLength();
// If the text piece covers the character, all good
- if(curByte <= bytePos && nextByte >= bytePos) {
+ if(curByte <= bytePos && pieceEnd > bytePos) {
return tp.isUnicode();
}
// Otherwise keep track for the last one
lastWas = tp.isUnicode();
// Move along
- curByte = nextByte;
+ curByte = pieceEnd;
}
// If they ask off the end, just go with the last one...
}
return false;
}
+ /* (non-Javadoc)
+ * @see org.apache.poi.hwpf.model.CharIndexTranslator#getCharIndex(int)
+ */
+ public int getCharIndex(int bytePos) {
+ int charCount = 0;
+
+ Iterator it = _textPieces.iterator();
+ while (it.hasNext()) {
+ TextPiece tp = (TextPiece) it.next();
+ int pieceStart = tp.getPieceDescriptor().getFilePosition();
+ if(pieceStart >= bytePos) {
+ break;
+ }
+
+ int bytesLength = tp.bytesLength();
+ int pieceEnd = pieceStart + bytesLength;
+
+ int toAdd = bytePos > pieceEnd ? bytesLength : bytesLength
+ - (pieceEnd - bytePos);
+
+ if (tp.isUnicode()) {
+ charCount += toAdd / 2;
+ } else {
+ charCount += toAdd;
+ }
+ }
+
+ return charCount;
+ }
+
}
--- /dev/null
+/* ====================================================================
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+==================================================================== */
+
+package org.apache.poi.hwpf.usermodel;
+
+import junit.framework.TestCase;
+
+import java.io.FileInputStream;
+
+import org.apache.poi.hwpf.usermodel.CharacterRun;
+import org.apache.poi.hwpf.usermodel.Paragraph;
+import org.apache.poi.hwpf.usermodel.Range;
+import org.apache.poi.hwpf.HWPFDocument;
+
+public class TestBug46610 extends TestCase {
+ private String dirname;
+
+ protected void setUp() throws Exception {
+ dirname = System.getProperty("HWPF.testdata.path");
+ }
+
+ public void testUtf() throws Exception {
+ HWPFDocument doc = new HWPFDocument(new FileInputStream(dirname + "/Bug46610_1.doc"));
+
+ runExtract(doc);
+ }
+
+ public void testUtf2() throws Exception {
+ HWPFDocument doc = new HWPFDocument(new FileInputStream(dirname + "/Bug46610_2.doc"));
+
+ runExtract(doc);
+ }
+
+ public void testExtraction() throws Exception {
+ HWPFDocument doc = new HWPFDocument(new FileInputStream(dirname + "/Bug46610_3.doc"));
+
+ String text = runExtract(doc);
+
+ assertTrue(text.contains("\u0421\u0412\u041e\u042e"));
+ }
+
+ private String runExtract(HWPFDocument doc) {
+ StringBuffer out = new StringBuffer();
+
+ Range globalRange = doc.getRange();
+ for (int i = 0; i < globalRange.numParagraphs(); i++) {
+ Paragraph p = globalRange.getParagraph(i);
+ out.append(p.text());
+ out.append("\n");
+ for (int j = 0; j < p.numCharacterRuns(); j++) {
+ CharacterRun characterRun = p.getCharacterRun(j);
+ characterRun.text();
+ }
+ }
+
+ return out.toString();
+ }
+}