From: Yegor Kozlov Date: Fri, 19 Jun 2009 13:45:55 +0000 (+0000) Subject: improved HWPF to better handle unicode, patch provided by Benjamin Engele and Maxim... X-Git-Tag: REL_3_5-FINAL~99 X-Git-Url: https://source.dussan.org/?a=commitdiff_plain;h=f7c0cc0c42399f56cc18514c9f18a9d4f14f9adb;p=poi.git improved HWPF to better handle unicode, patch provided by Benjamin Engele and Maxim Valyanskiy, see Bugzilla #46610 git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@786505 13f79535-47bb-0310-9956-ffa450edef68 --- diff --git a/src/documentation/content/xdocs/status.xml b/src/documentation/content/xdocs/status.xml index 764cb7375d..bf8b7bd95b 100644 --- a/src/documentation/content/xdocs/status.xml +++ b/src/documentation/content/xdocs/status.xml @@ -33,6 +33,7 @@ + 46610 - Improved HWPF to better handle unicode 47261 - Fixed SlideShow#removeSlide to remove references to Notes 47375 - Fixed HSSFHyperlink to correctly set inter-sheet and file links 47384 - Fixed ExternalNameRecord to handle unicode names diff --git a/src/scratchpad/src/org/apache/poi/hwpf/model/BytePropertyNode.java b/src/scratchpad/src/org/apache/poi/hwpf/model/BytePropertyNode.java index d4c0b1fb7a..1753fdbd91 100644 --- a/src/scratchpad/src/org/apache/poi/hwpf/model/BytePropertyNode.java +++ b/src/scratchpad/src/org/apache/poi/hwpf/model/BytePropertyNode.java @@ -25,37 +25,28 @@ package org.apache.poi.hwpf.model; * and characters. 
*/ public abstract class BytePropertyNode extends PropertyNode { - private boolean isUnicode; + private final int startBytes; + private final int endBytes; /** * @param fcStart The start of the text for this property, in _bytes_ * @param fcEnd The end of the text for this property, in _bytes_ */ - public BytePropertyNode(int fcStart, int fcEnd, Object buf, boolean isUnicode) { + public BytePropertyNode(int fcStart, int fcEnd, CharIndexTranslator translator, Object buf) { super( - generateCp(fcStart, isUnicode), - generateCp(fcEnd, isUnicode), + translator.getCharIndex(fcStart), + translator.getCharIndex(fcEnd), buf ); - this.isUnicode = isUnicode; - } - private static int generateCp(int val, boolean isUnicode) { - if(isUnicode) - return val/2; - return val; + this.startBytes = fcStart; + this.endBytes = fcEnd; } - public boolean isUnicode() { - return isUnicode; - } public int getStartBytes() { - if(isUnicode) - return getStart()*2; - return getStart(); + return startBytes; } + public int getEndBytes() { - if(isUnicode) - return getEnd()*2; - return getEnd(); + return endBytes; } } diff --git a/src/scratchpad/src/org/apache/poi/hwpf/model/CHPBinTable.java b/src/scratchpad/src/org/apache/poi/hwpf/model/CHPBinTable.java index 1374bd67d5..d8b51036b9 100644 --- a/src/scratchpad/src/org/apache/poi/hwpf/model/CHPBinTable.java +++ b/src/scratchpad/src/org/apache/poi/hwpf/model/CHPBinTable.java @@ -119,9 +119,8 @@ public final class CHPBinTable public void insert(int listIndex, int cpStart, SprmBuffer buf) { - boolean needsToBeUnicode = tpt.isUnicodeAtCharOffset(cpStart); - CHPX insertChpx = new CHPX(0, 0, buf, needsToBeUnicode); + CHPX insertChpx = new CHPX(0, 0, tpt,buf); // Ensure character offsets are really characters insertChpx.setStart(cpStart); @@ -141,7 +140,7 @@ public final class CHPBinTable // Original, until insert at point // New one // Clone of original, on to the old end - CHPX clone = new CHPX(0, 0, chpx.getSprmBuf(), needsToBeUnicode); + CHPX clone = new 
CHPX(0, 0, tpt,chpx.getSprmBuf()); // Again ensure contains character based offsets no matter what clone.setStart(cpStart); clone.setEnd(chpx.getEnd()); diff --git a/src/scratchpad/src/org/apache/poi/hwpf/model/CHPFormattedDiskPage.java b/src/scratchpad/src/org/apache/poi/hwpf/model/CHPFormattedDiskPage.java index fa24a78b0d..9f5d724bdd 100644 --- a/src/scratchpad/src/org/apache/poi/hwpf/model/CHPFormattedDiskPage.java +++ b/src/scratchpad/src/org/apache/poi/hwpf/model/CHPFormattedDiskPage.java @@ -60,8 +60,9 @@ public final class CHPFormattedDiskPage extends FormattedDiskPage for (int x = 0; x < _crun; x++) { - boolean isUnicode = tpt.isUnicodeAtByteOffset( getStart(x) ); - _chpxList.add(new CHPX(getStart(x) - fcMin, getEnd(x) - fcMin, getGrpprl(x), isUnicode)); + int startAt = getStart(x); + int endAt = getEnd(x); + _chpxList.add(new CHPX(startAt, endAt, tpt, getGrpprl(x))); } } diff --git a/src/scratchpad/src/org/apache/poi/hwpf/model/CHPX.java b/src/scratchpad/src/org/apache/poi/hwpf/model/CHPX.java index f2dc4c3d7f..b78cdffc57 100644 --- a/src/scratchpad/src/org/apache/poi/hwpf/model/CHPX.java +++ b/src/scratchpad/src/org/apache/poi/hwpf/model/CHPX.java @@ -34,14 +34,14 @@ import org.apache.poi.hwpf.sprm.CharacterSprmUncompressor; public final class CHPX extends BytePropertyNode { - public CHPX(int fcStart, int fcEnd, byte[] grpprl, boolean isUnicode) + public CHPX(int fcStart, int fcEnd, CharIndexTranslator translator, byte[] grpprl) { - super(fcStart, fcEnd, new SprmBuffer(grpprl), isUnicode); + super(fcStart, fcEnd, translator, new SprmBuffer(grpprl)); } - public CHPX(int fcStart, int fcEnd, SprmBuffer buf, boolean isUnicode) + public CHPX(int fcStart, int fcEnd, CharIndexTranslator translator, SprmBuffer buf) { - super(fcStart, fcEnd, buf, isUnicode); + super(fcStart, fcEnd, translator ,buf); } diff --git a/src/scratchpad/src/org/apache/poi/hwpf/model/CharIndexTranslator.java b/src/scratchpad/src/org/apache/poi/hwpf/model/CharIndexTranslator.java new file 
mode 100755 index 0000000000..d2cc0ebb4d --- /dev/null +++ b/src/scratchpad/src/org/apache/poi/hwpf/model/CharIndexTranslator.java @@ -0,0 +1,40 @@ +/* ==================================================================== + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +==================================================================== */ + +package org.apache.poi.hwpf.model; + +public interface CharIndexTranslator { + + /** + * Calculates the char index of the given byte index. + * + * @param bytePos The byte offset to translate + * @return the char index + */ + int getCharIndex(int bytePos); + + /** + * Is the text at the given byte offset unicode, or plain old ascii? 
In a + * very evil fashion, you have to actually know this to make sense of + * character and paragraph properties :( + * + * @param bytePos The byte offset to check + * @return true if the text at the given byte offset is unicode + */ + boolean isUnicodeAtByteOffset(int bytePos); + +} diff --git a/src/scratchpad/src/org/apache/poi/hwpf/model/PAPBinTable.java b/src/scratchpad/src/org/apache/poi/hwpf/model/PAPBinTable.java index 66446dfaa1..1aaeec0cf2 100644 --- a/src/scratchpad/src/org/apache/poi/hwpf/model/PAPBinTable.java +++ b/src/scratchpad/src/org/apache/poi/hwpf/model/PAPBinTable.java @@ -76,9 +76,8 @@ public final class PAPBinTable public void insert(int listIndex, int cpStart, SprmBuffer buf) { - boolean needsToBeUnicode = tpt.isUnicodeAtCharOffset(cpStart); - PAPX forInsert = new PAPX(0, 0, buf, _dataStream, needsToBeUnicode); + PAPX forInsert = new PAPX(0, 0, tpt, buf, _dataStream); // Ensure character offsets are really characters forInsert.setStart(cpStart); @@ -108,7 +107,7 @@ public final class PAPBinTable // Original, until insert at point // New one // Clone of original, on to the old end - PAPX clone = new PAPX(0, 0, clonedBuf, _dataStream, needsToBeUnicode); + PAPX clone = new PAPX(0, 0, tpt, clonedBuf, _dataStream); // Again ensure contains character based offsets no matter what clone.setStart(cpStart); clone.setEnd(currentPap.getEnd()); diff --git a/src/scratchpad/src/org/apache/poi/hwpf/model/PAPFormattedDiskPage.java b/src/scratchpad/src/org/apache/poi/hwpf/model/PAPFormattedDiskPage.java index ab7f8155bf..755c37b407 100644 --- a/src/scratchpad/src/org/apache/poi/hwpf/model/PAPFormattedDiskPage.java +++ b/src/scratchpad/src/org/apache/poi/hwpf/model/PAPFormattedDiskPage.java @@ -62,14 +62,10 @@ public final class PAPFormattedDiskPage extends FormattedDiskPage public PAPFormattedDiskPage(byte[] documentStream, byte[] dataStream, int offset, int fcMin, TextPieceTable tpt) { super(documentStream, offset); - for (int x = 0; x < _crun; 
x++) { - int startAt = getStart(x) - fcMin; - int endAt = getEnd(x) - fcMin; - boolean isUnicode = tpt.isUnicodeAtByteOffset(startAt); - //System.err.println(startAt + " -> " + endAt + " = " + isUnicode); - - _papxList.add(new PAPX(startAt, endAt, getGrpprl(x), getParagraphHeight(x), dataStream, isUnicode)); + int startAt = getStart(x); + int endAt = getEnd(x); + _papxList.add(new PAPX(startAt, endAt, tpt, getGrpprl(x), getParagraphHeight(x), dataStream)); } _fkp = null; _dataStream = dataStream; diff --git a/src/scratchpad/src/org/apache/poi/hwpf/model/PAPX.java b/src/scratchpad/src/org/apache/poi/hwpf/model/PAPX.java index 73c1c8edd7..dcd2c18cf3 100644 --- a/src/scratchpad/src/org/apache/poi/hwpf/model/PAPX.java +++ b/src/scratchpad/src/org/apache/poi/hwpf/model/PAPX.java @@ -40,18 +40,18 @@ public final class PAPX extends BytePropertyNode { private ParagraphHeight _phe; private int _hugeGrpprlOffset = -1; - public PAPX(int fcStart, int fcEnd, byte[] papx, ParagraphHeight phe, byte[] dataStream, boolean isUnicode) + public PAPX(int fcStart, int fcEnd, CharIndexTranslator translator, byte[] papx, ParagraphHeight phe, byte[] dataStream) { - super(fcStart, fcEnd, new SprmBuffer(papx), isUnicode); + super(fcStart, fcEnd, translator, new SprmBuffer(papx)); _phe = phe; SprmBuffer buf = findHuge(new SprmBuffer(papx), dataStream); if(buf != null) _buf = buf; } - public PAPX(int fcStart, int fcEnd, SprmBuffer buf, byte[] dataStream, boolean isUnicode) + public PAPX(int fcStart, int fcEnd, CharIndexTranslator translator, SprmBuffer buf, byte[] dataStream) { - super(fcStart, fcEnd, buf, isUnicode); + super(fcStart, fcEnd, translator, buf); _phe = new ParagraphHeight(); buf = findHuge(buf, dataStream); if(buf != null) diff --git a/src/scratchpad/src/org/apache/poi/hwpf/model/SEPX.java b/src/scratchpad/src/org/apache/poi/hwpf/model/SEPX.java index 7095f0dcf5..77030742d3 100644 --- a/src/scratchpad/src/org/apache/poi/hwpf/model/SEPX.java +++ 
b/src/scratchpad/src/org/apache/poi/hwpf/model/SEPX.java @@ -28,9 +28,9 @@ public final class SEPX extends BytePropertyNode SectionDescriptor _sed; - public SEPX(SectionDescriptor sed, int start, int end, byte[] grpprl, boolean isUnicode) + public SEPX(SectionDescriptor sed, int start, int end, CharIndexTranslator translator, byte[] grpprl) { - super(start, end, SectionSprmUncompressor.uncompressSEP(grpprl, 0), isUnicode); + super(start, end, translator, SectionSprmUncompressor.uncompressSEP(grpprl, 0)); _sed = sed; } diff --git a/src/scratchpad/src/org/apache/poi/hwpf/model/SectionTable.java b/src/scratchpad/src/org/apache/poi/hwpf/model/SectionTable.java index 2b15808c29..3b47c1e91c 100644 --- a/src/scratchpad/src/org/apache/poi/hwpf/model/SectionTable.java +++ b/src/scratchpad/src/org/apache/poi/hwpf/model/SectionTable.java @@ -61,13 +61,10 @@ public final class SectionTable int startAt = CPtoFC(node.getStart()); int endAt = CPtoFC(node.getEnd()); - boolean isUnicodeAtStart = tpt.isUnicodeAtByteOffset( startAt ); -// System.err.println(startAt + " -> " + endAt + " = " + isUnicodeAtStart); - // check for the optimization if (fileOffset == 0xffffffff) { - _sections.add(new SEPX(sed, startAt, endAt, new byte[0], isUnicodeAtStart)); + _sections.add(new SEPX(sed, startAt, endAt, tpt, new byte[0])); } else { @@ -76,7 +73,7 @@ public final class SectionTable byte[] buf = new byte[sepxSize]; fileOffset += LittleEndian.SHORT_SIZE; System.arraycopy(documentStream, fileOffset, buf, 0, buf.length); - _sections.add(new SEPX(sed, startAt, endAt, buf, isUnicodeAtStart)); + _sections.add(new SEPX(sed, startAt, endAt, tpt, buf)); } } @@ -138,33 +135,13 @@ public final class SectionTable } int FC = TP.getPieceDescriptor().getFilePosition(); int offset = CP - TP.getCP(); - FC = FC+offset-((TextPiece)_text.get(0)).getPieceDescriptor().getFilePosition(); + if (TP.isUnicode()) { + offset = offset*2; + } + FC = FC+offset; return FC; } - // Ryans code - private int FCtoCP(int fc) - { - 
int size = _text.size(); - int cp = 0; - for (int x = 0; x < size; x++) - { - TextPiece piece = (TextPiece)_text.get(x); - - if (fc <= piece.getEnd()) - { - cp += (fc - piece.getStart()); - break; - } - else - { - cp += (piece.getEnd() - piece.getStart()); - } - } - return cp; - } - - public ArrayList getSections() { return _sections; @@ -205,7 +182,7 @@ public final class SectionTable // Line using Ryan's FCtoCP() conversion method - // unable to observe any effect on our testcases when using this code - piers - GenericPropertyNode property = new GenericPropertyNode(FCtoCP(sepx.getStartBytes()), FCtoCP(sepx.getEndBytes()), sed.toByteArray()); + GenericPropertyNode property = new GenericPropertyNode(tpt.getCharIndex(sepx.getStartBytes()), tpt.getCharIndex(sepx.getEndBytes()), sed.toByteArray()); plex.addProperty(property); diff --git a/src/scratchpad/src/org/apache/poi/hwpf/model/TextPieceTable.java b/src/scratchpad/src/org/apache/poi/hwpf/model/TextPieceTable.java index 69e1f0ba79..16dd648c7c 100644 --- a/src/scratchpad/src/org/apache/poi/hwpf/model/TextPieceTable.java +++ b/src/scratchpad/src/org/apache/poi/hwpf/model/TextPieceTable.java @@ -37,7 +37,7 @@ import java.util.List; * convertion. * @author Ryan Ackley */ -public final class TextPieceTable +public final class TextPieceTable implements CharIndexTranslator { protected ArrayList _textPieces = new ArrayList(); //int _multiple; @@ -150,31 +150,25 @@ public final class TextPieceTable // If they ask off the end, just go with the last one... return lastWas; } - /** - * Is the text at the given byte offset - * unicode, or plain old ascii? 
- * In a very evil fashion, you have to actually - * know this to make sense of character and - * paragraph properties :( - * @param bytePos The character offset to check about - */ + public boolean isUnicodeAtByteOffset(int bytePos) { boolean lastWas = false; - int curByte = 0; + Iterator it = _textPieces.iterator(); while(it.hasNext()) { TextPiece tp = (TextPiece)it.next(); - int nextByte = curByte + tp.bytesLength(); + int curByte = tp.getPieceDescriptor().getFilePosition(); + int pieceEnd = curByte + tp.bytesLength(); // If the text piece covers the character, all good - if(curByte <= bytePos && nextByte >= bytePos) { + if(curByte <= bytePos && pieceEnd > bytePos) { return tp.isUnicode(); } // Otherwise keep track for the last one lastWas = tp.isUnicode(); // Move along - curByte = nextByte; + curByte = pieceEnd; } // If they ask off the end, just go with the last one... @@ -268,4 +262,34 @@ public final class TextPieceTable } return false; } + /* (non-Javadoc) + * @see org.apache.poi.hwpf.model.CharIndexTranslator#getCharIndex(int) + */ + public int getCharIndex(int bytePos) { + int charCount = 0; + + Iterator it = _textPieces.iterator(); + while (it.hasNext()) { + TextPiece tp = (TextPiece) it.next(); + int pieceStart = tp.getPieceDescriptor().getFilePosition(); + if(pieceStart >= bytePos) { + break; + } + + int bytesLength = tp.bytesLength(); + int pieceEnd = pieceStart + bytesLength; + + int toAdd = bytePos > pieceEnd ? 
bytesLength : bytesLength + - (pieceEnd - bytePos); + + if (tp.isUnicode()) { + charCount += toAdd / 2; + } else { + charCount += toAdd; + } + } + + return charCount; + } + } diff --git a/src/scratchpad/testcases/org/apache/poi/hwpf/data/Bug46610_1.doc b/src/scratchpad/testcases/org/apache/poi/hwpf/data/Bug46610_1.doc new file mode 100755 index 0000000000..4291d9c1d6 Binary files /dev/null and b/src/scratchpad/testcases/org/apache/poi/hwpf/data/Bug46610_1.doc differ diff --git a/src/scratchpad/testcases/org/apache/poi/hwpf/data/Bug46610_2.doc b/src/scratchpad/testcases/org/apache/poi/hwpf/data/Bug46610_2.doc new file mode 100755 index 0000000000..be90831405 Binary files /dev/null and b/src/scratchpad/testcases/org/apache/poi/hwpf/data/Bug46610_2.doc differ diff --git a/src/scratchpad/testcases/org/apache/poi/hwpf/data/Bug46610_3.doc b/src/scratchpad/testcases/org/apache/poi/hwpf/data/Bug46610_3.doc new file mode 100755 index 0000000000..72d60df921 Binary files /dev/null and b/src/scratchpad/testcases/org/apache/poi/hwpf/data/Bug46610_3.doc differ diff --git a/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestBug46610.java b/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestBug46610.java new file mode 100755 index 0000000000..f750a59a6d --- /dev/null +++ b/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestBug46610.java @@ -0,0 +1,72 @@ +/* ==================================================================== + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. 
You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +==================================================================== */ + +package org.apache.poi.hwpf.usermodel; + +import junit.framework.TestCase; + +import java.io.FileInputStream; + +import org.apache.poi.hwpf.usermodel.CharacterRun; +import org.apache.poi.hwpf.usermodel.Paragraph; +import org.apache.poi.hwpf.usermodel.Range; +import org.apache.poi.hwpf.HWPFDocument; + +public class TestBug46610 extends TestCase { + private String dirname; + + protected void setUp() throws Exception { + dirname = System.getProperty("HWPF.testdata.path"); + } + + public void testUtf() throws Exception { + HWPFDocument doc = new HWPFDocument(new FileInputStream(dirname + "/Bug46610_1.doc")); + + runExtract(doc); + } + + public void testUtf2() throws Exception { + HWPFDocument doc = new HWPFDocument(new FileInputStream(dirname + "/Bug46610_2.doc")); + + runExtract(doc); + } + + public void testExtraction() throws Exception { + HWPFDocument doc = new HWPFDocument(new FileInputStream(dirname + "/Bug46610_3.doc")); + + String text = runExtract(doc); + + assertTrue(text.contains("\u0421\u0412\u041e\u042e")); + } + + private String runExtract(HWPFDocument doc) { + StringBuffer out = new StringBuffer(); + + Range globalRange = doc.getRange(); + for (int i = 0; i < globalRange.numParagraphs(); i++) { + Paragraph p = globalRange.getParagraph(i); + out.append(p.text()); + out.append("\n"); + for (int j = 0; j < p.numCharacterRuns(); j++) { + CharacterRun characterRun = p.getCharacterRun(j); + characterRun.text(); + } + } + + return out.toString(); + } +}