git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1790061 13f79535-47bb-0310-9956-ffa450edef68

7年前 · 6fe3b75bfd
--- a/src/integrationtest/org/apache/poi/TestAllFiles.java
+++ b/src/integrationtest/org/apache/poi/TestAllFiles.java
        "document/Word6_sections2.doc",
        "document/Word95.doc",
        "document/word95err.doc",
        "document/Bug60936.doc",
        "document/Bug60942.doc",
        "document/Bug60942b.doc",
        "hpsf/TestMickey.doc",
        "document/52117.doc"
    );
--- a/src/java/org/apache/poi/util/CodePageUtil.java
+++ b/src/java/org/apache/poi/util/CodePageUtil.java
 package org.apache.poi.util;
 import java.io.UnsupportedEncodingException;
 import java.nio.charset.Charset;
 import java.util.HashSet;
 import java.util.Set;
 /**
 * Utilities for working with Microsoft CodePages.
 */
 public class CodePageUtil
 {
    /**
     * Charsets that callers treat as multi-byte when converting between
     * byte counts and character counts.
     * The static initializer below registers only {@link StringUtil#BIG5}.
     * NOTE(review): this is a publicly visible, mutable {@code HashSet};
     * confirm whether callers are expected to add further charsets to it.
     */
    public static final Set<Charset> VARIABLE_BYTE_CHARSETS = new HashSet<Charset>();
    static {
        // TODO: decide whether other multi-byte charsets belong in this set
        VARIABLE_BYTE_CHARSETS.add(StringUtil.BIG5);
    }
    /** <p>Codepage 037, a special case</p> */
    public static final int CP_037 = 37;
--- a/src/java/org/apache/poi/util/LittleEndianBig5Stream.java
+++ b/src/java/org/apache/poi/util/LittleEndianBig5Stream.java
 /* ====================================================================
   Licensed to the Apache Software Foundation (ASF) under one or more
   contributor license agreements.  See the NOTICE file distributed with
   this work for additional information regarding copyright ownership.
   The ASF licenses this file to You under the Apache License, Version 2.0
   (the "License"); you may not use this file except in compliance with
   the License.  You may obtain a copy of the License at
       http://www.apache.org/licenses/LICENSE-2.0
   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
 ==================================================================== */
 package org.apache.poi.util;
 import java.io.ByteArrayInputStream;
 /**
 * Stream that converts MSOffice's way of storing Big5, with
 * zero-byte padding for ASCII and in LittleEndianOrder.
 */
@Internal
 public class LittleEndianBig5Stream extends ByteArrayInputStream {
    /*
     * The wrapped buffer is consumed two bytes at a time. The first byte of a
     * pair is the logically trailing (low) byte, the second is the logically
     * leading (high) byte:
     *   - high byte > 0x80 : a double-byte character, emitted as high, low
     *   - anything else    : only the low byte is emitted (the high byte is
     *                        the zero padding of an ASCII char, or is dropped)
     *   - buffer ends after the low byte : the low byte is emitted on its own
     *
     * Only read() and read(byte[], int, int) apply this conversion. Every other
     * method is inherited from ByteArrayInputStream and works on the raw,
     * unconverted bytes.
     */

    private static final int EOF = -1;
    private static final int EMPTY_TRAILING = -3;

    //the char that is logically trailing in Big5 encoding
    //however in LittleEndian order, this is the first encountered.
    //EMPTY_TRAILING means that no byte is waiting to be handed out.
    int trailing = EMPTY_TRAILING;

    /**
     * @param buf little-endian, zero-padded Big5 bytes to convert
     */
    public LittleEndianBig5Stream(byte[] buf) {
        super(buf);
    }

    /**
     * @param buf    little-endian, zero-padded Big5 bytes to convert
     * @param offset index of the first byte to read
     * @param length maximum number of raw bytes to read
     */
    public LittleEndianBig5Stream(byte[] buf, int offset, int length) {
        super(buf, offset, length);
    }

    /**
     * Returns the next converted byte.
     *
     * @return the next byte (0-255) in big-endian Big5 order, or -1 once the
     *         underlying buffer is exhausted; after the first -1 every further
     *         call returns -1 as well
     */
    @Override
    public int read() {
        if (trailing != EMPTY_TRAILING) {
            // second half of a double-byte character that was split by readNext()
            int pending = trailing;
            trailing = EMPTY_TRAILING;
            return pending;
        }
        return readNext();
    }

    //consumes the next raw pair and returns the first byte to emit;
    //stores the low byte in 'trailing' when a second byte has to follow.
    //returns -1 if it hits the end of the stream
    private int readNext() {
        int low = super.read();
        if (low == EOF) {
            return EOF;
        }
        int high = super.read();
        if (high > 0x80) {
            trailing = low;
            return high;
        }
        // zero padding, a non-lead high byte, or a truncated final pair:
        // hand out the low byte by itself and leave nothing pending, so that
        // no data can show up after EOF has been reported
        return low;
    }

    /**
     * Fills {@code buff} with converted bytes.
     *
     * @param buff destination buffer
     * @param off  index in {@code buff} of the first byte to write
     * @param len  maximum number of bytes to write
     * @return the number of bytes written, 0 if {@code len} is 0, or -1 if
     *         {@code len} is positive and no byte was available
     * @throws NullPointerException      if {@code buff} is null
     * @throws IndexOutOfBoundsException if {@code off} or {@code len} is negative,
     *                                   or the range does not fit into {@code buff}
     */
    @Override
    public int read(byte[] buff, int off, int len) {
        if (buff == null) {
            throw new NullPointerException("buff");
        }
        if (off < 0 || len < 0 || len > buff.length - off) {
            throw new IndexOutOfBoundsException(
                    "off=" + off + " len=" + len + " buff.length=" + buff.length);
        }
        int bytesRead = 0;
        while (bytesRead < len) {
            int b = read();
            if (b == EOF) {
                break;
            }
            buff[off + bytesRead] = (byte) b;
            bytesRead++;
        }
        return (bytesRead == 0 && len > 0) ? EOF : bytesRead;
    }
 }
--- a/src/java/org/apache/poi/util/StringUtil.java
+++ b/src/java/org/apache/poi/util/StringUtil.java
 package org.apache.poi.util;
 import java.io.ByteArrayOutputStream;
 import java.io.IOException;
 import java.nio.charset.Charset;
 import java.util.HashMap;
 import java.util.Iterator;
 */
@Internal
 public class StringUtil {
    private static final POILogger logger = POILogFactory
            .getLogger(StringUtil.class);
    protected static final Charset ISO_8859_1 = Charset.forName("ISO-8859-1");
    protected static final Charset UTF16LE = Charset.forName("UTF-16LE");
    public static final Charset UTF16LE = Charset.forName("UTF-16LE");
    public static final Charset UTF8 = Charset.forName("UTF-8");
    public static final Charset WIN_1252 = Charset.forName("cp1252");
    public static final Charset BIG5 = Charset.forName("Big5");
    private static Map<Integer,Integer> msCodepointToUnicode;
       9133, // 0xf0fe bracerightbt
       ' ', // 0xf0ff not defined
   };
    /**
     * Decodes a slice of a byte array that holds Big5 text in MS Office's
     * little-endian, zero-padded layout.
     * The slice is piped through a {@link LittleEndianBig5Stream} and the
     * resulting bytes are decoded with the {@link #BIG5} charset.
     * If the copy reports an {@code IOException}, a warning is logged and
     * whatever was copied up to that point is decoded.
     *
     * @param data          array containing the encoded text
     * @param offset        index of the first byte of the slice
     * @param lengthInBytes number of raw bytes in the slice
     * @return the decoded text
     */
   public static String littleEndianBig5Stream(byte[] data, int offset, int lengthInBytes) {
       final ByteArrayOutputStream big5Bytes = new ByteArrayOutputStream();
       final LittleEndianBig5Stream reordered =
               new LittleEndianBig5Stream(data, offset, lengthInBytes);
       try {
           IOUtils.copy(reordered, big5Bytes);
       } catch (IOException e) {
           logger.log(POILogger.WARN,
                   "IOException while copying a byte array stream to a byte array stream?!");
       }
       final byte[] converted = big5Bytes.toByteArray();
       return new String(converted, BIG5);
   }
   // Could be replaced with org.apache.commons.lang3.StringUtils#join
   @Internal
   public static String join(Object[] array, String separator) {
--- a/src/scratchpad/src/org/apache/poi/hwmf/record/HwmfFont.java
+++ b/src/scratchpad/src/org/apache/poi/hwmf/record/HwmfFont.java
            return charset;
        }
        static WmfCharset valueOf(int flag) {
        public static WmfCharset valueOf(int flag) {
            for (WmfCharset cs : values()) {
                if (cs.flag == flag) return cs;
            }
--- a/src/scratchpad/src/org/apache/poi/hwpf/HWPFOldDocument.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/HWPFOldDocument.java
 import java.io.File;
 import java.io.IOException;
 import java.io.OutputStream;
 import java.nio.charset.Charset;
 import org.apache.poi.hwmf.record.HwmfFont;
 import org.apache.poi.hwpf.model.ComplexFileTable;
 import org.apache.poi.hwpf.model.FontTable;
 import org.apache.poi.hwpf.model.OldCHPBinTable;
 import org.apache.poi.hwpf.model.OldComplexFileTable;
 import org.apache.poi.hwpf.model.OldFfn;
 import org.apache.poi.hwpf.model.OldFontTable;
 import org.apache.poi.hwpf.model.OldPAPBinTable;
 import org.apache.poi.hwpf.model.OldSectionTable;
 import org.apache.poi.hwpf.model.OldTextPieceTable;
 import org.apache.poi.hwpf.model.PieceDescriptor;
 import org.apache.poi.hwpf.model.TextPiece;
 import org.apache.poi.hwpf.model.TextPieceTable;
 import org.apache.poi.hwpf.usermodel.Range;
 import org.apache.poi.poifs.filesystem.DirectoryNode;
 import org.apache.poi.poifs.filesystem.POIFSFileSystem;
 import org.apache.poi.util.CodePageUtil;
 import org.apache.poi.util.LittleEndian;
 import org.apache.poi.util.NotImplemented;
 import org.apache.poi.util.StringUtil;
 /**
 * Provides very simple support for old (Word 6 / Word 95)
 *  files.
 */
 public class HWPFOldDocument extends HWPFDocumentCore {
    private TextPieceTable tpt;
    private final static Charset DEFAULT_CHARSET = StringUtil.WIN_1252;
    private OldTextPieceTable tpt;
    private StringBuilder _text;
    private final OldFontTable fontTable;
    private final Charset guessedCharset;
    public HWPFOldDocument(POIFSFileSystem fs) throws IOException {
        this(fs.getRoot());
        int chpTableSize   = LittleEndian.getInt(_mainStream, 0xbc);
        int papTableOffset = LittleEndian.getInt(_mainStream, 0xc0);
        int papTableSize   = LittleEndian.getInt(_mainStream, 0xc4);
        //int shfTableOffset = LittleEndian.getInt(_mainStream, 0x60);
        //int shfTableSize   = LittleEndian.getInt(_mainStream, 0x64);
        int fontTableOffset = LittleEndian.getInt(_mainStream, 0xd0);
        int fontTableSize = LittleEndian.getInt(_mainStream, 0xd4);
        fontTable = new OldFontTable(_mainStream, fontTableOffset, fontTableSize);
        //TODO: figure out how to map runs/text pieces to fonts
        //for now, if there's a non standard codepage in one of the fonts
        //assume that the doc is in that codepage.
        guessedCharset = guessCodePage(fontTable);
        int complexTableOffset = LittleEndian.getInt(_mainStream, 0x160);
        // We need to get hold of the text that makes up the
        //  document, which might be regular or fast-saved
        ComplexFileTable cft = null;
        StringBuffer text = new StringBuffer();
        if(_fib.getFibBase().isFComplex()) {
            cft = new ComplexFileTable(
            cft = new OldComplexFileTable(
                    _mainStream, _mainStream,
                    complexTableOffset, _fib.getFibBase().getFcMin()
                    complexTableOffset, _fib.getFibBase().getFcMin(), guessedCharset
            );
            tpt = cft.getTextPieceTable();
            tpt = (OldTextPieceTable)cft.getTextPieceTable();
            for(TextPiece tp : tpt.getTextPieces()) {
                text.append( tp.getStringBuilder() );
            }
        } else {
            // TODO Discover if these older documents can ever hold Unicode Strings?
            //  (We think not, because they seem to lack a Piece table)
            // TODO Build the Piece Descriptor properly
            //  (We have to fake it, as they don't seem to have a proper Piece table)
            PieceDescriptor pd = new PieceDescriptor(new byte[] {0,0, 0,0,0,127, 0,0}, 0);
            PieceDescriptor pd = new PieceDescriptor(new byte[] {0,0, 0,0,0,127, 0,0}, 0, guessedCharset);
            pd.setFilePosition(_fib.getFibBase().getFcMin());
            // Generate a single Text Piece Table, with a single Text Piece
            //  which covers all the (8 bit only) text in the file
            tpt = new TextPieceTable();
            tpt = new OldTextPieceTable();
            byte[] textData = new byte[_fib.getFibBase().getFcMac()-_fib.getFibBase().getFcMin()];
            System.arraycopy(_mainStream, _fib.getFibBase().getFcMin(), textData, 0, textData.length);
            int numChars = textData.length;
            if (CodePageUtil.VARIABLE_BYTE_CHARSETS.contains(guessedCharset)) {
                numChars /= 2;
            }
            TextPiece tp = new TextPiece(
                    0, textData.length, textData, pd
                    0, numChars, textData, pd
            );
            tpt.add(tp);
            text.append(tp.getStringBuilder());
        }
        _text = tpt.getText();
        // Now we can fetch the character and paragraph properties
        }
    }
    /**
     * Take the first codepage that is not default, ansi or symbol.
     * Ideally, we'd want to track fonts with runs, but we don't yet
     * know how to do that.
     *
     * Consider throwing an exception if > 1 unique codepage that is not default, symbol or ansi
     * appears here.
     *
     * Fonts whose charset identifier is unknown, or whose charset has no
     * usable Java {@link Charset}, are skipped so that this method never
     * returns null.
     *
     * @param fontTable font table whose entries are inspected in order
     * @return the charset of the first qualifying font, or
     *         {@link #DEFAULT_CHARSET} if no font qualifies
     */
    private Charset guessCodePage(OldFontTable fontTable) {
        for (OldFfn oldFfn : fontTable.getFontNames()) {
            HwmfFont.WmfCharset wmfCharset = HwmfFont.WmfCharset.valueOf(oldFfn.getChs() & 0xff);
            if (wmfCharset == null ||
                    wmfCharset == HwmfFont.WmfCharset.ANSI_CHARSET ||
                    wmfCharset == HwmfFont.WmfCharset.DEFAULT_CHARSET ||
                    wmfCharset == HwmfFont.WmfCharset.SYMBOL_CHARSET) {
                continue;
            }
            // getCharset() may yield null (OldFfn.build guards against the same case);
            // keep looking instead of handing a null charset to the text decoding
            Charset charset = wmfCharset.getCharset();
            if (charset != null) {
                return charset;
            }
        }
        return DEFAULT_CHARSET;
    }
    /**
     * Builds a range that starts at 0 and ends at the difference between
     * the FIB base's fcMac and fcMin values.
     *
     * @return a new range over this document
     */
    public Range getOverallRange()
    {
        // No footers, headers or unicode to account for here,
        // so the end is simply fcMac minus fcMin.
        final int fcMac = _fib.getFibBase().getFcMac();
        final int fcMin = _fib.getFibBase().getFcMin();
        final int rangeEnd = fcMac - fcMin;
        return new Range( 0, rangeEnd, this );
    }
    /**
     * Not supported for old documents; use {@link #getOldFontTable()} instead.
     * This always throws an UnsupportedOperationException.
     *
     * @return nothing, the method never returns normally
     * @throws UnsupportedOperationException on every call
     */
    @Override
    @NotImplemented
    public FontTable getFontTable() {
        throw new UnsupportedOperationException("Use getOldFontTable instead.");
    }
    /**
     * Returns the font table that the constructor built from the main stream.
     *
     * @return the old-format font table of this document
     */
    public OldFontTable getOldFontTable() {
        return fontTable;
    }
    public Range getRange()
    {
        return getOverallRange();
    public void write(OutputStream out) throws IOException {
        // Nothing is ever written to 'out': this implementation fails unconditionally.
        throw new IllegalStateException("Writing is not available for the older file formats");
    }
    /**
     * Returns the charset that was guessed when this document was constructed.
     *
     * As a rough heuristic (total hack), the guess reads through the font table
     * and takes the charset of the first non-default, non-ansi, non-symbol
     * font; if there is no such font, the default charset (Windows-1252) is used.
     *
     * Once we figure out how to link a font to a text piece, we should
     * use the font information per text piece.
     *
     * @return the guessed charset
     */
    public Charset getGuessedCharset() {
        return guessedCharset;
    }
 }
--- a/src/scratchpad/src/org/apache/poi/hwpf/model/ComplexFileTable.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/model/ComplexFileTable.java
 package org.apache.poi.hwpf.model;
 import java.io.IOException;
 import java.nio.charset.Charset;
 import java.util.LinkedList;
 import java.util.List;
 import org.apache.poi.hwpf.sprm.SprmBuffer;
 import org.apache.poi.util.Internal;
 import org.apache.poi.util.LittleEndian;
 import org.apache.poi.util.StringUtil;
@Internal
 public final class ComplexFileTable {
 public class ComplexFileTable {
    private static final byte GRPPRL_TYPE = 1;
    private static final byte TEXT_PIECE_TABLE_TYPE = 2;
        _tpt = new TextPieceTable();
    }
    public ComplexFileTable(byte[] documentStream, byte[] tableStream, int offset, int fcMin) throws IOException {
    protected ComplexFileTable(byte[] documentStream, byte[] tableStream, int offset, int fcMin,
                               Charset charset) throws IOException {
        //skips through the prms before we reach the piece table. These contain data
        //for actual fast saved files
        List<SprmBuffer> sprmBuffers = new LinkedList<SprmBuffer>();
        }
        int pieceTableSize = LittleEndian.getInt(tableStream, ++offset);
        offset += LittleEndian.INT_SIZE;
        _tpt = new TextPieceTable(documentStream, tableStream, offset, pieceTableSize, fcMin);
        _tpt = newTextPieceTable(documentStream, tableStream, offset, pieceTableSize, fcMin, charset);
    }
    /**
     * Convenience constructor that delegates to the charset-aware constructor,
     * passing {@link StringUtil#WIN_1252} as the charset.
     *
     * @param documentStream bytes of the document stream
     * @param tableStream    bytes of the table stream
     * @param offset         offset in {@code tableStream} at which parsing starts
     * @param fcMin          forwarded unchanged to the text piece table
     * @throws IOException declared by the delegated constructor
     */
    public ComplexFileTable(byte[] documentStream, byte[] tableStream, int offset, int fcMin) throws IOException {
        this(documentStream, tableStream, offset, fcMin, StringUtil.WIN_1252);
    }
    public TextPieceTable getTextPieceTable() {
        tableStream.write(table);
    }
    /**
     * Factory hook for the text piece table held by this complex file table.
     * This base implementation creates a plain {@link TextPieceTable} and
     * does not use {@code charset}; subclasses may override it to create a
     * charset-aware table.
     * NOTE(review): this appears to be invoked from the constructor, so an
     * override should not rely on fields of the subclass being initialized.
     *
     * @param documentStream bytes of the document stream
     * @param tableStream    bytes of the table stream
     * @param offset         offset of the piece table in {@code tableStream}
     * @param pieceTableSize size of the piece table in bytes
     * @param fcMin          forwarded unchanged to the text piece table
     * @param charset        ignored by this implementation
     * @return the newly created text piece table
     */
    protected TextPieceTable newTextPieceTable(byte[] documentStream,
                                               byte[] tableStream, int offset, int pieceTableSize, int fcMin,
                                               Charset charset) {
        return new TextPieceTable(documentStream, tableStream, offset, pieceTableSize, fcMin);
    }
 }
--- a/src/scratchpad/src/org/apache/poi/hwpf/model/OldCHPBinTable.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/model/OldCHPBinTable.java
   * @param fcMin
   */
  public OldCHPBinTable(byte[] documentStream, int offset,
                     int size, int fcMin, TextPieceTable tpt)
                     int size, int fcMin, OldTextPieceTable tpt)
  {
    PlexOfCps binTable = new PlexOfCps(documentStream, offset, size, 2);
--- a/src/scratchpad/src/org/apache/poi/hwpf/model/OldComplexFileTable.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/model/OldComplexFileTable.java
 /* ====================================================================
   Licensed to the Apache Software Foundation (ASF) under one or more
   contributor license agreements.  See the NOTICE file distributed with
   this work for additional information regarding copyright ownership.
   The ASF licenses this file to You under the Apache License, Version 2.0
   (the "License"); you may not use this file except in compliance with
   the License.  You may obtain a copy of the License at
       http://www.apache.org/licenses/LICENSE-2.0
   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
 ==================================================================== */
 package org.apache.poi.hwpf.model;
 import java.io.IOException;
 import java.nio.charset.Charset;
 import org.apache.poi.util.Internal;
/**
 * {@link ComplexFileTable} variant that carries a charset through to its
 * text piece table: the table it creates is an {@link OldTextPieceTable}
 * built with the charset given to the constructor.
 */
@Internal
 public final class OldComplexFileTable extends ComplexFileTable {
    /**
     * Passes every argument, including the charset, to the superclass constructor.
     *
     * @param documentStream bytes of the document stream
     * @param tableStream    bytes of the table stream
     * @param offset         offset in {@code tableStream} at which parsing starts
     * @param fcMin          forwarded unchanged to the text piece table
     * @param charset        charset handed to the text piece table
     * @throws IOException declared by the superclass constructor
     */
    public OldComplexFileTable(byte[] documentStream, byte[] tableStream,
                               int offset, int fcMin, Charset charset) throws IOException {
        super(documentStream, tableStream, offset, fcMin, charset);
    }
    /**
     * Creates an {@link OldTextPieceTable} so that {@code charset} is taken
     * into account, unlike the superclass implementation.
     */
    @Override
    protected TextPieceTable newTextPieceTable(byte[] documentStream,
                                               byte[] tableStream, int offset,
                                               int pieceTableSize, int fcMin, Charset charset) {
        return new OldTextPieceTable(documentStream, tableStream, offset, pieceTableSize, fcMin, charset);
    }
 }
--- a/src/scratchpad/src/org/apache/poi/hwpf/model/OldFfn.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/model/OldFfn.java
 /* ====================================================================
   Licensed to the Apache Software Foundation (ASF) under one or more
   contributor license agreements.  See the NOTICE file distributed with
   this work for additional information regarding copyright ownership.
   The ASF licenses this file to You under the Apache License, Version 2.0
   (the "License"); you may not use this file except in compliance with
   the License.  You may obtain a copy of the License at
       http://www.apache.org/licenses/LICENSE-2.0
   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
 ==================================================================== */
 package org.apache.poi.hwpf.model;
 import java.nio.charset.Charset;
 import org.apache.poi.hwmf.record.HwmfFont;
 import org.apache.poi.util.Internal;
 import org.apache.poi.util.LittleEndian;
 import org.apache.poi.util.POILogFactory;
 import org.apache.poi.util.POILogger;
 import org.apache.poi.util.StringUtil;
 /**
 * Word 6.0 Font information
 * <p>
 * One entry of the old font table: the raw charset identifier byte, the main
 * font name, an optional alternate font name and the number of bytes that the
 * entry was computed to occupy.
 */
@Internal
 public final class OldFfn {
    private static final POILogger logger = POILogFactory.getLogger(OldFfn.class);
    private final byte _chs;// character set identifier
    private final String fontName;
    private final String altFontName;
    private final int length; //length in bytes for this record
    /**
     * try to read an OldFfn starting at offset; read no farther than end
     * <p>
     * Font names are decoded with the charset that belongs to the record's
     * charset identifier; Windows-1252 is used when the identifier is unknown
     * or has no usable charset.
     *
     * @param buf          buffer from which to read
     * @param offset       offset at which to start
     * @param fontTableEnd read no farther than this
     * @return an OldFfn, or null if asked to read beyond end or if no
     *         zero-terminated font name could be found
     */
    static OldFfn build(byte[] buf, int offset, int fontTableEnd) {
        int start = offset;
        //a corrupt table may claim to extend past the buffer; honour both limits
        int end = Math.min(fontTableEnd, buf.length);
        //preliminary bytes
        if (offset < 0 || offset + 6 > end) {
            return null;
        }
        //first byte: the length is an unsigned byte, so mask it rather than
        //letting values above 127 turn negative through sign extension
        int fontDescriptionLength = buf[offset] & 0xff;
        offset += 1;
        if (offset + fontDescriptionLength > end) {
            logger.log(POILogger.WARN, "Asked to read beyond font table end. Skipping font");
            return null;
        }
        //no idea what these 3 bytes do
        offset += 3;
        byte chs = buf[offset];
        Charset charset = null;
        HwmfFont.WmfCharset wmfCharset = HwmfFont.WmfCharset.valueOf(chs & 0xff);
        if (wmfCharset == null) {
            logger.log(POILogger.WARN, "Couldn't find font for type: " + (chs & 0xff));
        } else {
            charset = wmfCharset.getCharset();
        }
        charset = charset == null ? StringUtil.WIN_1252 : charset;
        offset += LittleEndian.BYTE_SIZE;
        //if this byte here == 7, it _may_ signify existence of
        //an alternate font name
        //not sure what the byte after the _chs does
        offset += LittleEndian.BYTE_SIZE;
        //the main font name runs up to the next zero byte
        int fontNameLength = -1;
        for (int i = offset; i < end; i++) {
            if (buf[i] == 0) {
                fontNameLength = i - offset;
                break;
            }
        }
        if (fontNameLength == -1) {
            logger.log(POILogger.WARN, "Couldn't find the zero-byte delimited font name length");
            return null;
        }
        String fontName = new String(buf, offset, fontNameLength, charset);
        String altFontName = null;
        int altFontNameLength = -1;
        offset += fontNameLength + 1;
        //bytes left inside the declared record length are taken as an alternate name;
        //the upper bound stays below 'end' because of the length check above
        if (offset - start < fontDescriptionLength) {
            for (int i = offset; i <= start + fontDescriptionLength; i++) {
                if (buf[i] == 0) {
                    altFontNameLength = i - offset;
                    break;
                }
            }
            if (altFontNameLength > -1) {
                altFontName = new String(buf, offset, altFontNameLength, charset);
            }
        }
        //reset to 0 for length calculation
        altFontNameLength = (altFontNameLength < 0) ? 0 : altFontNameLength + 1;//add one for zero byte
        int len = LittleEndian.INT_SIZE + LittleEndian.BYTE_SIZE + LittleEndian.BYTE_SIZE +//6 starting bytes
                fontNameLength + altFontNameLength + 1;//+1 is for the zero byte
        //this len should == fontDescriptionLength
        return new OldFfn(chs, fontName, altFontName, len);
    }
    /**
     * @param charsetIdentifier raw charset identifier byte of the record
     * @param fontName          main font name
     * @param altFontName       alternate font name, may be null
     * @param length            length in bytes of the record
     */
    public OldFfn(byte charsetIdentifier, String fontName, String altFontName, int length) {
        this._chs = charsetIdentifier;
        this.fontName = fontName;
        this.altFontName = altFontName;
        this.length = length;
    }
    /**
     * @return the raw (signed) charset identifier byte; mask with 0xff for the unsigned value
     */
    public byte getChs() {
        return _chs;
    }
    /**
     * @return the main font name
     */
    public String getMainFontName() {
        return fontName;
    }
    /**
     * @return altFontName if it exists, null otherwise
     */
    public String getAltFontName() {
        return altFontName;
    }
    /**
     * @return length in bytes for this record
     */
    public int getLength() {
        return length;
    }
    @Override
    public String toString() {
        return "OldFfn{" +
                "_chs=" + (_chs & 0xff) +
                ", fontName='" + fontName + '\'' +
                ", altFontName='" + altFontName + '\'' +
                ", length=" + length +
                '}';
    }
 }
--- a/src/scratchpad/src/org/apache/poi/hwpf/model/OldFontTable.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/model/OldFontTable.java
 /* ====================================================================
     Licensed to the Apache Software Foundation (ASF) under one or more
     contributor license agreements.    See the NOTICE file distributed with
     this work for additional information regarding copyright ownership.
     The ASF licenses this file to You under the Apache License, Version 2.0
     (the "License"); you may not use this file except in compliance with
     the License.    You may obtain a copy of the License at
             http://www.apache.org/licenses/LICENSE-2.0
     Unless required by applicable law or agreed to in writing, software
     distributed under the License is distributed on an "AS IS" BASIS,
     WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     See the License for the specific language governing permissions and
     limitations under the License.
 ==================================================================== */
 package org.apache.poi.hwpf.model;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.List;
 import org.apache.poi.util.Internal;
 import org.apache.poi.util.LittleEndian;
 import org.apache.poi.util.POILogFactory;
 import org.apache.poi.util.POILogger;
 /**
 * Font table for Word 6.0
 * <p>
 * Holds the {@link OldFfn} entries that could be read, in the order in which
 * they appear in the table.
 */
@Internal
 public final class OldFontTable {
    private final static POILogger _logger = POILogFactory.getLogger(OldFontTable.class);
    // added extra facilitator members
    // FFN structure containing strings of font names
    private final OldFfn[] _fontNames;
    /**
     * Reads font entries one after another until {@link OldFfn#build} cannot
     * produce a further entry.
     *
     * @param buf    buffer that contains the font table
     * @param offset offset of the font table in {@code buf}
     * @param length length of the font table in bytes
     */
    public OldFontTable(byte[] buf, int offset, int length) {
        //the table starts with a short that should equal 'length';
        //it is skipped, the length passed in by the caller is what bounds the reading
        List<OldFfn> ffns = new ArrayList<OldFfn>();
        int endOfTableOffset = offset + length;
        int startOffset = offset + LittleEndian.SHORT_SIZE;
        while (true) {
            OldFfn oldFfn = OldFfn.build(buf, startOffset, endOfTableOffset);
            if (oldFfn == null) {
                break;
            }
            ffns.add(oldFfn);
            startOffset += oldFfn.getLength();
        }
        _fontNames = ffns.toArray(new OldFfn[ffns.size()]);
    }
    /**
     * @return the font entries; this is the internal array, not a copy
     */
    public OldFfn[] getFontNames() {
        return _fontNames;
    }
    /**
     * Looks up the main font name for a font index.
     *
     * @param chpFtc index into the font table
     * @return the main font name, or null if the index is negative or not
     *         smaller than the number of fonts
     */
    public String getMainFont(int chpFtc) {
        //a negative index is as much a mismatch as one that is too large
        if (chpFtc < 0 || chpFtc >= _fontNames.length) {
            _logger.log(POILogger.INFO, "Mismatch in chpFtc with stringCount");
            return null;
        }
        return _fontNames[chpFtc].getMainFontName();
    }
    @Override
    public String toString() {
        return "OldFontTable{" +
                "_fontNames=" + Arrays.toString(_fontNames) +
                '}';
    }
 }
--- a/src/scratchpad/src/org/apache/poi/hwpf/model/OldTextPiece.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/model/OldTextPiece.java
 /* ====================================================================
   Licensed to the Apache Software Foundation (ASF) under one or more
   contributor license agreements.  See the NOTICE file distributed with
   this work for additional information regarding copyright ownership.
   The ASF licenses this file to You under the Apache License, Version 2.0
   (the "License"); you may not use this file except in compliance with
   the License.  You may obtain a copy of the License at
       http://www.apache.org/licenses/LICENSE-2.0
   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
 ==================================================================== */
 package org.apache.poi.hwpf.model;
 import org.apache.poi.util.Internal;
 import org.apache.poi.util.NotImplemented;
 /**
 * Text piece variant that additionally keeps hold of the byte array it was
 * created from.
 * Start and end positions are expressed in characters, not bytes, so byte
 * references have to be converted to character references by the caller.
 * Several inherited operations are deliberately unsupported and throw
 * {@link UnsupportedOperationException}.
 */
@Internal
 public class OldTextPiece extends TextPiece {
    private final byte[] rawBytes;

    /**
     * @param start Beginning offset in main document stream, in characters.
     * @param end   Ending offset in main document stream, in characters.
     * @param text  The raw bytes of our text
     * @param pd    piece descriptor handed on to the superclass
     * @throws IllegalStateException if {@code end} lies before {@code start}
     */
    public OldTextPiece(int start, int end, byte[] text, PieceDescriptor pd) {
        super(start, end, text, pd);
        this.rawBytes = text;
        if (start > end) {
            throw new IllegalStateException("Told we're of negative size! start=" + start + " end=" + end);
        }
    }

    /**
     * Unsupported for this kind of text piece.
     *
     * @return never returns normally
     * @throws UnsupportedOperationException on every call
     */
    @NotImplemented
    @Override
    public boolean isUnicode() {
        throw new UnsupportedOperationException();
    }

    /**
     * @return the inherited text buffer, viewed as a StringBuilder
     */
    public StringBuilder getStringBuilder() {
        return (StringBuilder) _buf;
    }

    /**
     * @return a fresh copy of the bytes this piece was created from
     */
    @Override
    public byte[] getRawBytes() {
        return rawBytes.clone();
    }

    /**
     * Unsupported for this kind of text piece.
     *
     * @param start Local start position, in characters
     * @param end   Local end position, in characters
     * @throws UnsupportedOperationException on every call
     */
    @Deprecated
    @NotImplemented
    public String substring(int start, int end) {
        throw new UnsupportedOperationException();
    }

    /**
     * Unsupported for this kind of text piece.
     *
     * @throws UnsupportedOperationException on every call
     */
    @Deprecated
    @NotImplemented
    public void adjustForDelete(int start, int length) {
        throw new UnsupportedOperationException();
    }

    /**
     * @return the number of raw bytes this piece was created from
     */
    public int bytesLength() {
        return rawBytes.length;
    }

    /**
     * Not meant to be used: trips an assertion when assertions are enabled
     * and otherwise yields the same constant for every instance.
     */
    @Override
    public int hashCode() {
        assert false : "hashCode not designed";
        return 42; // any arbitrary constant will do
    }

    /**
     * @return the character position this piece starts at
     */
    public int getCP() {
        return getStart();
    }

    public String toString() {
        final StringBuilder description = new StringBuilder("OldTextPiece from ");
        description.append(getStart());
        description.append(" to ");
        description.append(getEnd());
        description.append(" (");
        description.append(getPieceDescriptor());
        description.append(")");
        return description.toString();
    }
 }
--- a/src/scratchpad/src/org/apache/poi/hwpf/model/OldTextPieceTable.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/model/OldTextPieceTable.java
 /* ====================================================================
   Licensed to the Apache Software Foundation (ASF) under one or more
   contributor license agreements.  See the NOTICE file distributed with
   this work for additional information regarding copyright ownership.
   The ASF licenses this file to You under the Apache License, Version 2.0
   (the "License"); you may not use this file except in compliance with
   the License.  You may obtain a copy of the License at
       http://www.apache.org/licenses/LICENSE-2.0
   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
 ==================================================================== */
 package org.apache.poi.hwpf.model;
 import java.nio.charset.Charset;
 import java.util.ArrayList;
 import java.util.Collections;
 import org.apache.poi.util.CodePageUtil;
 import org.apache.poi.util.Internal;
 import org.apache.poi.util.POILogFactory;
 import org.apache.poi.util.POILogger;
@Internal
 public class OldTextPieceTable extends TextPieceTable {
    private static final POILogger logger = POILogFactory
            .getLogger(OldTextPieceTable.class);

    /**
     * Creates a table through the parent's no-argument constructor only.
     */
    public OldTextPieceTable() {
        super();
    }

    /**
     * Builds the table from the piece descriptors stored in the table stream
     * and the text bytes stored in the document stream.
     * <p>
     * The parent's parsing constructor is not invoked; only the implicit
     * no-argument parent constructor runs before this body.
     *
     * @param documentStream bytes the text of each piece is copied from
     * @param tableStream    bytes holding the plex of piece descriptors
     * @param offset         offset of the plex within <code>tableStream</code>
     * @param size           size of the plex
     * @param fcMin          value subtracted from each piece's file position
     *                       when working out the smallest starting position
     * @param charset        charset handed to every {@link PieceDescriptor};
     *                       may be <code>null</code>
     */
    public OldTextPieceTable(byte[] documentStream, byte[] tableStream,
                             int offset, int size, int fcMin, Charset charset) {
        // Read the plex of piece descriptors out of the table stream.
        final PlexOfCps plex = new PlexOfCps(tableStream, offset, size,
                PieceDescriptor.getSizeInBytes());
        final int count = plex.length();

        // Turn the raw bytes of every plex entry into a PieceDescriptor.
        final PieceDescriptor[] descriptors = new PieceDescriptor[count];
        for (int i = 0; i < count; i++) {
            descriptors[i] = new PieceDescriptor(plex.getProperty(i).getBytes(), 0, charset);
        }

        // Pieces are not necessarily stored in order, so scan all of them
        // for the smallest (file position - fcMin).
        int earliest = descriptors[0].getFilePosition() - fcMin;
        for (PieceDescriptor descriptor : descriptors) {
            earliest = Math.min(earliest, descriptor.getFilePosition() - fcMin);
        }
        _cpMin = earliest;

        // Build one text piece per descriptor.
        for (int i = 0; i < descriptors.length; i++) {
            final PieceDescriptor descriptor = descriptors[i];
            final int fileOffset = descriptor.getFilePosition();
            final GenericPropertyNode node = plex.getProperty(i);

            // Start and end of the node are expressed in characters.
            final int cpStart = node.getStart();
            final int cpEnd = node.getEnd();

            // Two bytes per character for unicode pieces and for charsets
            // listed in CodePageUtil.VARIABLE_BYTE_CHARSETS, otherwise one.
            final int bytesPerChar =
                    (descriptor.isUnicode() || usesTwoBytesPerChar(charset)) ? 2 : 1;
            final int byteCount = (cpEnd - cpStart) * bytesPerChar;

            // Copy this piece's bytes out of the document stream.
            final byte[] raw = new byte[byteCount];
            System.arraycopy(documentStream, fileOffset, raw, 0, byteCount);

            _textPieces.add(newTextPiece(cpStart, cpEnd, raw, descriptor));
        }

        // Keep one list in natural order and a second copy ordered by the
        // FCComparator.
        Collections.sort(_textPieces);
        _textPiecesFCOrder = new ArrayList<TextPiece>(_textPieces);
        Collections.sort(_textPiecesFCOrder, new FCComparator());
    }

    /**
     * Creates the piece instances stored in this table.
     *
     * @return a new {@link OldTextPiece} built from the given arguments
     */
    @Override
    protected TextPiece newTextPiece(int nodeStartChars, int nodeEndChars, byte[] buf, PieceDescriptor pd) {
        return new OldTextPiece(nodeStartChars, nodeEndChars, buf, pd);
    }

    /**
     * Returns the bytes-per-character multiplier for a piece, based on the
     * charset of its piece descriptor.
     *
     * @return 2 if that charset is non-null and listed in
     *         {@link CodePageUtil#VARIABLE_BYTE_CHARSETS}, otherwise 1
     */
    @Override
    protected int getEncodingMultiplier(TextPiece textPiece) {
        return usesTwoBytesPerChar(textPiece.getPieceDescriptor().getCharset()) ? 2 : 1;
    }

    /**
     * Tells whether the charset is non-null and listed in
     * {@link CodePageUtil#VARIABLE_BYTE_CHARSETS}.
     */
    private static boolean usesTwoBytesPerChar(Charset candidate) {
        return candidate != null && CodePageUtil.VARIABLE_BYTE_CHARSETS.contains(candidate);
    }
 }
--- a/src/scratchpad/src/org/apache/poi/hwpf/model/PAPBinTable.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/model/PAPBinTable.java
            SprmBuffer sprmBuffer = null;
            for ( PAPX papx : papxs )
            {
                if ( papx.getGrpprl() == null || papx.getGrpprl().length == 0 )
                if ( papx.getGrpprl() == null || papx.getGrpprl().length <= 2 )
                    continue;
                if ( sprmBuffer == null ) {
--- a/src/scratchpad/src/org/apache/poi/hwpf/model/PieceDescriptor.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/model/PieceDescriptor.java
 package org.apache.poi.hwpf.model;
 import java.nio.charset.Charset;
 import org.apache.poi.util.BitField;
 import org.apache.poi.util.BitFieldFactory;
 import org.apache.poi.util.Internal;
 import org.apache.poi.util.LittleEndian;
 import org.apache.poi.util.StringUtil;
@Internal
 public final class PieceDescriptor
   private static BitField fCopied = BitFieldFactory.getInstance(0x04);
  int fc;
  PropertyModifier prm;
  boolean unicode;
  boolean unicode = false;
  private final Charset charset;
  public PieceDescriptor(byte[] buf, int offset)
  {
    descriptor = LittleEndian.getShort(buf, offset);
    offset += LittleEndian.SHORT_SIZE;
    fc = LittleEndian.getInt(buf, offset);
    offset += LittleEndian.INT_SIZE;
    prm = new PropertyModifier( LittleEndian.getShort(buf, offset));
    // see if this piece uses unicode.
    if ((fc & 0x40000000) == 0)
    {
        unicode = true;
    }
    else
    {
        unicode = false;
        fc &= ~(0x40000000);//gives me FC in doc stream
        fc /= 2;
    /**
     * Builds a descriptor without a caller-supplied charset by delegating to
     * {@link #PieceDescriptor(byte[], int, Charset)} with a <code>null</code>
     * charset.
     *
     * @param buf    bytes holding the descriptor
     * @param offset position in <code>buf</code> where the descriptor starts
     */
    public PieceDescriptor(byte[] buf, int offset) {
        this(buf, offset, null);
    }
    /**
     * Parses a piece descriptor from its raw bytes: a short descriptor
     * value, an int file position (fc) and a short property modifier.
     * <p>
     * Passing a non-null charset should only be done for HWPFOldDocuments.
     * With a <code>null</code> charset, the second most significant bit of
     * the fc decides between Unicode text and codepage-1252 text.
     *
     * @param buf     bytes holding the descriptor
     * @param offset  position in <code>buf</code> where the descriptor starts
     * @param charset which charset to use if this is not unicode; when
     *                <code>null</code> the encoding is derived from the fc
     */
  public PieceDescriptor(byte[] buf, int offset, Charset charset) {
      descriptor = LittleEndian.getShort(buf, offset);
      offset += LittleEndian.SHORT_SIZE;
      fc = LittleEndian.getInt(buf, offset);
      offset += LittleEndian.INT_SIZE;
      prm = new PropertyModifier(LittleEndian.getShort(buf, offset));
      if (charset == null) {
          // See if this piece uses unicode.
          // From the documentation: if the second most significant bit is
          // clear, the fc is the actual file offset of the Unicode character
          // (two bytes). If it is set, the codepage-1252 compressed version
          // of the character (one byte) is at the offset obtained by clearing
          // this bit and dividing by two.
          if ((fc & 0x40000000) == 0) {
              unicode = true;
              this.charset = null;
          } else {
              unicode = false;
              fc &= ~(0x40000000); // gives me FC in doc stream
              fc /= 2;
              this.charset = StringUtil.WIN_1252;
          }
      } else {
          // Compare by value: an equal Charset is not guaranteed to be the
          // same instance as the StringUtil constant.
          if (StringUtil.UTF16LE.equals(charset)) {
              unicode = true;
          }
          this.charset = charset;
      }
  }
  public int getFilePosition()
    return unicode;
  }
    /**
     * Returns the charset stored in this descriptor.
     *
     * @return charset to use if this is not a Unicode PieceDescriptor;
     * this can be <code>null</code>
     */
  public Charset getCharset() {
    return charset;
  }
    public PropertyModifier getPrm()
    {
        return prm;
--- a/src/scratchpad/src/org/apache/poi/hwpf/model/TextPiece.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/model/TextPiece.java
 import java.nio.charset.Charset;
 import org.apache.poi.util.Internal;
 import org.apache.poi.util.StringUtil;
 /**
 * Lightweight representation of a text piece.
     * @param start Beginning offset in main document stream, in characters.
     * @param end   Ending offset in main document stream, in characters.
     * @param text  The raw bytes of our text
     * @deprecated Use {@link #TextPiece(int, int, byte[], PieceDescriptor)}
     * instead
     */
    public TextPiece(int start, int end, byte[] text, PieceDescriptor pd,
     * Create the StringBuilder from the text and unicode flag
     */
    private static StringBuilder buildInitSB(byte[] text, PieceDescriptor pd) {
        // Big5 pieces are decoded by the dedicated little-endian Big5 helper
        // rather than by a plain charset decode.
        if (StringUtil.BIG5.equals(pd.getCharset())) {
            return new StringBuilder(StringUtil.littleEndianBig5Stream(text, 0, text.length));
        }
        // Unicode pieces are UTF-16LE; everything else uses the charset
        // carried by the piece descriptor.
        String str = new String(text, 0, text.length,
                pd.isUnicode() ? StringUtil.UTF16LE : pd.getCharset());
        return new StringBuilder(str);
    }
        return "TextPiece from " + getStart() + " to " + getEnd() + " ("
                + getPieceDescriptor() + ")";
    }
 }
--- a/src/scratchpad/src/org/apache/poi/hwpf/model/TextPieceTable.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/model/TextPieceTable.java
            System.arraycopy(documentStream, start, buf, 0, textSizeBytes);
            // And now build the piece
            final TextPiece newTextPiece = new TextPiece(nodeStartChars, nodeEndChars, buf,
            final TextPiece newTextPiece = newTextPiece(nodeStartChars, nodeEndChars, buf,
                    pieces[x]);
            _textPieces.add(newTextPiece);
        Collections.sort(_textPiecesFCOrder, new FCComparator());
    }
    /**
     * Creates the piece object for one entry of the table. Subclasses may
     * override this to supply their own {@link TextPiece} type.
     *
     * @param nodeStartChars start of the piece, in characters
     * @param nodeEndChars   end of the piece, in characters
     * @param buf            raw bytes of the piece's text
     * @param pd             descriptor the piece was built from
     * @return a new {@link TextPiece} built from the given arguments
     */
    protected TextPiece newTextPiece(int nodeStartChars, int nodeEndChars, byte[] buf, PieceDescriptor pd) {
        return new TextPiece(nodeStartChars, nodeEndChars, buf, pd);
    }
    public void add(TextPiece piece) {
        _textPieces.add(piece);
        _textPiecesFCOrder.add(piece);
            if (rangeStartBytes > rangeEndBytes)
                continue;
            final int encodingMultiplier = textPiece.isUnicode() ? 2 : 1;
            final int encodingMultiplier = getEncodingMultiplier(textPiece);
            final int rangeStartCp = textPiece.getStart()
                    + (rangeStartBytes - tpStart) / encodingMultiplier;
        return result.toArray(new int[result.size()][]);
    }
    /**
     * Returns the multiplier used to convert between character and byte
     * offsets for the given piece.
     *
     * @param textPiece piece to inspect
     * @return 2 when the piece reports itself as unicode, otherwise 1
     */
    protected int getEncodingMultiplier(TextPiece textPiece) {
        if (textPiece.isUnicode()) {
            return 2;
        }
        return 1;
    }
    /**
     * Returns the stored <code>_cpMin</code> value.
     *
     * @return the current value of <code>_cpMin</code>
     */
    public int getCpMin() {
        return _cpMin;
    }
        return textPlex.toByteArray();
    }
    private static class FCComparator implements Comparator<TextPiece>, Serializable {
    protected static class FCComparator implements Comparator<TextPiece>, Serializable {
        public int compare(TextPiece textPiece, TextPiece textPiece1) {
            if (textPiece.getPieceDescriptor().fc > textPiece1
                    .getPieceDescriptor().fc) {
--- a/src/scratchpad/src/org/apache/poi/hwpf/usermodel/CharacterRun.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/usermodel/CharacterRun.java
 package org.apache.poi.hwpf.usermodel;
 import org.apache.poi.hwpf.HWPFDocument;
 import org.apache.poi.hwpf.HWPFOldDocument;
 import org.apache.poi.hwpf.model.CHPX;
 import org.apache.poi.hwpf.model.FFData;
 import org.apache.poi.hwpf.model.Ffn;
  public String getFontName()
  {
    if (_doc instanceof HWPFOldDocument) {
      return ((HWPFOldDocument) _doc).getOldFontTable().getMainFont(_props.getFtcAscii());
    }
    if (_doc.getFontTable() == null)
      // old word format
      return null;
--- a/src/scratchpad/testcases/org/apache/poi/hwpf/converter/TestWordToConverterSuite.java
+++ b/src/scratchpad/testcases/org/apache/poi/hwpf/converter/TestWordToConverterSuite.java
 ==================================================================== */
 package org.apache.poi.hwpf.converter;
 import java.io.File;
 import java.io.FilenameFilter;
 import java.io.StringWriter;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.List;
 import static org.junit.Assert.assertNotNull;
 import javax.xml.transform.OutputKeys;
 import javax.xml.transform.Transformer;
 import javax.xml.transform.TransformerFactory;
 import javax.xml.transform.dom.DOMSource;
 import javax.xml.transform.stream.StreamResult;
 import java.io.File;
 import java.io.FilenameFilter;
 import java.io.StringWriter;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.List;
 import org.apache.poi.POIDataSamples;
 import org.apache.poi.hwpf.HWPFDocumentCore;
 import org.junit.runner.RunWith;
 import org.junit.runners.Parameterized;
 import static org.junit.Assert.assertNotNull;
@RunWith(Parameterized.class)
 public class TestWordToConverterSuite
 {
     * YK: a quick hack to exclude failing documents from the suite.
     */
    private static List<String> failingFiles = Arrays
            .asList( "ProblemExtracting.doc" );
            .asList( "ProblemExtracting.doc",
                    "Bug50955.doc" //basic extraction works,
                                    // but these extractors modify the document,
                                    // which is a no-go for this Word 6.0 file
            );
    @Parameterized.Parameters(name="{index}: {0}")
    public static Iterable<Object[]> files() {
--- a/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestBugs.java
+++ b/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestBugs.java
 *  against HWPF
 */
 public class TestBugs{
    private static final POILogger logger = POILogFactory.getLogger(TestBugs.class);
    public static void assertEqualsIgnoreNewline(String expected, String actual )
        hwpfDocument.getPicturesTable().getAllPictures();
    }
    /**
     * [FAILING] Bug 50955 - error while retrieving the text file.
     * <p>
     * The test passes only when reading "Bug50955.doc" through
     * <code>getTextOldFile</code> ends with an IllegalStateException.
     */
    @Test(expected=IllegalStateException.class)
    public void test50955() throws IOException {
        getTextOldFile("Bug50955.doc");
    }
    /**
     * [RESOLVED FIXED] Bug 51604 - replace text fails for doc (poi 3.8 beta
--- a/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestHWPFOldDocument.java
+++ b/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestHWPFOldDocument.java
 package org.apache.poi.hwpf.usermodel;
 import static org.apache.poi.POITestCase.assertContains;
 import static org.junit.Assert.assertEquals;
 import java.io.IOException;
 import java.nio.charset.Charset;
 import org.apache.poi.OldFileFormatException;
 import org.apache.poi.hwmf.record.HwmfFont;
 import org.apache.poi.hwpf.HWPFOldDocument;
 import org.apache.poi.hwpf.HWPFTestCase;
 import org.apache.poi.hwpf.HWPFTestDataSamples;
 import org.apache.poi.hwpf.extractor.Word6Extractor;
 import org.apache.poi.hwpf.model.OldFontTable;
 import org.junit.Test;
 /**
        assertEquals(1, doc.getRange().getParagraph(5).numCharacterRuns());
        // Normal, superscript for 4th, normal
        assertEquals(3, doc.getRange().getParagraph(6).numCharacterRuns());
        doc.close();
    }
                doc.getRange().getParagraph(1).text());
        doc.close();
    }
    /**
     * Extracts the text of "Bug60942.doc" with {@link Word6Extractor} and
     * checks that it contains a handful of expected fragments.
     */
    @Test
    public void testDefaultCodePageEncoding() throws IOException {
        HWPFOldDocument doc = HWPFTestDataSamples.openOldSampleFile("Bug60942.doc");
        try {
            Word6Extractor ex = new Word6Extractor(doc);
            String txt = ex.getText();
            assertContains(txt, "BERTHOD");
            assertContains(txt, "APPLICOLOR");
            assertContains(txt, "les meilleurs");
            assertContains(txt, "GUY LECOLE");
        } finally {
            // Release the document even when an assertion fails.
            doc.close();
        }
    }
    /**
     * Bug 50955: concatenates the paragraph text of "Bug50955.doc" and checks
     * that the expected Cyrillic word is present.
     */
    @Test
    public void testCodePageBug50955() throws IOException {
        //windows 1251
        HWPFOldDocument doc = HWPFTestDataSamples.openOldSampleFile("Bug50955.doc");
        try {
            Word6Extractor ex = new Word6Extractor(doc);
            StringBuilder sb = new StringBuilder();
            for (String p : ex.getParagraphText()) {
                sb.append(p);
            }
            assertContains(sb.toString(), "\u043F\u0440\u0438\u0432\u0435\u0442");//Greetings!
        } finally {
            // Release the document even when an assertion fails.
            doc.close();
        }
    }
    /**
     * Bug 60936: concatenates the paragraph text of "Bug60936.doc" and checks
     * that a phrase containing non-ASCII letters is extracted intact.
     */
    @Test
    public void testCodePageBug60936() throws IOException {
        //windows 1250 -- this test file was generated with OpenOffice
        //see https://bz.apache.org/ooo/show_bug.cgi?id=12445 for the inspiration
        HWPFOldDocument doc = HWPFTestDataSamples.openOldSampleFile("Bug60936.doc");
        try {
            Word6Extractor ex = new Word6Extractor(doc);
            StringBuilder sb = new StringBuilder();
            for (String p : ex.getParagraphText()) {
                sb.append(p);
            }
            //phrase with accented, non-ASCII letters
            assertContains(sb.toString(), "4 sk\u00f3re a p\u0159ed 7 lety");
        } finally {
            // Release the document even when an assertion fails.
            doc.close();
        }
    }
    /**
     * Checks the font names read from the old font table of "Bug51944.doc":
     * five entries, the first one a non-ASCII name, the second one
     * "Times New Roman".
     */
    @Test
    public void testOldFontTableEncoding() throws IOException {
        HWPFOldDocument doc = HWPFTestDataSamples.openOldSampleFile("Bug51944.doc");
        try {
            OldFontTable oldFontTable = doc.getOldFontTable();
            assertEquals(5, oldFontTable.getFontNames().length);
            assertEquals("\u7D30\u660E\u9AD4", oldFontTable.getFontNames()[0].getMainFontName());
            // expected value first, so a failure message reads the right way round
            assertEquals(Charset.forName("Big5"), HwmfFont.WmfCharset.CHINESEBIG5_CHARSET.getCharset());
            assertEquals("Times New Roman", oldFontTable.getFontNames()[1].getMainFontName());
        } finally {
            // Release the document even when an assertion fails.
            doc.close();
        }
    }
    /**
     * Checks that entries 3 and 4 of the old font table of "Bug60942b.doc"
     * expose both a main font name and an alternative font name.
     */
    @Test
    public void testOldFontTableAltName() throws IOException {
        HWPFOldDocument doc  = HWPFTestDataSamples.openOldSampleFile("Bug60942b.doc");
        try {
            OldFontTable oldFontTable = doc.getOldFontTable();
            assertEquals(5, oldFontTable.getFontNames().length);
            assertEquals("Roboto", oldFontTable.getFontNames()[3].getMainFontName());
            assertEquals("arial", oldFontTable.getFontNames()[3].getAltFontName());
            assertEquals("Roboto", oldFontTable.getFontNames()[4].getMainFontName());
            assertEquals("arial", oldFontTable.getFontNames()[4].getAltFontName());
        } finally {
            // Release the document even when an assertion fails.
            doc.close();
        }
    }
    /**
     * Bug 51944: concatenates the paragraph text of "Bug51944.doc" (with runs
     * of line breaks collapsed to a single newline) and checks that several
     * expected phrases are present.
     */
    @Test
    public void test51944() throws IOException {
        HWPFOldDocument doc = HWPFTestDataSamples.openOldSampleFile("Bug51944.doc");
        try {
            Word6Extractor ex = new Word6Extractor(doc);
            StringBuilder sb = new StringBuilder();
            for (String p : ex.getParagraphText()) {
                sb.append(p.replaceAll("[\r\n]+", "\n"));
            }
            String txt = sb.toString();
            assertContains(txt, "Post and Fax");
            assertContains(txt, "also maintain");//this is at a critical juncture
            assertContains(txt, "which are available for");//this too
            //TODO: figure out why these two aren't passing
            //        assertContains(txt, "\u2019\u0078 block2");//make sure smart quote is extracted correctly
            //        assertContains(txt, "We are able to");//not sure if we can get this easily?
        } finally {
            // Release the document even when an assertion fails.
            doc.close();
        }
    }
 }
--- a/test-data/document/Bug60936.doc
+++ b/test-data/document/Bug60936.doc
--- a/test-data/document/Bug60942.doc
+++ b/test-data/document/Bug60942.doc
--- a/test-data/document/Bug60942b.doc
+++ b/test-data/document/Bug60942b.doc