123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282 |
- /* ====================================================================
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
- ==================================================================== */
- package org.apache.poi.hwpf;
-
- import java.io.File;
- import java.io.IOException;
- import java.io.OutputStream;
- import java.io.UnsupportedEncodingException;
- import java.nio.charset.Charset;
-
- import org.apache.poi.hpsf.CustomProperties;
- import org.apache.poi.hpsf.DocumentSummaryInformation;
- import org.apache.poi.hpsf.Section;
- import org.apache.poi.hwmf.record.HwmfFont;
- import org.apache.poi.hwpf.model.ComplexFileTable;
- import org.apache.poi.hwpf.model.FontTable;
- import org.apache.poi.hwpf.model.OldCHPBinTable;
- import org.apache.poi.hwpf.model.OldComplexFileTable;
- import org.apache.poi.hwpf.model.OldFfn;
- import org.apache.poi.hwpf.model.OldFontTable;
- import org.apache.poi.hwpf.model.OldPAPBinTable;
- import org.apache.poi.hwpf.model.OldSectionTable;
- import org.apache.poi.hwpf.model.OldTextPieceTable;
- import org.apache.poi.hwpf.model.PieceDescriptor;
- import org.apache.poi.hwpf.model.TextPiece;
- import org.apache.poi.hwpf.model.TextPieceTable;
- import org.apache.poi.hwpf.usermodel.Range;
- import org.apache.poi.poifs.filesystem.DirectoryNode;
- import org.apache.poi.poifs.filesystem.POIFSFileSystem;
- import org.apache.poi.util.CodePageUtil;
- import org.apache.poi.util.LittleEndian;
- import org.apache.poi.util.NotImplemented;
- import org.apache.poi.util.POILogFactory;
- import org.apache.poi.util.POILogger;
- import org.apache.poi.util.StringUtil;
-
- /**
- * Provides very simple support for old (Word 6 / Word 95)
- * files.
- */
- public class HWPFOldDocument extends HWPFDocumentCore {
-
- private static final POILogger logger = POILogFactory
- .getLogger( HWPFOldDocument.class );
-
- private final static Charset DEFAULT_CHARSET = StringUtil.WIN_1252;
-
- private OldTextPieceTable tpt;
-
- private StringBuilder _text;
-
- private final OldFontTable fontTable;
- private final Charset guessedCharset;
-
- public HWPFOldDocument(POIFSFileSystem fs) throws IOException {
- this(fs.getRoot());
- }
-
- public HWPFOldDocument(DirectoryNode directory)
- throws IOException {
- super(directory);
-
- // Where are things?
- int sedTableOffset = LittleEndian.getInt(_mainStream, 0x88);
- int sedTableSize = LittleEndian.getInt(_mainStream, 0x8c);
- int chpTableOffset = LittleEndian.getInt(_mainStream, 0xb8);
- int chpTableSize = LittleEndian.getInt(_mainStream, 0xbc);
- int papTableOffset = LittleEndian.getInt(_mainStream, 0xc0);
- int papTableSize = LittleEndian.getInt(_mainStream, 0xc4);
- int fontTableOffset = LittleEndian.getInt(_mainStream, 0xd0);
- int fontTableSize = LittleEndian.getInt(_mainStream, 0xd4);
-
- fontTable = new OldFontTable(_mainStream, fontTableOffset, fontTableSize);
- //TODO: figure out how to map runs/text pieces to fonts
- //for now, if there's a non standard codepage in one of the fonts
- //assume that the doc is in that codepage.
- guessedCharset = guessCodePage(fontTable);
-
- int complexTableOffset = LittleEndian.getInt(_mainStream, 0x160);
-
- // We need to get hold of the text that makes up the
- // document, which might be regular or fast-saved
- ComplexFileTable cft = null;
- if(_fib.getFibBase().isFComplex()) {
- cft = new OldComplexFileTable(
- _mainStream, _mainStream,
- complexTableOffset, _fib.getFibBase().getFcMin(), guessedCharset
- );
- tpt = (OldTextPieceTable)cft.getTextPieceTable();
-
- } else {
- // TODO Discover if these older documents can ever hold Unicode Strings?
- // (We think not, because they seem to lack a Piece table)
- //
- // What we have here is a wretched hack. We need to figure out
- // how to get the correct charset for the doc.
- TextPiece tp = null;
- try {
- tp = buildTextPiece(guessedCharset);
- } catch (IllegalStateException e) {
- //if there was a problem with the guessed charset and the length of the
- //textpiece, back off to win1252. This is effectively what we used to do.
- tp = buildTextPiece(StringUtil.WIN_1252);
- logger.log(POILogger.WARN, "Error with "+guessedCharset +". Backing off to Windows-1252");
- }
- tpt.add(tp);
-
- }
- _text = tpt.getText();
-
- // Now we can fetch the character and paragraph properties
- _cbt = new OldCHPBinTable(
- _mainStream, chpTableOffset, chpTableSize,
- _fib.getFibBase().getFcMin(), tpt
- );
- _pbt = new OldPAPBinTable(
- _mainStream, papTableOffset, papTableSize,
- _fib.getFibBase().getFcMin(), tpt
- );
- _st = new OldSectionTable(
- _mainStream, sedTableOffset, sedTableSize,
- _fib.getFibBase().getFcMin(), tpt
- );
-
- /*
- * in this mode we preserving PAPX/CHPX structure from file, so text may
- * miss from output, and text order may be corrupted
- */
- boolean preserveBinTables = false;
- try
- {
- preserveBinTables = Boolean.parseBoolean( System
- .getProperty( HWPFDocument.PROPERTY_PRESERVE_BIN_TABLES ) );
- }
- catch ( Exception exc )
- {
- // ignore;
- }
-
- if ( !preserveBinTables )
- {
- _cbt.rebuild( cft );
- _pbt.rebuild( _text, cft );
- }
- }
-
- /**
- *
- * @param guessedCharset charset that we think this is
- * @return a new text piece
- * @throws IllegalStateException if the length isn't correct
- */
- private TextPiece buildTextPiece(Charset guessedCharset) throws IllegalStateException {
- PieceDescriptor pd = new PieceDescriptor(new byte[] {0,0, 0,0,0,127, 0,0}, 0, guessedCharset);
- pd.setFilePosition(_fib.getFibBase().getFcMin());
-
- // Generate a single Text Piece Table, with a single Text Piece
- // which covers all the (8 bit only) text in the file
- tpt = new OldTextPieceTable();
- byte[] textData = new byte[_fib.getFibBase().getFcMac()-_fib.getFibBase().getFcMin()];
- System.arraycopy(_mainStream, _fib.getFibBase().getFcMin(), textData, 0, textData.length);
-
- int numChars = textData.length;
- if (CodePageUtil.DOUBLE_BYTE_CHARSETS.contains(guessedCharset)) {
- numChars /= 2;
- }
-
- return new TextPiece(
- 0, numChars, textData, pd
- );
-
- }
-
-
- /**
- * Try to get the code page from various areas of the document.
- * Start with the DocumentSummaryInformation, back off to the section info,
- * finally try the charset information from the font table.
- *
- * Consider throwing an exception if > 1 unique codepage that is not default, symbol or ansi
- * appears here.
- *
- * @param fontTable
- * @return The detected Charset from the old font table
- */
- private Charset guessCodePage(OldFontTable fontTable) {
- // pick the first non-default, non-symbol charset
- for (OldFfn oldFfn : fontTable.getFontNames()) {
- HwmfFont.WmfCharset wmfCharset = HwmfFont.WmfCharset.valueOf(oldFfn.getChs()& 0xff);
- if (wmfCharset != null &&
- wmfCharset != HwmfFont.WmfCharset.ANSI_CHARSET &&
- wmfCharset != HwmfFont.WmfCharset.DEFAULT_CHARSET &&
- wmfCharset != HwmfFont.WmfCharset.SYMBOL_CHARSET ) {
- return wmfCharset.getCharset();
- }
- }
- logger.log(POILogger.WARN, "Couldn't find a defined charset; backing off to cp1252");
- //if all else fails
- return DEFAULT_CHARSET;
- }
-
- public Range getOverallRange()
- {
- // Life is easy when we have no footers, headers or unicode!
- return new Range( 0, _fib.getFibBase().getFcMac() - _fib.getFibBase().getFcMin(), this );
- }
-
- /**
- * Use {@link #getOldFontTable()} instead!!!
- * This always throws an IllegalArgumentException.
- *
- * @return nothing
- * @throws UnsupportedOperationException
- */
- @Override
- @NotImplemented
- public FontTable getFontTable() {
- throw new UnsupportedOperationException("Use getOldFontTable instead.");
- }
-
- public OldFontTable getOldFontTable() {
- return fontTable;
- }
- public Range getRange()
- {
- return getOverallRange();
- }
-
- public TextPieceTable getTextTable()
- {
- return tpt;
- }
-
- @Override
- public StringBuilder getText()
- {
- return _text;
- }
-
- @Override
- public void write() throws IOException {
- throw new IllegalStateException("Writing is not available for the older file formats");
- }
- @Override
- public void write(File out) throws IOException {
- throw new IllegalStateException("Writing is not available for the older file formats");
- }
- @Override
- public void write(OutputStream out) throws IOException {
- throw new IllegalStateException("Writing is not available for the older file formats");
- }
-
- /**
- * As a rough heuristic (total hack), read through the HPSF,
- * then read through the font table, and take the first
- * non-default, non-ansi, non-symbol
- * font's charset and return that.
- *
- * Once we figure out how to link a font to a text piece, we should
- * use the font information per text piece.
- *
- * @return charset
- */
- public Charset getGuessedCharset() {
- return guessedCharset;
- }
-
- }
|