diff options
-rw-r--r-- | poi-scratchpad/src/main/java/org/apache/poi/hwpf/model/NilPICFAndBinData.java | 30 | ||||
-rw-r--r-- | poi-scratchpad/src/test/java/org/apache/poi/hwpf/converter/TestWordToTextConverter.java | 53 |
2 files changed, 68 insertions, 15 deletions
diff --git a/poi-scratchpad/src/main/java/org/apache/poi/hwpf/model/NilPICFAndBinData.java b/poi-scratchpad/src/main/java/org/apache/poi/hwpf/model/NilPICFAndBinData.java index 3377c8cd9c..c20d39ff66 100644 --- a/poi-scratchpad/src/main/java/org/apache/poi/hwpf/model/NilPICFAndBinData.java +++ b/poi-scratchpad/src/main/java/org/apache/poi/hwpf/model/NilPICFAndBinData.java @@ -20,56 +20,56 @@ import java.util.Arrays; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; +import org.apache.poi.util.IOUtils; import org.apache.poi.util.LittleEndian; import org.apache.poi.util.LittleEndianConsts; import static java.lang.Integer.toHexString; import static org.apache.logging.log4j.util.Unbox.box; -public class NilPICFAndBinData -{ - +public class NilPICFAndBinData { private static final Logger LOGGER = LogManager.getLogger(NilPICFAndBinData.class); + // limit the default maximum length of the allocated fields + private static final int MAX_SIZE = 100_000; + private byte[] _binData; - public NilPICFAndBinData( byte[] data, int offset ) - { + public NilPICFAndBinData( byte[] data, int offset ) { fillFields( data, offset ); } - public void fillFields( byte[] data, int offset ) - { + public void fillFields( byte[] data, int offset ) { int lcb = LittleEndian.getInt( data, offset ); int cbHeader = LittleEndian.getUShort( data, offset + LittleEndianConsts.INT_SIZE ); - if ( cbHeader != 0x44 ) - { + if ( cbHeader != 0x44 ) { LOGGER.atWarn().log("NilPICFAndBinData at offset {} cbHeader 0x{} != 0x44", box(offset), toHexString(cbHeader)); } + // make sure these do not cause OOM if passed as invalid or extremely large values + IOUtils.safelyAllocateCheck(lcb, MAX_SIZE); + IOUtils.safelyAllocateCheck(cbHeader, MAX_SIZE); + // skip the 62 ignored bytes int binaryLength = lcb - cbHeader; this._binData = Arrays.copyOfRange(data, offset + cbHeader, offset + cbHeader + binaryLength); } - public byte[] getBinData() - { + public byte[] getBinData() { return _binData; } - public byte[] serialize() - { + public byte[] serialize() { byte[] bs = new byte[_binData.length + 0x44]; LittleEndian.putInt( bs, 0, _binData.length + 0x44 ); System.arraycopy( _binData, 0, bs, 0x44, _binData.length ); return bs; } - public int serialize( byte[] data, int offset ) - { + public int serialize( byte[] data, int offset ) { LittleEndian.putInt( data, offset, _binData.length + 0x44 ); System.arraycopy( _binData, 0, data, offset + 0x44, _binData.length ); return 0x44 + _binData.length; diff --git a/poi-scratchpad/src/test/java/org/apache/poi/hwpf/converter/TestWordToTextConverter.java b/poi-scratchpad/src/test/java/org/apache/poi/hwpf/converter/TestWordToTextConverter.java index 9ffea802cf..64d89486d0 100644 --- a/poi-scratchpad/src/test/java/org/apache/poi/hwpf/converter/TestWordToTextConverter.java +++ b/poi-scratchpad/src/test/java/org/apache/poi/hwpf/converter/TestWordToTextConverter.java @@ -20,10 +20,29 @@ import static org.apache.poi.hwpf.HWPFTestDataSamples.openSampleFile; import static org.junit.jupiter.api.Assertions.assertNotNull; import static org.junit.jupiter.api.Assertions.assertTrue; +import java.io.File; +import java.io.FileInputStream; +import java.io.FilenameFilter; +import java.io.InputStream; +import java.util.Arrays; +import java.util.stream.Stream; + +import org.apache.commons.io.filefilter.SuffixFileFilter; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; +import org.apache.poi.EncryptedDocumentException; +import org.apache.poi.POIDataSamples; import org.apache.poi.hwpf.HWPFDocument; +import org.apache.poi.hwpf.OldWordFileFormatException; +import org.apache.poi.poifs.filesystem.FileMagic; +import org.apache.poi.util.RecordFormatException; import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.Arguments; +import org.junit.jupiter.params.provider.MethodSource; public class TestWordToTextConverter { + private static final Logger LOG = LogManager.getLogger(WordToTextConverter.class); /** * [FAILING] Bug 47731 - Word Extractor considers text copied from some @@ -60,4 +79,38 @@ public class TestWordToTextConverter { assertNotNull(WordToTextConverter.getText(doc)); } } + + @ParameterizedTest + @MethodSource("files") + void testAllFiles(File file) throws Exception { + LOG.info("Testing " + file); + try (FileInputStream stream = new FileInputStream(file)) { + InputStream is = FileMagic.prepareToCheckMagic(stream); + FileMagic fm = FileMagic.valueOf(is); + + if (fm != FileMagic.OLE2) { + LOG.info("Skip non-doc file " + file); + + return; + } + + try (HWPFDocument doc = new HWPFDocument(is)) { + String foundText = WordToTextConverter.getText(doc); + assertNotNull(foundText); + } catch (OldWordFileFormatException | EncryptedDocumentException | RecordFormatException e) { + // ignored here + } + } + } + + public static Stream<Arguments> files() { + String dataDirName = System.getProperty(POIDataSamples.TEST_PROPERTY, + new File("test-data").exists() ? "test-data" : "../test-data"); + + File[] documents = new File(dataDirName, "document").listFiles( + (FilenameFilter) new SuffixFileFilter(".doc")); + assertNotNull(documents); + + return Arrays.stream(documents).map(Arguments::of); + } } |