From 3680dc8992284310e0f5ef420eb50a2aae479ef8 Mon Sep 17 00:00:00 2001 From: Nick Burch Date: Thu, 30 Jun 2016 21:20:23 +0000 Subject: [PATCH] #59739 When testing for a valid NFIB in a FileInformationBlock, accept common "nearby" values as found from CommonCrawl git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1750866 13f79535-47bb-0310-9956-ffa450edef68 --- .../poi/hwpf/model/FileInformationBlock.java | 45 +++++++++++-------- 1 file changed, 27 insertions(+), 18 deletions(-) diff --git a/src/scratchpad/src/org/apache/poi/hwpf/model/FileInformationBlock.java b/src/scratchpad/src/org/apache/poi/hwpf/model/FileInformationBlock.java index 01b090830c..d9905f8f65 100644 --- a/src/scratchpad/src/org/apache/poi/hwpf/model/FileInformationBlock.java +++ b/src/scratchpad/src/org/apache/poi/hwpf/model/FileInformationBlock.java @@ -143,36 +143,45 @@ public final class FileInformationBlock implements Cloneable private void assertCbRgFcLcb() { - switch ( getNFib() ) + int nfib = getNFib(); + String nfibHex = String.format("%04X", nfib); + + // Note - CommonCrawl shows there's more variation in these than + // the documentation suggests, so accept common clusters around + // the "correct" value as well + switch ( nfib ) { - case 0x00C1: - assertCbRgFcLcb( "0x00C1", 0x005D, "0x005D", _cbRgFcLcb ); + case 0x0071: + // Found in CommonCrawl corpus but not in the docs... break; - case 0x00D9: - assertCbRgFcLcb( "0x00D9", 0x006C, "0x006C", _cbRgFcLcb ); + case 0x00BE: + case 0x00BF: + case 0x00C0: + case 0x00C1: // Docs "official" + case 0x00C2: + case 0x00C3: + assertCbRgFcLcb(nfibHex, 0x005D, "0x005D", _cbRgFcLcb ); + break; + case 0x00D8: + case 0x00D9: // Docs "official" + assertCbRgFcLcb(nfibHex, 0x006C, "0x006C", _cbRgFcLcb ); break; case 0x0101: assertCbRgFcLcb( "0x0101", 0x0088, "0x0088", _cbRgFcLcb ); break; - case 0x010C: - assertCbRgFcLcb( "0x010C", 0x00A4, "0x00A4", _cbRgFcLcb ); + // TODO Is CommonCrawl 265 = 0x109 the one above or below? 
+ case 0x010B: + case 0x010C: // Docs "official" + assertCbRgFcLcb(nfibHex, 0x00A4, "0x00A4", _cbRgFcLcb ); break; case 0x0112: assertCbRgFcLcb( "0x0112", 0x00B7, "0x00B7", _cbRgFcLcb ); break; default: - /* -When running with the large CommonCrawl corpus we found the following ids in documents that are processed fine: -java.lang.IllegalStateException: Invalid file format version number: 113 -java.lang.IllegalStateException: Invalid file format version number: 191 -java.lang.IllegalStateException: Invalid file format version number: 192 -java.lang.IllegalStateException: Invalid file format version number: 194 -java.lang.IllegalStateException: Invalid file format version number: 195 -java.lang.IllegalStateException: Invalid file format version number: 216 -java.lang.IllegalStateException: Invalid file format version number: 265 -java.lang.IllegalStateException: Invalid file format version number: 267 + /* The Word spec has a much smaller list of "valid" values + * than what the large CommonCrawl corpus contains! */ - logger.log(POILogger.WARN, "Invalid file format version number: " + getNFib()); + logger.log(POILogger.WARN, "Invalid file format version number: " + nfib + "("+nfibHex+")"); } } -- 2.39.5