From 8cd8659010e6787f2cca2f9ce725b93e7994209f Mon Sep 17 00:00:00 2001 From: Nick Burch Date: Fri, 17 Sep 2010 13:46:11 +0000 Subject: [PATCH] Fix support for sections in old word 6 / word 95 files Improve unit testing for HWPFOldDocument Sprm fix also improves some HWPFDocument files too! git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@998131 13f79535-47bb-0310-9956-ffa450edef68 --- src/documentation/content/xdocs/status.xml | 1 + .../poi/hwpf/extractor/WordExtractor.java | 22 ++-- .../poi/hwpf/model/OldSectionTable.java | 29 ++++- .../apache/poi/hwpf/model/TextPieceTable.java | 2 + .../apache/poi/hwpf/sprm/SprmIterator.java | 3 +- .../org/apache/poi/hwpf/AllHWPFTests.java | 50 ++++++- .../apache/poi/hwpf/HWPFTestDataSamples.java | 9 ++ .../poi/hwpf/extractor/TestWordExtractor.java | 16 ++- .../hwpf/usermodel/TestHWPFOldDocument.java | 122 ++++++++++++++++++ test-data/document/Word6_sections.doc | Bin 0 -> 6656 bytes 10 files changed, 233 insertions(+), 21 deletions(-) create mode 100644 src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestHWPFOldDocument.java create mode 100644 test-data/document/Word6_sections.doc diff --git a/src/documentation/content/xdocs/status.xml b/src/documentation/content/xdocs/status.xml index 1332578fff..0da406cdc3 100644 --- a/src/documentation/content/xdocs/status.xml +++ b/src/documentation/content/xdocs/status.xml @@ -34,6 +34,7 @@ + 49933 - Support sections in Word 6 and Word 95 files (HWPFOldDocument) 49941 - Correctly handle space preservation of XSSFRichTextRuns when applying fonts to parts of the string Correct XWPFRun detection of bold/italic in a paragraph with multiple runs of different styles Link XWPFPicture to XWPFRun, so that embedded pictures can be access from where they live in the text stream diff --git a/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java b/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java index eaf70c116a..1c6ed2b1d2 100644 --- a/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java +++ b/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java @@ -245,22 +245,22 @@ public final class WordExtractor extends POIOLE2TextExtractor { * but slightly slower than getTextFromPieces(). */ public String getText() { - StringBuffer ret = new StringBuffer(); + StringBuffer ret = new StringBuffer(); - ret.append(getHeaderText()); + ret.append(getHeaderText()); - ArrayList text = new ArrayList(); - text.addAll(Arrays.asList(getParagraphText())); - text.addAll(Arrays.asList(getFootnoteText())); - text.addAll(Arrays.asList(getEndnoteText())); + ArrayList text = new ArrayList(); + text.addAll(Arrays.asList(getParagraphText())); + text.addAll(Arrays.asList(getFootnoteText())); + text.addAll(Arrays.asList(getEndnoteText())); - for(String p : text) { - ret.append(p); - } + for(String p : text) { + ret.append(p); + } - ret.append(getFooterText()); + ret.append(getFooterText()); - return ret.toString(); + return ret.toString(); } /** diff --git a/src/scratchpad/src/org/apache/poi/hwpf/model/OldSectionTable.java b/src/scratchpad/src/org/apache/poi/hwpf/model/OldSectionTable.java index dc992c69b6..d16edb16be 100644 --- a/src/scratchpad/src/org/apache/poi/hwpf/model/OldSectionTable.java +++ b/src/scratchpad/src/org/apache/poi/hwpf/model/OldSectionTable.java @@ -34,6 +34,7 @@ public final class OldSectionTable extends SectionTable TextPieceTable tpt) { PlexOfCps sedPlex = new PlexOfCps(documentStream, offset, size, 12); + CharIsBytes charConv = new CharIsBytes(tpt); int length = sedPlex.length(); @@ -49,7 +50,7 @@ public final class OldSectionTable extends SectionTable // check for the optimization if (fileOffset == 0xffffffff) { - _sections.add(new SEPX(sed, startAt, endAt, tpt, new byte[0])); + _sections.add(new SEPX(sed, startAt, endAt, charConv, new byte[0])); } else { @@ -58,8 +59,32 @@ public final class OldSectionTable extends SectionTable byte[] buf = new byte[sepxSize]; fileOffset += LittleEndian.SHORT_SIZE; System.arraycopy(documentStream, fileOffset, buf, 0, buf.length); - _sections.add(new SEPX(sed, startAt, endAt, tpt, buf)); + _sections.add(new SEPX(sed, startAt, endAt, charConv, buf)); } } } + + private static class CharIsBytes implements CharIndexTranslator { + private TextPieceTable tpt; + private CharIsBytes(TextPieceTable tpt) { + this.tpt = tpt; + } + + public int getCharIndex(int bytePos, int startCP) { + return bytePos; + } + public int getCharIndex(int bytePos) { + return bytePos; + } + + public boolean isIndexInTable(int bytePos) { + return tpt.isIndexInTable(bytePos); + } + public int lookIndexBackward(int bytePos) { + return tpt.lookIndexBackward(bytePos); + } + public int lookIndexForward(int bytePos) { + return tpt.lookIndexForward(bytePos); + } + } } diff --git a/src/scratchpad/src/org/apache/poi/hwpf/model/TextPieceTable.java b/src/scratchpad/src/org/apache/poi/hwpf/model/TextPieceTable.java index ccd7cc10f3..43bfcf8120 100644 --- a/src/scratchpad/src/org/apache/poi/hwpf/model/TextPieceTable.java +++ b/src/scratchpad/src/org/apache/poi/hwpf/model/TextPieceTable.java @@ -216,6 +216,8 @@ public class TextPieceTable implements CharIndexTranslator { if (bytePos< pieceStart || bytePos > pieceEnd) { toAdd = bytesLength; + } else if (bytePos > pieceStart && bytePos < pieceEnd) { + toAdd = (bytePos - pieceStart); } else { toAdd = bytesLength - (pieceEnd - bytePos); } diff --git a/src/scratchpad/src/org/apache/poi/hwpf/sprm/SprmIterator.java b/src/scratchpad/src/org/apache/poi/hwpf/sprm/SprmIterator.java index 1a6f23fdfb..9bfff7fb6a 100644 --- a/src/scratchpad/src/org/apache/poi/hwpf/sprm/SprmIterator.java +++ b/src/scratchpad/src/org/apache/poi/hwpf/sprm/SprmIterator.java @@ -37,7 +37,8 @@ public final class SprmIterator public boolean hasNext() { - return _offset < _grpprl.length; + // A Sprm is at least 2 bytes long + return _offset < (_grpprl.length-1); } public SprmOperation next() diff --git a/src/scratchpad/testcases/org/apache/poi/hwpf/AllHWPFTests.java b/src/scratchpad/testcases/org/apache/poi/hwpf/AllHWPFTests.java index d739136e76..6b4dd4514a 100644 --- a/src/scratchpad/testcases/org/apache/poi/hwpf/AllHWPFTests.java +++ b/src/scratchpad/testcases/org/apache/poi/hwpf/AllHWPFTests.java @@ -20,22 +20,68 @@ package org.apache.poi.hwpf; import junit.framework.Test; import junit.framework.TestSuite; -import org.apache.poi.hwpf.model.*; +import org.apache.poi.hwpf.extractor.TestWordExtractor; +import org.apache.poi.hwpf.extractor.TestWordExtractorBugs; +import org.apache.poi.hwpf.model.TestCHPBinTable; +import org.apache.poi.hwpf.model.TestDocumentProperties; +import org.apache.poi.hwpf.model.TestFileInformationBlock; +import org.apache.poi.hwpf.model.TestFontTable; +import org.apache.poi.hwpf.model.TestListTables; +import org.apache.poi.hwpf.model.TestPAPBinTable; +import org.apache.poi.hwpf.model.TestPlexOfCps; +import org.apache.poi.hwpf.model.TestRevisionMarkAuthorTable; +import org.apache.poi.hwpf.model.TestSavedByTable; +import org.apache.poi.hwpf.model.TestSectionTable; +import org.apache.poi.hwpf.model.TestStyleSheet; +import org.apache.poi.hwpf.model.TestTextPieceTable; +import org.apache.poi.hwpf.usermodel.TestBug46610; +import org.apache.poi.hwpf.usermodel.TestHWPFOldDocument; +import org.apache.poi.hwpf.usermodel.TestHeaderStories; +import org.apache.poi.hwpf.usermodel.TestPictures; +import org.apache.poi.hwpf.usermodel.TestProblems; +import org.apache.poi.hwpf.usermodel.TestRange; +import org.apache.poi.hwpf.usermodel.TestRangeDelete; +import org.apache.poi.hwpf.usermodel.TestRangeInsertion; +import org.apache.poi.hwpf.usermodel.TestRangeProperties; +import org.apache.poi.hwpf.usermodel.TestRangeReplacement; +import org.apache.poi.hwpf.usermodel.TestShapes; public final class AllHWPFTests { public static Test suite() { TestSuite suite = new TestSuite(AllHWPFTests.class.getName()); + + suite.addTestSuite(TestHWPFPictures.class); + suite.addTestSuite(TestHWPFRangeParts.class); + + suite.addTestSuite(TestWordExtractor.class); + suite.addTestSuite(TestWordExtractorBugs.class); + suite.addTestSuite(TestCHPBinTable.class); suite.addTestSuite(TestDocumentProperties.class); suite.addTestSuite(TestFileInformationBlock.class); suite.addTestSuite(TestFontTable.class); + suite.addTestSuite(TestListTables.class); suite.addTestSuite(TestPAPBinTable.class); suite.addTestSuite(TestPlexOfCps.class); + suite.addTestSuite(TestRevisionMarkAuthorTable.class); + suite.addTestSuite(TestSavedByTable.class); suite.addTestSuite(TestSectionTable.class); suite.addTestSuite(TestStyleSheet.class); suite.addTestSuite(TestTextPieceTable.class); - suite.addTestSuite(TestListTables.class); + + suite.addTestSuite(TestBug46610.class); + suite.addTestSuite(TestHeaderStories.class); + suite.addTestSuite(TestHWPFOldDocument.class); + suite.addTestSuite(TestPictures.class); + suite.addTestSuite(TestProblems.class); + suite.addTestSuite(TestRange.class); + suite.addTestSuite(TestRangeDelete.class); + suite.addTestSuite(TestRangeInsertion.class); + suite.addTestSuite(TestRangeProperties.class); + suite.addTestSuite(TestRangeReplacement.class); + suite.addTestSuite(TestShapes.class); + return suite; } } diff --git a/src/scratchpad/testcases/org/apache/poi/hwpf/HWPFTestDataSamples.java b/src/scratchpad/testcases/org/apache/poi/hwpf/HWPFTestDataSamples.java index 706fe55fa1..2c5253d9cf 100644 --- a/src/scratchpad/testcases/org/apache/poi/hwpf/HWPFTestDataSamples.java +++ b/src/scratchpad/testcases/org/apache/poi/hwpf/HWPFTestDataSamples.java @@ -17,6 +17,7 @@ package org.apache.poi.hwpf; import org.apache.poi.POIDataSamples; +import org.apache.poi.poifs.filesystem.POIFSFileSystem; import java.io.*; @@ -30,6 +31,14 @@ public class HWPFTestDataSamples { throw new RuntimeException(e); } } + public static HWPFOldDocument openOldSampleFile(String sampleFileName) { + try { + InputStream is = POIDataSamples.getDocumentInstance().openResourceAsStream(sampleFileName); + return new HWPFOldDocument(new POIFSFileSystem(is)); + } catch (IOException e) { + throw new RuntimeException(e); + } + } /** * Writes a spreadsheet to a ByteArrayOutputStream and reads it back * from a ByteArrayInputStream.

diff --git a/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordExtractor.java b/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordExtractor.java index ea69824daa..1ef21db3a5 100644 --- a/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordExtractor.java +++ b/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordExtractor.java @@ -52,7 +52,7 @@ public final class TestWordExtractor extends TestCase { // Well behaved document private WordExtractor extractor; - // Corrupted document - can't do paragraph based stuff + // Slightly iffy document private WordExtractor extractor2; // A word doc embeded in an excel file private String filename3; @@ -93,8 +93,11 @@ public final class TestWordExtractor extends TestCase { assertEquals(p_text1[i], text[i]); } - // On second one, should fall back - assertEquals(1, extractor2.getParagraphText().length); + // Lots of paragraphs with only a few lines in them + assertEquals(24, extractor2.getParagraphText().length); + assertEquals("as d\r\n", extractor2.getParagraphText()[16]); + assertEquals("as d\r\n", extractor2.getParagraphText()[17]); + assertEquals("as d\r\n", extractor2.getParagraphText()[18]); } /** @@ -103,8 +106,11 @@ public final class TestWordExtractor extends TestCase { public void testGetText() { assertEquals(p_text1_block, extractor.getText()); - // On second one, should fall back to text piece - assertEquals(extractor2.getTextFromPieces(), extractor2.getText()); + // For the 2nd, should give similar answers for + // the two methods, differing only in line endings + assertEquals( + extractor2.getTextFromPieces().replaceAll("[\\r\\n]", ""), + extractor2.getText().replaceAll("[\\r\\n]", "")); } /** diff --git a/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestHWPFOldDocument.java b/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestHWPFOldDocument.java new file mode 100644 index 0000000000..fc20c7157f --- /dev/null +++ b/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestHWPFOldDocument.java @@ -0,0 +1,122 @@ +/* ==================================================================== + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +==================================================================== */ + +package org.apache.poi.hwpf.usermodel; + +import org.apache.poi.OldFileFormatException; +import org.apache.poi.hwpf.HWPFOldDocument; +import org.apache.poi.hwpf.HWPFTestCase; +import org.apache.poi.hwpf.HWPFTestDataSamples; + +/** + * Tests for Word 6 and Word 95 support + */ +public final class TestHWPFOldDocument extends HWPFTestCase { + /** + * Test a simple Word 6 document + */ + public void testWord6() throws Exception { + // Can't open as HWPFDocument + try { + HWPFTestDataSamples.openSampleFile("Word6.doc"); + fail("Shouldn't be openable"); + } catch(OldFileFormatException e) {} + + // Open + HWPFOldDocument doc = HWPFTestDataSamples.openOldSampleFile("Word6.doc"); + + // Check + assertEquals(1, doc.getRange().numSections()); + assertEquals(1, doc.getRange().numParagraphs()); + assertEquals(1, doc.getRange().numCharacterRuns()); + + assertEquals( + "The quick brown fox jumps over the lazy dog\r", + doc.getRange().getParagraph(0).text() + ); + } + + /** + * Test a simple Word 95 document + */ + public void testWord95() throws Exception { + // Can't open as HWPFDocument + try { + HWPFTestDataSamples.openSampleFile("Word95.doc"); + fail("Shouldn't be openable"); + } catch(OldFileFormatException e) {} + + // Open + HWPFOldDocument doc = HWPFTestDataSamples.openOldSampleFile("Word95.doc"); + + // Check + assertEquals(1, doc.getRange().numSections()); + assertEquals(7, doc.getRange().numParagraphs()); + + assertEquals( + "The quick brown fox jumps over the lazy dog\r", + doc.getRange().getParagraph(0).text() + ); + assertEquals("\r", doc.getRange().getParagraph(1).text()); + assertEquals( + "Paragraph 2\r", + doc.getRange().getParagraph(2).text() + ); + assertEquals("\r", doc.getRange().getParagraph(3).text()); + assertEquals( + "Paragraph 3. Has some RED text and some " + + "BLUE BOLD text in it.\r", + doc.getRange().getParagraph(4).text() + ); + assertEquals("\r", doc.getRange().getParagraph(5).text()); + assertEquals( + "Last (4th) paragraph.\r", + doc.getRange().getParagraph(6).text() + ); + + assertEquals(1, doc.getRange().getParagraph(0).numCharacterRuns()); + assertEquals(1, doc.getRange().getParagraph(1).numCharacterRuns()); + assertEquals(1, doc.getRange().getParagraph(2).numCharacterRuns()); + assertEquals(1, doc.getRange().getParagraph(3).numCharacterRuns()); + // Normal, red, normal, blue+bold, normal + assertEquals(5, doc.getRange().getParagraph(4).numCharacterRuns()); + assertEquals(1, doc.getRange().getParagraph(5).numCharacterRuns()); + // Normal, superscript for 4th, normal + assertEquals(3, doc.getRange().getParagraph(6).numCharacterRuns()); + } + + /** + * Test a word document that has sections, + * as well as the usual paragraph stuff. + */ + public void testWord6Sections() throws Exception { + HWPFOldDocument doc = HWPFTestDataSamples.openOldSampleFile("Word6_sections.doc"); + + assertEquals(3, doc.getRange().numSections()); + assertEquals(6, doc.getRange().numParagraphs()); + + assertEquals( + "This is a test.\r", + doc.getRange().getParagraph(0).text() + ); + assertEquals("\r", doc.getRange().getParagraph(1).text()); + assertEquals("\u000c", doc.getRange().getParagraph(2).text()); // Section line? + assertEquals("This is a new section.\r", doc.getRange().getParagraph(3).text()); + assertEquals("\u000c", doc.getRange().getParagraph(4).text()); // Section line? + assertEquals("\r", doc.getRange().getParagraph(5).text()); + } +} diff --git a/test-data/document/Word6_sections.doc b/test-data/document/Word6_sections.doc new file mode 100644 index 0000000000000000000000000000000000000000..e7ac98935676be7ea5ed06f7f77b1edda70add4a GIT binary patch literal 6656 zcmeHL&2Jk;6n|^46T6LF$0Wf`3S9~~ZJM~{Lsca%acdA2DV1uAa3G+yy~a!9-D?jZ}T zySnl_l<^WH0KfA=APU3)?%VCw_t4V%ss^yYSu*iwlR{6Ef!d-Ac+G%u(b0+P`wRL9`k%x2#5$xAD-v02`+L>DN^LL|kc=DTHkAn6F z!1o*X9|!n;n+4d%=K%Ke7XkM53c&t;9$+PJt~ZVr_-@Lf3-JTj51|b(aAO4Dnzr%Q1GuRw$`w-@%YdN#n4_?8d}-bO(Pp0 zijUAoZ)UH4pZnO5KQZJ_4f!)eUd5%enR*FV&}Ql?UFu22yE_PbJPD+LLjdo26IjRJ zUw3!rzVsclKxj##@B$9~2o4?b{)bU}J}`wg`$5yu&#hc#J3Z~#dioRhVgX#w0|^{p zQQSJuiHKUny2(FVA;i3cd9A80E!s}J@&rvw2`bS-n96BVW6^Vbxwn@LPf~^kpVicgZoJC3`W_nngT|IADh<#= zgv$F!p%Y2wQz%x{mTjeER*)t}5%VG52Pen=N0O41l150L(;GJ8CC&^S`3G43N%cG* zUFK5Jdc~@tXWqL8Ui&$g$wnPxk$S+%#yQob&)dY zSQ(zvOPZy!;z~h-0XL0jHtNnc0m&;#zckfoxHFx=qNM0MX`=6-aZN0G?c&WUFO7?1 z1;kY%IdXzsnclh#W)Z$}%H&X)!etUm4p_E|pIKDs`j6K?$WA0z-$VR7z4lcev`8E6 zCD5R`P?8npoRG(Ith^N+-Tm@EvA}=%{YN-zTMl`rWwLj!aDs&*(Mf!Nsw2lbgVXIb zhq#U36ugc7jb(})HsIKj%O_#p#oIHlfD#+$`@g?+mK?12-^~9v$i