From: Nick Burch Date: Fri, 17 Sep 2010 13:46:11 +0000 (+0000) Subject: Fix support for sections in old word 6 / word 95 files X-Git-Tag: REL_3_7_BETA3~16 X-Git-Url: https://source.dussan.org/?a=commitdiff_plain;h=8cd8659010e6787f2cca2f9ce725b93e7994209f;p=poi.git Fix support for sections in old word 6 / word 95 files Improve unit testing for HWPFOldDocument Sprm fix also improves some HWPFDocument files too! git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@998131 13f79535-47bb-0310-9956-ffa450edef68 --- diff --git a/src/documentation/content/xdocs/status.xml b/src/documentation/content/xdocs/status.xml index 1332578fff..0da406cdc3 100644 --- a/src/documentation/content/xdocs/status.xml +++ b/src/documentation/content/xdocs/status.xml @@ -34,6 +34,7 @@ + 49933 - Support sections in Word 6 and Word 95 files (HWPFOldDocument) 49941 - Correctly handle space preservation of XSSFRichTextRuns when applying fonts to parts of the string Correct XWPFRun detection of bold/italic in a paragraph with multiple runs of different styles Link XWPFPicture to XWPFRun, so that embedded pictures can be access from where they live in the text stream diff --git a/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java b/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java index eaf70c116a..1c6ed2b1d2 100644 --- a/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java +++ b/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java @@ -245,22 +245,22 @@ public final class WordExtractor extends POIOLE2TextExtractor { * but slightly slower than getTextFromPieces(). */ public String getText() { - StringBuffer ret = new StringBuffer(); + StringBuffer ret = new StringBuffer(); - ret.append(getHeaderText()); + ret.append(getHeaderText()); - ArrayList text = new ArrayList(); - text.addAll(Arrays.asList(getParagraphText())); - text.addAll(Arrays.asList(getFootnoteText())); - text.addAll(Arrays.asList(getEndnoteText())); + ArrayList text = new ArrayList(); + text.addAll(Arrays.asList(getParagraphText())); + text.addAll(Arrays.asList(getFootnoteText())); + text.addAll(Arrays.asList(getEndnoteText())); - for(String p : text) { - ret.append(p); - } + for(String p : text) { + ret.append(p); + } - ret.append(getFooterText()); + ret.append(getFooterText()); - return ret.toString(); + return ret.toString(); } /** diff --git a/src/scratchpad/src/org/apache/poi/hwpf/model/OldSectionTable.java b/src/scratchpad/src/org/apache/poi/hwpf/model/OldSectionTable.java index dc992c69b6..d16edb16be 100644 --- a/src/scratchpad/src/org/apache/poi/hwpf/model/OldSectionTable.java +++ b/src/scratchpad/src/org/apache/poi/hwpf/model/OldSectionTable.java @@ -34,6 +34,7 @@ public final class OldSectionTable extends SectionTable TextPieceTable tpt) { PlexOfCps sedPlex = new PlexOfCps(documentStream, offset, size, 12); + CharIsBytes charConv = new CharIsBytes(tpt); int length = sedPlex.length(); @@ -49,7 +50,7 @@ public final class OldSectionTable extends SectionTable // check for the optimization if (fileOffset == 0xffffffff) { - _sections.add(new SEPX(sed, startAt, endAt, tpt, new byte[0])); + _sections.add(new SEPX(sed, startAt, endAt, charConv, new byte[0])); } else { @@ -58,8 +59,32 @@ public final class OldSectionTable extends SectionTable byte[] buf = new byte[sepxSize]; fileOffset += LittleEndian.SHORT_SIZE; System.arraycopy(documentStream, fileOffset, buf, 0, buf.length); - _sections.add(new SEPX(sed, startAt, endAt, tpt, buf)); + _sections.add(new SEPX(sed, startAt, endAt, charConv, buf)); } } } + + private static class CharIsBytes implements CharIndexTranslator { + private TextPieceTable tpt; + private CharIsBytes(TextPieceTable tpt) { + this.tpt = tpt; + } + + public int getCharIndex(int bytePos, int startCP) { + return bytePos; + } + public int getCharIndex(int bytePos) { + return bytePos; + } + + public boolean isIndexInTable(int bytePos) { + return tpt.isIndexInTable(bytePos); + } + public int lookIndexBackward(int bytePos) { + return tpt.lookIndexBackward(bytePos); + } + public int lookIndexForward(int bytePos) { + return tpt.lookIndexForward(bytePos); + } + } } diff --git a/src/scratchpad/src/org/apache/poi/hwpf/model/TextPieceTable.java b/src/scratchpad/src/org/apache/poi/hwpf/model/TextPieceTable.java index ccd7cc10f3..43bfcf8120 100644 --- a/src/scratchpad/src/org/apache/poi/hwpf/model/TextPieceTable.java +++ b/src/scratchpad/src/org/apache/poi/hwpf/model/TextPieceTable.java @@ -216,6 +216,8 @@ public class TextPieceTable implements CharIndexTranslator { if (bytePos< pieceStart || bytePos > pieceEnd) { toAdd = bytesLength; + } else if (bytePos > pieceStart && bytePos < pieceEnd) { + toAdd = (bytePos - pieceStart); } else { toAdd = bytesLength - (pieceEnd - bytePos); } diff --git a/src/scratchpad/src/org/apache/poi/hwpf/sprm/SprmIterator.java b/src/scratchpad/src/org/apache/poi/hwpf/sprm/SprmIterator.java index 1a6f23fdfb..9bfff7fb6a 100644 --- a/src/scratchpad/src/org/apache/poi/hwpf/sprm/SprmIterator.java +++ b/src/scratchpad/src/org/apache/poi/hwpf/sprm/SprmIterator.java @@ -37,7 +37,8 @@ public final class SprmIterator public boolean hasNext() { - return _offset < _grpprl.length; + // A Sprm is at least 2 bytes long + return _offset < (_grpprl.length-1); } public SprmOperation next() diff --git a/src/scratchpad/testcases/org/apache/poi/hwpf/AllHWPFTests.java b/src/scratchpad/testcases/org/apache/poi/hwpf/AllHWPFTests.java index d739136e76..6b4dd4514a 100644 --- a/src/scratchpad/testcases/org/apache/poi/hwpf/AllHWPFTests.java +++ b/src/scratchpad/testcases/org/apache/poi/hwpf/AllHWPFTests.java @@ -20,22 +20,68 @@ package org.apache.poi.hwpf; import junit.framework.Test; import junit.framework.TestSuite; -import org.apache.poi.hwpf.model.*; +import org.apache.poi.hwpf.extractor.TestWordExtractor; +import org.apache.poi.hwpf.extractor.TestWordExtractorBugs; +import org.apache.poi.hwpf.model.TestCHPBinTable; +import org.apache.poi.hwpf.model.TestDocumentProperties; +import org.apache.poi.hwpf.model.TestFileInformationBlock; +import org.apache.poi.hwpf.model.TestFontTable; +import org.apache.poi.hwpf.model.TestListTables; +import org.apache.poi.hwpf.model.TestPAPBinTable; +import org.apache.poi.hwpf.model.TestPlexOfCps; +import org.apache.poi.hwpf.model.TestRevisionMarkAuthorTable; +import org.apache.poi.hwpf.model.TestSavedByTable; +import org.apache.poi.hwpf.model.TestSectionTable; +import org.apache.poi.hwpf.model.TestStyleSheet; +import org.apache.poi.hwpf.model.TestTextPieceTable; +import org.apache.poi.hwpf.usermodel.TestBug46610; +import org.apache.poi.hwpf.usermodel.TestHWPFOldDocument; +import org.apache.poi.hwpf.usermodel.TestHeaderStories; +import org.apache.poi.hwpf.usermodel.TestPictures; +import org.apache.poi.hwpf.usermodel.TestProblems; +import org.apache.poi.hwpf.usermodel.TestRange; +import org.apache.poi.hwpf.usermodel.TestRangeDelete; +import org.apache.poi.hwpf.usermodel.TestRangeInsertion; +import org.apache.poi.hwpf.usermodel.TestRangeProperties; +import org.apache.poi.hwpf.usermodel.TestRangeReplacement; +import org.apache.poi.hwpf.usermodel.TestShapes; public final class AllHWPFTests { public static Test suite() { TestSuite suite = new TestSuite(AllHWPFTests.class.getName()); + + suite.addTestSuite(TestHWPFPictures.class); + suite.addTestSuite(TestHWPFRangeParts.class); + + suite.addTestSuite(TestWordExtractor.class); + suite.addTestSuite(TestWordExtractorBugs.class); + suite.addTestSuite(TestCHPBinTable.class); suite.addTestSuite(TestDocumentProperties.class); suite.addTestSuite(TestFileInformationBlock.class); suite.addTestSuite(TestFontTable.class); + suite.addTestSuite(TestListTables.class); suite.addTestSuite(TestPAPBinTable.class); suite.addTestSuite(TestPlexOfCps.class); + suite.addTestSuite(TestRevisionMarkAuthorTable.class); + suite.addTestSuite(TestSavedByTable.class); suite.addTestSuite(TestSectionTable.class); suite.addTestSuite(TestStyleSheet.class); suite.addTestSuite(TestTextPieceTable.class); - suite.addTestSuite(TestListTables.class); + + suite.addTestSuite(TestBug46610.class); + suite.addTestSuite(TestHeaderStories.class); + suite.addTestSuite(TestHWPFOldDocument.class); + suite.addTestSuite(TestPictures.class); + suite.addTestSuite(TestProblems.class); + suite.addTestSuite(TestRange.class); + suite.addTestSuite(TestRangeDelete.class); + suite.addTestSuite(TestRangeInsertion.class); + suite.addTestSuite(TestRangeProperties.class); + suite.addTestSuite(TestRangeReplacement.class); + suite.addTestSuite(TestShapes.class); + return suite; } } diff --git a/src/scratchpad/testcases/org/apache/poi/hwpf/HWPFTestDataSamples.java b/src/scratchpad/testcases/org/apache/poi/hwpf/HWPFTestDataSamples.java index 706fe55fa1..2c5253d9cf 100644 --- a/src/scratchpad/testcases/org/apache/poi/hwpf/HWPFTestDataSamples.java +++ b/src/scratchpad/testcases/org/apache/poi/hwpf/HWPFTestDataSamples.java @@ -17,6 +17,7 @@ package org.apache.poi.hwpf; import org.apache.poi.POIDataSamples; +import org.apache.poi.poifs.filesystem.POIFSFileSystem; import java.io.*; @@ -30,6 +31,14 @@ public class HWPFTestDataSamples { throw new RuntimeException(e); } } + public static HWPFOldDocument openOldSampleFile(String sampleFileName) { + try { + InputStream is = POIDataSamples.getDocumentInstance().openResourceAsStream(sampleFileName); + return new HWPFOldDocument(new POIFSFileSystem(is)); + } catch (IOException e) { + throw new RuntimeException(e); + } + } /** * Writes a spreadsheet to a ByteArrayOutputStream and reads it back * from a ByteArrayInputStream.

diff --git a/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordExtractor.java b/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordExtractor.java index ea69824daa..1ef21db3a5 100644 --- a/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordExtractor.java +++ b/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordExtractor.java @@ -52,7 +52,7 @@ public final class TestWordExtractor extends TestCase { // Well behaved document private WordExtractor extractor; - // Corrupted document - can't do paragraph based stuff + // Slightly iffy document private WordExtractor extractor2; // A word doc embeded in an excel file private String filename3; @@ -93,8 +93,11 @@ public final class TestWordExtractor extends TestCase { assertEquals(p_text1[i], text[i]); } - // On second one, should fall back - assertEquals(1, extractor2.getParagraphText().length); + // Lots of paragraphs with only a few lines in them + assertEquals(24, extractor2.getParagraphText().length); + assertEquals("as d\r\n", extractor2.getParagraphText()[16]); + assertEquals("as d\r\n", extractor2.getParagraphText()[17]); + assertEquals("as d\r\n", extractor2.getParagraphText()[18]); } /** @@ -103,8 +106,11 @@ public final class TestWordExtractor extends TestCase { public void testGetText() { assertEquals(p_text1_block, extractor.getText()); - // On second one, should fall back to text piece - assertEquals(extractor2.getTextFromPieces(), extractor2.getText()); + // For the 2nd, should give similar answers for + // the two methods, differing only in line endings + assertEquals( + extractor2.getTextFromPieces().replaceAll("[\\r\\n]", ""), + extractor2.getText().replaceAll("[\\r\\n]", "")); } /** diff --git a/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestHWPFOldDocument.java b/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestHWPFOldDocument.java new file mode 100644 index 0000000000..fc20c7157f --- /dev/null +++ b/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestHWPFOldDocument.java @@ -0,0 +1,122 @@ +/* ==================================================================== + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +==================================================================== */ + +package org.apache.poi.hwpf.usermodel; + +import org.apache.poi.OldFileFormatException; +import org.apache.poi.hwpf.HWPFOldDocument; +import org.apache.poi.hwpf.HWPFTestCase; +import org.apache.poi.hwpf.HWPFTestDataSamples; + +/** + * Tests for Word 6 and Word 95 support + */ +public final class TestHWPFOldDocument extends HWPFTestCase { + /** + * Test a simple Word 6 document + */ + public void testWord6() throws Exception { + // Can't open as HWPFDocument + try { + HWPFTestDataSamples.openSampleFile("Word6.doc"); + fail("Shouldn't be openable"); + } catch(OldFileFormatException e) {} + + // Open + HWPFOldDocument doc = HWPFTestDataSamples.openOldSampleFile("Word6.doc"); + + // Check + assertEquals(1, doc.getRange().numSections()); + assertEquals(1, doc.getRange().numParagraphs()); + assertEquals(1, doc.getRange().numCharacterRuns()); + + assertEquals( + "The quick brown fox jumps over the lazy dog\r", + doc.getRange().getParagraph(0).text() + ); + } + + /** + * Test a simple Word 95 document + */ + public void testWord95() throws Exception { + // Can't open as HWPFDocument + try { + HWPFTestDataSamples.openSampleFile("Word95.doc"); + fail("Shouldn't be openable"); + } catch(OldFileFormatException e) {} + + // Open + HWPFOldDocument doc = HWPFTestDataSamples.openOldSampleFile("Word95.doc"); + + // Check + assertEquals(1, doc.getRange().numSections()); + assertEquals(7, doc.getRange().numParagraphs()); + + assertEquals( + "The quick brown fox jumps over the lazy dog\r", + doc.getRange().getParagraph(0).text() + ); + assertEquals("\r", doc.getRange().getParagraph(1).text()); + assertEquals( + "Paragraph 2\r", + doc.getRange().getParagraph(2).text() + ); + assertEquals("\r", doc.getRange().getParagraph(3).text()); + assertEquals( + "Paragraph 3. Has some RED text and some " + + "BLUE BOLD text in it.\r", + doc.getRange().getParagraph(4).text() + ); + assertEquals("\r", doc.getRange().getParagraph(5).text()); + assertEquals( + "Last (4th) paragraph.\r", + doc.getRange().getParagraph(6).text() + ); + + assertEquals(1, doc.getRange().getParagraph(0).numCharacterRuns()); + assertEquals(1, doc.getRange().getParagraph(1).numCharacterRuns()); + assertEquals(1, doc.getRange().getParagraph(2).numCharacterRuns()); + assertEquals(1, doc.getRange().getParagraph(3).numCharacterRuns()); + // Normal, red, normal, blue+bold, normal + assertEquals(5, doc.getRange().getParagraph(4).numCharacterRuns()); + assertEquals(1, doc.getRange().getParagraph(5).numCharacterRuns()); + // Normal, superscript for 4th, normal + assertEquals(3, doc.getRange().getParagraph(6).numCharacterRuns()); + } + + /** + * Test a word document that has sections, + * as well as the usual paragraph stuff. + */ + public void testWord6Sections() throws Exception { + HWPFOldDocument doc = HWPFTestDataSamples.openOldSampleFile("Word6_sections.doc"); + + assertEquals(3, doc.getRange().numSections()); + assertEquals(6, doc.getRange().numParagraphs()); + + assertEquals( + "This is a test.\r", + doc.getRange().getParagraph(0).text() + ); + assertEquals("\r", doc.getRange().getParagraph(1).text()); + assertEquals("\u000c", doc.getRange().getParagraph(2).text()); // Section line? + assertEquals("This is a new section.\r", doc.getRange().getParagraph(3).text()); + assertEquals("\u000c", doc.getRange().getParagraph(4).text()); // Section line? + assertEquals("\r", doc.getRange().getParagraph(5).text()); + } +} diff --git a/test-data/document/Word6_sections.doc b/test-data/document/Word6_sections.doc new file mode 100644 index 0000000000..e7ac989356 Binary files /dev/null and b/test-data/document/Word6_sections.doc differ