<changes>
<release version="3.7-beta3" date="2010-??-??">
+ <action dev="poi-developers" type="fix">49933 - Support sections in Word 6 and Word 95 files (HWPFOldDocument)</action>
<action dev="poi-developers" type="fix">49941 - Correctly handle space preservation of XSSFRichTextRuns when applying fonts to parts of the string</action>
<action dev="poi-developers" type="fix">Correct XWPFRun detection of bold/italic in a paragraph with multiple runs of different styles</action>
<action dev="poi-developers" type="add">Link XWPFPicture to XWPFRun, so that embedded pictures can be accessed from where they live in the text stream</action>
* but slightly slower than getTextFromPieces().
*/
public String getText() {
- StringBuffer ret = new StringBuffer();
+ StringBuffer ret = new StringBuffer();
- ret.append(getHeaderText());
+ ret.append(getHeaderText()); // header text goes first
- ArrayList<String> text = new ArrayList<String>();
- text.addAll(Arrays.asList(getParagraphText()));
- text.addAll(Arrays.asList(getFootnoteText()));
- text.addAll(Arrays.asList(getEndnoteText()));
+ ArrayList<String> text = new ArrayList<String>(); // body pieces, in output order
+ text.addAll(Arrays.asList(getParagraphText()));
+ text.addAll(Arrays.asList(getFootnoteText()));
+ text.addAll(Arrays.asList(getEndnoteText()));
- for(String p : text) {
- ret.append(p);
- }
+ for(String p : text) {
+ ret.append(p); // appended as-is, no separator is inserted between pieces
+ }
- ret.append(getFooterText());
+ ret.append(getFooterText()); // footer text goes last
- return ret.toString();
+ return ret.toString();
}
/**
TextPieceTable tpt)
{
PlexOfCps sedPlex = new PlexOfCps(documentStream, offset, size, 12);
+ CharIsBytes charConv = new CharIsBytes(tpt);
int length = sedPlex.length();
// check for the optimization
if (fileOffset == 0xffffffff)
{
- _sections.add(new SEPX(sed, startAt, endAt, tpt, new byte[0]));
+ _sections.add(new SEPX(sed, startAt, endAt, charConv, new byte[0]));
}
else
{
byte[] buf = new byte[sepxSize];
fileOffset += LittleEndian.SHORT_SIZE;
System.arraycopy(documentStream, fileOffset, buf, 0, buf.length);
- _sections.add(new SEPX(sed, startAt, endAt, tpt, buf));
+ _sections.add(new SEPX(sed, startAt, endAt, charConv, buf));
}
}
}
+
+ private static class CharIsBytes implements CharIndexTranslator { // Translator whose character index is the byte position itself
+ private TextPieceTable tpt; // wrapped table; only used for the in-table / look-forward / look-backward queries
+ private CharIsBytes(TextPieceTable tpt) {
+ this.tpt = tpt;
+ }
+
+ public int getCharIndex(int bytePos, int startCP) { // startCP is ignored
+ return bytePos; // identity mapping - presumably because each character is one byte in these files; TODO confirm
+ }
+ public int getCharIndex(int bytePos) {
+ return bytePos; // identity mapping, same as the two-argument overload
+ }
+
+ public boolean isIndexInTable(int bytePos) {
+ return tpt.isIndexInTable(bytePos); // delegated unchanged to the wrapped TextPieceTable
+ }
+ public int lookIndexBackward(int bytePos) {
+ return tpt.lookIndexBackward(bytePos); // delegated unchanged to the wrapped TextPieceTable
+ }
+ public int lookIndexForward(int bytePos) {
+ return tpt.lookIndexForward(bytePos); // delegated unchanged to the wrapped TextPieceTable
+ }
+ }
}
if (bytePos< pieceStart || bytePos > pieceEnd) {
toAdd = bytesLength;
+ } else if (bytePos > pieceStart && bytePos < pieceEnd) {
+ toAdd = (bytePos - pieceStart);
} else {
toAdd = bytesLength - (pieceEnd - bytePos);
}
public boolean hasNext()
{
- return _offset < _grpprl.length;
+ // A Sprm is at least 2 bytes long, so report no further element once fewer than 2 bytes remain from _offset
+ return _offset < (_grpprl.length-1);
}
public SprmOperation next()
import junit.framework.Test;
import junit.framework.TestSuite;
-import org.apache.poi.hwpf.model.*;
+import org.apache.poi.hwpf.extractor.TestWordExtractor;
+import org.apache.poi.hwpf.extractor.TestWordExtractorBugs;
+import org.apache.poi.hwpf.model.TestCHPBinTable;
+import org.apache.poi.hwpf.model.TestDocumentProperties;
+import org.apache.poi.hwpf.model.TestFileInformationBlock;
+import org.apache.poi.hwpf.model.TestFontTable;
+import org.apache.poi.hwpf.model.TestListTables;
+import org.apache.poi.hwpf.model.TestPAPBinTable;
+import org.apache.poi.hwpf.model.TestPlexOfCps;
+import org.apache.poi.hwpf.model.TestRevisionMarkAuthorTable;
+import org.apache.poi.hwpf.model.TestSavedByTable;
+import org.apache.poi.hwpf.model.TestSectionTable;
+import org.apache.poi.hwpf.model.TestStyleSheet;
+import org.apache.poi.hwpf.model.TestTextPieceTable;
+import org.apache.poi.hwpf.usermodel.TestBug46610;
+import org.apache.poi.hwpf.usermodel.TestHWPFOldDocument;
+import org.apache.poi.hwpf.usermodel.TestHeaderStories;
+import org.apache.poi.hwpf.usermodel.TestPictures;
+import org.apache.poi.hwpf.usermodel.TestProblems;
+import org.apache.poi.hwpf.usermodel.TestRange;
+import org.apache.poi.hwpf.usermodel.TestRangeDelete;
+import org.apache.poi.hwpf.usermodel.TestRangeInsertion;
+import org.apache.poi.hwpf.usermodel.TestRangeProperties;
+import org.apache.poi.hwpf.usermodel.TestRangeReplacement;
+import org.apache.poi.hwpf.usermodel.TestShapes;
public final class AllHWPFTests { // Collects the HWPF test classes into a single junit.framework TestSuite
public static Test suite() { // builds the suite; the suite is named after this class
TestSuite suite = new TestSuite(AllHWPFTests.class.getName());
+
+ suite.addTestSuite(TestHWPFPictures.class);
+ suite.addTestSuite(TestHWPFRangeParts.class);
+
+ suite.addTestSuite(TestWordExtractor.class);
+ suite.addTestSuite(TestWordExtractorBugs.class);
+
suite.addTestSuite(TestCHPBinTable.class);
suite.addTestSuite(TestDocumentProperties.class);
suite.addTestSuite(TestFileInformationBlock.class);
suite.addTestSuite(TestFontTable.class);
+ suite.addTestSuite(TestListTables.class);
suite.addTestSuite(TestPAPBinTable.class);
suite.addTestSuite(TestPlexOfCps.class);
+ suite.addTestSuite(TestRevisionMarkAuthorTable.class);
+ suite.addTestSuite(TestSavedByTable.class);
suite.addTestSuite(TestSectionTable.class);
suite.addTestSuite(TestStyleSheet.class);
suite.addTestSuite(TestTextPieceTable.class);
- suite.addTestSuite(TestListTables.class);
+
+ suite.addTestSuite(TestBug46610.class);
+ suite.addTestSuite(TestHeaderStories.class);
+ suite.addTestSuite(TestHWPFOldDocument.class);
+ suite.addTestSuite(TestPictures.class);
+ suite.addTestSuite(TestProblems.class);
+ suite.addTestSuite(TestRange.class);
+ suite.addTestSuite(TestRangeDelete.class);
+ suite.addTestSuite(TestRangeInsertion.class);
+ suite.addTestSuite(TestRangeProperties.class);
+ suite.addTestSuite(TestRangeReplacement.class);
+ suite.addTestSuite(TestShapes.class);
+
return suite;
}
}
package org.apache.poi.hwpf;\r
\r
import org.apache.poi.POIDataSamples;\r
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;\r
\r
import java.io.*;\r
\r
throw new RuntimeException(e);\r
}\r
}\r
+ public static HWPFOldDocument openOldSampleFile(String sampleFileName) { /* Opens the named sample resource as an HWPFOldDocument; IOException is rethrown wrapped in RuntimeException */\r
+ try {\r
+ InputStream is = POIDataSamples.getDocumentInstance().openResourceAsStream(sampleFileName); /* NOTE(review): this method never closes 'is' - confirm whether POIFSFileSystem takes care of it */\r
+ return new HWPFOldDocument(new POIFSFileSystem(is));\r
+ } catch (IOException e) {\r
+ throw new RuntimeException(e); /* original exception is kept as the cause */\r
+ }\r
+ }\r
/**\r
* Writes a spreadsheet to a <tt>ByteArrayOutputStream</tt> and reads it back\r
* from a <tt>ByteArrayInputStream</tt>.<p/>\r
// Well behaved document
private WordExtractor extractor;
- // Corrupted document - can't do paragraph based stuff
+ // Slightly iffy document
private WordExtractor extractor2;
// A word doc embedded in an excel file
private String filename3;
assertEquals(p_text1[i], text[i]);
}
- // On second one, should fall back
- assertEquals(1, extractor2.getParagraphText().length);
+ // Lots of paragraphs with only a few lines in them
+ assertEquals(24, extractor2.getParagraphText().length);
+ assertEquals("as d\r\n", extractor2.getParagraphText()[16]);
+ assertEquals("as d\r\n", extractor2.getParagraphText()[17]);
+ assertEquals("as d\r\n", extractor2.getParagraphText()[18]);
}
/**
public void testGetText() {
assertEquals(p_text1_block, extractor.getText());
- // On second one, should fall back to text piece
- assertEquals(extractor2.getTextFromPieces(), extractor2.getText());
+ // For the 2nd, should give similar answers for
+ // the two methods, differing only in line endings
+ assertEquals(
+ extractor2.getTextFromPieces().replaceAll("[\\r\\n]", ""),
+ extractor2.getText().replaceAll("[\\r\\n]", ""));
}
/**
--- /dev/null
+/* ====================================================================
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+==================================================================== */
+
+package org.apache.poi.hwpf.usermodel;
+
+import org.apache.poi.OldFileFormatException;
+import org.apache.poi.hwpf.HWPFOldDocument;
+import org.apache.poi.hwpf.HWPFTestCase;
+import org.apache.poi.hwpf.HWPFTestDataSamples;
+
+/**
+ * Tests for Word 6 and Word 95 support
+ */
+public final class TestHWPFOldDocument extends HWPFTestCase {
+ /**
+ * Test a simple Word 6 document
+ */
+ public void testWord6() throws Exception {
+ // Can't open as HWPFDocument
+ try {
+ HWPFTestDataSamples.openSampleFile("Word6.doc");
+ fail("Shouldn't be openable");
+ } catch(OldFileFormatException e) {} // intentionally empty: this exception is the outcome the test requires
+
+ // Open
+ HWPFOldDocument doc = HWPFTestDataSamples.openOldSampleFile("Word6.doc");
+
+ // Check
+ assertEquals(1, doc.getRange().numSections());
+ assertEquals(1, doc.getRange().numParagraphs());
+ assertEquals(1, doc.getRange().numCharacterRuns());
+
+ assertEquals(
+ "The quick brown fox jumps over the lazy dog\r",
+ doc.getRange().getParagraph(0).text()
+ );
+ }
+
+ /**
+ * Test a simple Word 95 document
+ */
+ public void testWord95() throws Exception {
+ // Can't open as HWPFDocument
+ try {
+ HWPFTestDataSamples.openSampleFile("Word95.doc");
+ fail("Shouldn't be openable");
+ } catch(OldFileFormatException e) {} // intentionally empty: this exception is the outcome the test requires
+
+ // Open
+ HWPFOldDocument doc = HWPFTestDataSamples.openOldSampleFile("Word95.doc");
+
+ // Check
+ assertEquals(1, doc.getRange().numSections());
+ assertEquals(7, doc.getRange().numParagraphs());
+
+ assertEquals(
+ "The quick brown fox jumps over the lazy dog\r",
+ doc.getRange().getParagraph(0).text()
+ );
+ assertEquals("\r", doc.getRange().getParagraph(1).text());
+ assertEquals(
+ "Paragraph 2\r",
+ doc.getRange().getParagraph(2).text()
+ );
+ assertEquals("\r", doc.getRange().getParagraph(3).text());
+ assertEquals(
+ "Paragraph 3. Has some RED text and some " +
+ "BLUE BOLD text in it.\r",
+ doc.getRange().getParagraph(4).text()
+ );
+ assertEquals("\r", doc.getRange().getParagraph(5).text());
+ assertEquals(
+ "Last (4th) paragraph.\r",
+ doc.getRange().getParagraph(6).text()
+ );
+
+ assertEquals(1, doc.getRange().getParagraph(0).numCharacterRuns());
+ assertEquals(1, doc.getRange().getParagraph(1).numCharacterRuns());
+ assertEquals(1, doc.getRange().getParagraph(2).numCharacterRuns());
+ assertEquals(1, doc.getRange().getParagraph(3).numCharacterRuns());
+ // Normal, red, normal, blue+bold, normal
+ assertEquals(5, doc.getRange().getParagraph(4).numCharacterRuns());
+ assertEquals(1, doc.getRange().getParagraph(5).numCharacterRuns());
+ // Normal, superscript for 4th, normal
+ assertEquals(3, doc.getRange().getParagraph(6).numCharacterRuns());
+ }
+
+ /**
+ * Test a word document that has sections,
+ * as well as the usual paragraph stuff.
+ */
+ public void testWord6Sections() throws Exception {
+ HWPFOldDocument doc = HWPFTestDataSamples.openOldSampleFile("Word6_sections.doc");
+
+ assertEquals(3, doc.getRange().numSections());
+ assertEquals(6, doc.getRange().numParagraphs());
+
+ assertEquals(
+ "This is a test.\r",
+ doc.getRange().getParagraph(0).text()
+ );
+ assertEquals("\r", doc.getRange().getParagraph(1).text());
+ assertEquals("\u000c", doc.getRange().getParagraph(2).text()); // Section line?
+ assertEquals("This is a new section.\r", doc.getRange().getParagraph(3).text());
+ assertEquals("\u000c", doc.getRange().getParagraph(4).text()); // Section line?
+ assertEquals("\r", doc.getRange().getParagraph(5).text());
+ }
+}