From: Nick Burch Date: Mon, 13 Feb 2006 12:59:00 +0000 (+0000) Subject: Friendly wrapper on HWPF for extracting text from Word Documents X-Git-Tag: REL_3_0_ALPHA3~178 X-Git-Url: https://source.dussan.org/?a=commitdiff_plain;h=5c8d2a20d11afb30d23a31a95b5c06198302447d;p=poi.git Friendly wrapper on HWPF for extracting text from Word Documents git-svn-id: https://svn.apache.org/repos/asf/jakarta/poi/trunk@377372 13f79535-47bb-0310-9956-ffa450edef68 --- diff --git a/src/scratchpad/testcases/org/apache/poi/hwpf/data/test2.doc b/src/scratchpad/testcases/org/apache/poi/hwpf/data/test2.doc new file mode 100755 index 0000000000..06921df395 Binary files /dev/null and b/src/scratchpad/testcases/org/apache/poi/hwpf/data/test2.doc differ diff --git a/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestDifferentRoutes.java b/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestDifferentRoutes.java new file mode 100644 index 0000000000..23ff64f1a3 --- /dev/null +++ b/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestDifferentRoutes.java @@ -0,0 +1,87 @@ +package org.apache.poi.hwpf.extractor; + +import java.io.FileInputStream; +import java.util.Iterator; + +import org.apache.poi.hwpf.HWPFDocument; +import org.apache.poi.hwpf.model.TextPiece; +import org.apache.poi.hwpf.usermodel.Paragraph; +import org.apache.poi.hwpf.usermodel.Range; + +import junit.framework.TestCase; + +/** + * Test the different routes to extracting text + * + * @author Nick Burch (nick at torchbox dot com) + */ +public class TestDifferentRoutes extends TestCase { + private String[] p_text = new String[] { + "This is a simple word document\r", + "\r", + "It has a number of paragraphs in it\r", + "\r", + "Some of them even feature bold, italic and underlined text\r", + "\r", + "\r", + "This bit is in a different font and size\r", + "\r", + "\r", + "This bit features some red text.\r", + "\r", + "\r", + "It is otherwise very very boring.\r" + }; + + private HWPFDocument doc; + + protected void setUp() throws Exception { + String dirname = System.getProperty("HWPF.testdata.path"); + + String filename = dirname + "/test2.doc"; + doc = new HWPFDocument(new FileInputStream(filename)); + } + + /** + * Test model based extraction + */ + public void testExtractFromModel() { + Range r = doc.getRange(); + + String[] text = new String[r.numParagraphs()]; + for(int i=0; i < r.numParagraphs(); i++) { + Paragraph p = r.getParagraph(i); + text[i] = p.text(); + } + + assertEquals(p_text.length, text.length); + for(int i=0; i flat extraction + */ + public void testGetText() { + assertEquals(p_text1_block, extractor.getText()); + + // On second one, should fall back to text piece + assertEquals(extractor2.getTextFromPieces(), extractor2.getText()); + } + + /** + * Test textPieces based extraction + */ + public void testExtractFromTextPieces() throws Exception { + String text = extractor.getTextFromPieces(); + assertEquals(p_text1_block, text); + } +}