Friendly wrapper on HWPF for extracting text from Word Documents

author Nick Burch <nick@apache.org>

Mon, 13 Feb 2006 12:59:00 +0000 (12:59 +0000)

committer Nick Burch <nick@apache.org>

Mon, 13 Feb 2006 12:59:00 +0000 (12:59 +0000)
author Nick Burch <nick@apache.org>
Mon, 13 Feb 2006 12:59:00 +0000 (12:59 +0000)
committer Nick Burch <nick@apache.org>
Mon, 13 Feb 2006 12:59:00 +0000 (12:59 +0000)
diff --git a/src/scratchpad/testcases/org/apache/poi/hwpf/data/test2.doc b/src/scratchpad/testcases/org/apache/poi/hwpf/data/test2.doc

new file mode 100755 (executable)

index 0000000..06921df

Binary files /dev/null and b/src/scratchpad/testcases/org/apache/poi/hwpf/data/test2.doc differ
diff --git a/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestDifferentRoutes.java b/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestDifferentRoutes.java

new file mode 100644 (file)

index 0000000..23ff64f
--- /dev/null
+++ b/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestDifferentRoutes.java
@@ -0,0 +1,87 @@
+package org.apache.poi.hwpf.extractor;
+
+import java.io.FileInputStream;
+import java.util.Iterator;
+
+import org.apache.poi.hwpf.HWPFDocument;
+import org.apache.poi.hwpf.model.TextPiece;
+import org.apache.poi.hwpf.usermodel.Paragraph;
+import org.apache.poi.hwpf.usermodel.Range;
+
+import junit.framework.TestCase;
+
+/**
+ * Test the different routes to extracting text
+ *
+ * @author Nick Burch (nick at torchbox dot com)
+ */
+public class TestDifferentRoutes extends TestCase {
+       private String[] p_text = new String[] {
+                       "This is a simple word document\r",
+                       "\r",
+                       "It has a number of paragraphs in it\r",
+                       "\r",
+                       "Some of them even feature bold, italic and underlined text\r",
+                       "\r",
+                       "\r",
+                       "This bit is in a different font and size\r",
+                       "\r",
+                       "\r",
+                       "This bit features some red text.\r",
+                       "\r",
+                       "\r",
+                       "It is otherwise very very boring.\r"
+       };
+       
+       private HWPFDocument doc;
+       
+    protected void setUp() throws Exception {
+               String dirname = System.getProperty("HWPF.testdata.path");
+               
+               String filename = dirname + "/test2.doc";
+               doc = new HWPFDocument(new FileInputStream(filename));
+    }                  
+    
+    /**
+     * Test model based extraction
+     */
+    public void testExtractFromModel() {
+       Range r = doc.getRange();
+       
+       String[] text = new String[r.numParagraphs()];
+       for(int i=0; i < r.numParagraphs(); i++) {
+               Paragraph p = r.getParagraph(i);
+               text[i] = p.text();
+       }
+       
+       assertEquals(p_text.length, text.length);
+       for(int i=0; i<p_text.length; i++) {
+               assertEquals(p_text[i], text[i]);
+       }
+    }
+    
+    /**
+     * Test textPieces based extraction
+     */
+    public void testExtractFromTextPieces() throws Exception {
+       StringBuffer textBuf = new StringBuffer();
+       
+       Iterator textPieces = doc.getTextTable().getTextPieces().iterator();
+       while (textPieces.hasNext()) {
+               TextPiece piece = (TextPiece) textPieces.next();
+
+               String encoding = "Cp1252";
+               if (piece.usesUnicode()) {
+                       encoding = "UTF-16LE";
+               }
+               String text = new String(piece.getRawBytes(), encoding);
+               textBuf.append(text);
+       }
+       
+       StringBuffer exp = new StringBuffer();
+       for(int i=0; i<p_text.length; i++) {
+               exp.append(p_text[i]);
+       }
+       assertEquals(exp.toString(), textBuf.toString());
+    }
+}
diff --git a/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordExtractor.java b/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordExtractor.java

new file mode 100644 (file)

index 0000000..2ea8c98
--- /dev/null
+++ b/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordExtractor.java
@@ -0,0 +1,88 @@
+package org.apache.poi.hwpf.extractor;
+
+import java.io.FileInputStream;
+import java.util.Iterator;
+
+import org.apache.poi.hwpf.HWPFDocument;
+import org.apache.poi.hwpf.model.TextPiece;
+import org.apache.poi.hwpf.usermodel.Paragraph;
+import org.apache.poi.hwpf.usermodel.Range;
+
+import junit.framework.TestCase;
+
+/**
+ * Test the different routes to extracting text
+ *
+ * @author Nick Burch (nick at torchbox dot com)
+ */
+public class TestWordExtractor extends TestCase {
+       private String[] p_text1 = new String[] {
+                       "This is a simple word document\r\n",
+                       "\r\n",
+                       "It has a number of paragraphs in it\r\n",
+                       "\r\n",
+                       "Some of them even feature bold, italic and underlined text\r\n",
+                       "\r\n",
+                       "\r\n",
+                       "This bit is in a different font and size\r\n",
+                       "\r\n",
+                       "\r\n",
+                       "This bit features some red text.\r\n",
+                       "\r\n",
+                       "\r\n",
+                       "It is otherwise very very boring.\r\n"
+       };
+       private String p_text1_block = new String();
+               
+       // Well behaved document
+       private WordExtractor extractor;
+       // Corrupted document - can't do paragraph based stuff
+       private WordExtractor extractor2;
+       
+    protected void setUp() throws Exception {
+               String dirname = System.getProperty("HWPF.testdata.path");
+               
+               String filename = dirname + "/test2.doc";
+               String filename2 = dirname + "/test.doc";
+               extractor = new WordExtractor(new FileInputStream(filename));
+               extractor2 = new WordExtractor(new FileInputStream(filename2));
+               
+               // Build splat'd out text version
+               for(int i=0; i<p_text1.length; i++) {
+                       p_text1_block += p_text1[i];
+               }
+    }                  
+    
+    /**
+     * Test paragraph based extraction
+     */
+    public void testExtractFromParagraphs() {
+       String[] text = extractor.getParagraphText();
+       
+       assertEquals(p_text1.length, text.length);
+       for(int i=0; i<p_text1.length; i++) {
+               assertEquals(p_text1[i], text[i]);
+       }
+       
+       // On second one, should fall back
+       assertEquals(1, extractor2.getParagraphText().length);
+    }
+    
+    /**
+     * Test the paragraph -> flat extraction
+     */
+    public void testGetText() {
+       assertEquals(p_text1_block, extractor.getText());
+       
+       // On second one, should fall back to text piece
+       assertEquals(extractor2.getTextFromPieces(), extractor2.getText());
+    }
+    
+    /**
+     * Test textPieces based extraction
+     */
+    public void testExtractFromTextPieces() throws Exception {
+       String text = extractor.getTextFromPieces();
+       assertEquals(p_text1_block, text);
+    }
+}
author	Nick Burch <nick@apache.org>
	Mon, 13 Feb 2006 12:59:00 +0000 (12:59 +0000)
committer	Nick Burch <nick@apache.org>
	Mon, 13 Feb 2006 12:59:00 +0000 (12:59 +0000)
src/scratchpad/testcases/org/apache/poi/hwpf/data/test2.doc	[new file with mode: 0755]	patch \| blob
src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestDifferentRoutes.java	[new file with mode: 0644]	patch \| blob
src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordExtractor.java	[new file with mode: 0644]	patch \| blob