From 5bacacee859d1e15a88ec24cbfe5c1345388cbb6 Mon Sep 17 00:00:00 2001 From: Nick Burch Date: Mon, 13 Feb 2006 12:58:52 +0000 Subject: [PATCH] Friendly wrapper on HWPF for extracting text from Word Documents git-svn-id: https://svn.apache.org/repos/asf/jakarta/poi/trunk@377371 13f79535-47bb-0310-9956-ffa450edef68 --- .../poi/hwpf/extractor/WordExtractor.java | 123 ++++++++++++++++++ 1 file changed, 123 insertions(+) create mode 100644 src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java diff --git a/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java b/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java new file mode 100644 index 0000000000..639fd36ac6 --- /dev/null +++ b/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java @@ -0,0 +1,123 @@ +package org.apache.poi.hwpf.extractor; + +import java.io.IOException; +import java.io.InputStream; +import java.io.UnsupportedEncodingException; +import java.util.Iterator; + +import org.apache.poi.hwpf.HWPFDocument; +import org.apache.poi.hwpf.model.TextPiece; +import org.apache.poi.hwpf.usermodel.Paragraph; +import org.apache.poi.hwpf.usermodel.Range; +import org.apache.poi.poifs.filesystem.POIFSFileSystem; + +/** + * Class to extract the text from a Word Document. + * + * You should use either getParagraphText() or getText() unless + * you have a strong reason otherwise. + * + * @author Nick Burch (nick at torchbox dot com) + */ +public class WordExtractor { + private POIFSFileSystem fs; + private HWPFDocument doc; + + /** + * Create a new Word Extractor + * @param is InputStream containing the word file + */ + public WordExtractor(InputStream is) throws IOException { + this(new POIFSFileSystem(is)); + } + + /** + * Create a new Word Extractor + * @param fs POIFSFileSystem containing the word file + */ + public WordExtractor(POIFSFileSystem fs) throws IOException { + this.fs = fs; + doc = new HWPFDocument(fs); + } + + /** + * Get the text from the word file, as an array with one String + * per paragraph + */ + public String[] getParagraphText() { + String[] ret; + + // Extract using the model code + try { + Range r = doc.getRange(); + + ret = new String[r.numParagraphs()]; + for(int i=0; i paragraph + * mapping is broken. Fast too. + */ + public String getTextFromPieces() { + StringBuffer textBuf = new StringBuffer(); + + Iterator textPieces = doc.getTextTable().getTextPieces().iterator(); + while (textPieces.hasNext()) { + TextPiece piece = (TextPiece) textPieces.next(); + + String encoding = "Cp1252"; + if (piece.usesUnicode()) { + encoding = "UTF-16LE"; + } + try { + String text = new String(piece.getRawBytes(), encoding); + textBuf.append(text); + } catch(UnsupportedEncodingException e) { + throw new InternalError("Standard Encoding " + encoding + " not found, JVM broken"); + } + } + + String text = textBuf.toString(); + + // Fix line endings (Note - won't get all of them + text = text.replaceAll("\r\r\r", "\r\n\r\n\r\n"); + text = text.replaceAll("\r\r", "\r\n\r\n"); + + if(text.endsWith("\r")) { + text += "\n"; + } + + return text; + } + + /* + * Grab the text, based on the paragraphs. Shouldn't include any crud, + * but slightly slower than getTextFromPieces() + */ + public String getText() { + StringBuffer ret = new StringBuffer(); + String[] text = getParagraphText(); + for(int i=0; i