--- /dev/null
+package org.apache.poi.hwpf.extractor;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.UnsupportedEncodingException;
+import java.util.Iterator;
+import org.apache.poi.hwpf.HWPFDocument;
+import org.apache.poi.hwpf.model.TextPiece;
+import org.apache.poi.hwpf.usermodel.Paragraph;
+import org.apache.poi.hwpf.usermodel.Range;
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;
+/**
+ * Class to extract the text from a Word Document.
+ *
+ * You should use either getParagraphText() or getText() unless
+ * you have a strong reason otherwise.
+ *
+ * @author Nick Burch (nick at torchbox dot com)
+ */
+public class WordExtractor {
+ // File system handed to the constructor; kept alongside the document built from it.
+ private POIFSFileSystem fs;
+ // Document model created from fs in the constructor; all extraction methods read from it.
+ private HWPFDocument doc;
+ /**
+ * Create a new Word Extractor from a stream.
+ * The stream is wrapped in a new POIFSFileSystem and construction is
+ * delegated to the POIFSFileSystem based constructor.
+ * @param is InputStream containing the word file
+ * @throws IOException propagated from building the POIFSFileSystem or
+ * from the delegated constructor
+ */
+ public WordExtractor(InputStream is) throws IOException {
+ this(new POIFSFileSystem(is));
+ }
+ /**
+  * Create a new Word Extractor on top of an already opened file system.
+  * The file system is remembered and an HWPFDocument is built from it.
+  * @param fs POIFSFileSystem containing the word file
+  * @throws IOException presumably raised by the HWPFDocument constructor
+  *  when the document cannot be read (it is the only call made here)
+  */
+ public WordExtractor(POIFSFileSystem fs) throws IOException {
+     this.fs = fs;
+     this.doc = new HWPFDocument(fs);
+ }
+ /**
+ * Get the text from the word file, as an array with one String
+ * per paragraph
+ */
+ public String[] getParagraphText() {
+ String[] ret;
+ // Extract using the model code
+ try {
+ Range r = doc.getRange();
+ ret = new String[r.numParagraphs()];
+ for(int i=0; i<ret.length; i++) {
+ Paragraph p = r.getParagraph(i);
+ ret[i] = p.text();
+ // Fix the line ending
+ if(ret[i].endsWith("\r")) {
+ ret[i] = ret[i] + "\n";
+ }
+ }
+ } catch(Exception e) {
+ // Something's up with turning the text pieces into paragraphs
+ // Fall back to ripping out the text pieces
+ ret = new String[1];
+ ret[0] = getTextFromPieces();
+ }
+ return ret;
+ }
+ /**
+  * Grab the text out of the text pieces. Might also include various
+  *  bits of crud, but will work in cases where the text piece -> paragraph
+  *  mapping is broken. Fast too.
+  *
+  * The raw bytes of each piece are decoded as UTF-16LE when the piece
+  *  reports that it uses unicode, and as Cp1252 otherwise. Afterwards
+  *  every carriage return that is not already followed by a line feed is
+  *  expanded to "\r\n", so the line endings are uniform throughout.
+  *
+  * @return the concatenated, line-ending-fixed text of all text pieces
+  * @throws InternalError if the JVM lacks one of the two encodings
+  */
+ public String getTextFromPieces() {
+     StringBuffer textBuf = new StringBuffer();
+     Iterator textPieces = doc.getTextTable().getTextPieces().iterator();
+     while (textPieces.hasNext()) {
+         TextPiece piece = (TextPiece) textPieces.next();
+         String encoding = piece.usesUnicode() ? "UTF-16LE" : "Cp1252";
+         try {
+             textBuf.append(new String(piece.getRawBytes(), encoding));
+         } catch (UnsupportedEncodingException e) {
+             throw new InternalError("Standard Encoding " + encoding + " not found, JVM broken");
+         }
+     }
+     // Fix line endings: turn every CR that is not already part of a CRLF
+     // into CRLF. The negative lookahead leaves existing CRLF pairs alone,
+     // and also covers isolated CRs, runs of any length and a trailing CR,
+     // which the earlier fixed-length replacements missed.
+     return textBuf.toString().replaceAll("\r(?!\n)", "\r\n");
+ }
+ /**
+  * Grab the text, based on the paragraphs. Shouldn't include any crud,
+  *  but slightly slower than getTextFromPieces().
+  *
+  * @return every entry of getParagraphText() joined together in order,
+  *  with nothing inserted between the entries
+  */
+ public String getText() {
+     String[] paragraphs = getParagraphText();
+     StringBuffer joined = new StringBuffer();
+     int idx = 0;
+     while (idx < paragraphs.length) {
+         joined.append(paragraphs[idx]);
+         idx++;
+     }
+     return joined.toString();
+ }