You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

WordExtractor.java 8.9KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294
  1. /* ====================================================================
  2. Licensed to the Apache Software Foundation (ASF) under one or more
  3. contributor license agreements. See the NOTICE file distributed with
  4. this work for additional information regarding copyright ownership.
  5. The ASF licenses this file to You under the Apache License, Version 2.0
  6. (the "License"); you may not use this file except in compliance with
  7. the License. You may obtain a copy of the License at
  8. http://www.apache.org/licenses/LICENSE-2.0
  9. Unless required by applicable law or agreed to in writing, software
  10. distributed under the License is distributed on an "AS IS" BASIS,
  11. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. See the License for the specific language governing permissions and
  13. limitations under the License.
  14. ==================================================================== */
  15. package org.apache.poi.hwpf.extractor;
  16. import java.io.FileInputStream;
  17. import java.io.IOException;
  18. import java.io.InputStream;
  19. import org.apache.poi.extractor.POIOLE2TextExtractor;
  20. import org.apache.poi.hwpf.HWPFDocument;
  21. import org.apache.poi.hwpf.HWPFDocumentCore;
  22. import org.apache.poi.hwpf.converter.WordToTextConverter;
  23. import org.apache.poi.hwpf.usermodel.HeaderStories;
  24. import org.apache.poi.hwpf.usermodel.Paragraph;
  25. import org.apache.poi.hwpf.usermodel.Range;
  26. import org.apache.poi.poifs.filesystem.DirectoryNode;
  27. import org.apache.poi.poifs.filesystem.POIFSFileSystem;
  28. /**
  29. * Class to extract the text from a Word Document.
  30. *
  31. * You should use either getParagraphText() or getText() unless you have a
  32. * strong reason otherwise.
  33. *
  34. * @author Nick Burch
  35. */
  36. public final class WordExtractor extends POIOLE2TextExtractor {
  37. private HWPFDocument doc;
  38. /**
  39. * Create a new Word Extractor
  40. *
  41. * @param is
  42. * InputStream containing the word file
  43. */
  44. public WordExtractor( InputStream is ) throws IOException {
  45. this(HWPFDocumentCore.verifyAndBuildPOIFS(is ) );
  46. }
  47. /**
  48. * Create a new Word Extractor
  49. *
  50. * @param fs
  51. * POIFSFileSystem containing the word file
  52. */
  53. public WordExtractor( POIFSFileSystem fs ) throws IOException {
  54. this( new HWPFDocument( fs ) );
  55. }
  56. public WordExtractor( DirectoryNode dir ) throws IOException {
  57. this( new HWPFDocument( dir ) );
  58. }
  59. /**
  60. * Create a new Word Extractor
  61. *
  62. * @param doc
  63. * The HWPFDocument to extract from
  64. */
  65. public WordExtractor( HWPFDocument doc ) {
  66. super( doc );
  67. this.doc = doc;
  68. }
  69. /**
  70. * Command line extractor, so people will stop moaning that they can't just
  71. * run this.
  72. */
  73. public static void main( String[] args ) throws IOException {
  74. if ( args.length == 0 ) {
  75. System.err.println( "Use:" );
  76. System.err
  77. .println( " java org.apache.poi.hwpf.extractor.WordExtractor <filename>" );
  78. System.exit( 1 );
  79. }
  80. // Process the first argument as a file
  81. InputStream fin = new FileInputStream( args[0] );
  82. try (WordExtractor extractor = new WordExtractor(fin)) {
  83. System.out.println(extractor.getText());
  84. }
  85. }
  86. /**
  87. * Get the text from the word file, as an array with one String per
  88. * paragraph
  89. */
  90. public String[] getParagraphText() {
  91. String[] ret;
  92. // Extract using the model code
  93. try {
  94. Range r = doc.getRange();
  95. ret = getParagraphText( r );
  96. } catch ( Exception e ) {
  97. // Something's up with turning the text pieces into paragraphs
  98. // Fall back to ripping out the text pieces
  99. ret = new String[1];
  100. ret[0] = getTextFromPieces();
  101. }
  102. return ret;
  103. }
  104. public String[] getFootnoteText() {
  105. Range r = doc.getFootnoteRange();
  106. return getParagraphText( r );
  107. }
  108. public String[] getMainTextboxText() {
  109. Range r = doc.getMainTextboxRange();
  110. return getParagraphText( r );
  111. }
  112. public String[] getEndnoteText() {
  113. Range r = doc.getEndnoteRange();
  114. return getParagraphText( r );
  115. }
  116. public String[] getCommentsText() {
  117. Range r = doc.getCommentsRange();
  118. return getParagraphText( r );
  119. }
  120. protected static String[] getParagraphText( Range r ) {
  121. String[] ret;
  122. ret = new String[r.numParagraphs()];
  123. for ( int i = 0; i < ret.length; i++ ) {
  124. Paragraph p = r.getParagraph( i );
  125. ret[i] = p.text();
  126. // Fix the line ending
  127. if ( ret[i].endsWith( "\r" )) {
  128. ret[i] = ret[i] + "\n";
  129. }
  130. }
  131. return ret;
  132. }
  133. /**
  134. * Add the header/footer text, if it's not empty
  135. */
  136. private void appendHeaderFooter( String text, StringBuilder out ) {
  137. if ( text == null || text.length() == 0 )
  138. return;
  139. text = text.replace( '\r', '\n' );
  140. if ( !text.endsWith( "\n" ))
  141. {
  142. out.append( text );
  143. out.append( '\n' );
  144. return;
  145. }
  146. if ( text.endsWith( "\n\n" ))
  147. {
  148. out.append(text, 0, text.length() - 1);
  149. return;
  150. }
  151. out.append( text );
  152. }
  153. /**
  154. * Grab the text from the headers
  155. * @deprecated 3.8 beta 4
  156. */
  157. @Deprecated
  158. public String getHeaderText() {
  159. HeaderStories hs = new HeaderStories( doc );
  160. StringBuilder ret = new StringBuilder();
  161. if ( hs.getFirstHeader() != null ) {
  162. appendHeaderFooter( hs.getFirstHeader(), ret );
  163. }
  164. if ( hs.getEvenHeader() != null ) {
  165. appendHeaderFooter( hs.getEvenHeader(), ret );
  166. }
  167. if ( hs.getOddHeader() != null ) {
  168. appendHeaderFooter( hs.getOddHeader(), ret );
  169. }
  170. return ret.toString();
  171. }
  172. /**
  173. * Grab the text from the footers
  174. * @deprecated 3.8 beta 4
  175. */
  176. @Deprecated
  177. public String getFooterText() {
  178. HeaderStories hs = new HeaderStories( doc );
  179. StringBuilder ret = new StringBuilder();
  180. if ( hs.getFirstFooter() != null ) {
  181. appendHeaderFooter( hs.getFirstFooter(), ret );
  182. }
  183. if ( hs.getEvenFooter() != null ) {
  184. appendHeaderFooter( hs.getEvenFooter(), ret );
  185. }
  186. if ( hs.getOddFooter() != null ) {
  187. appendHeaderFooter( hs.getOddFooter(), ret );
  188. }
  189. return ret.toString();
  190. }
  191. /**
  192. * Grab the text out of the text pieces. Might also include various bits of
  193. * crud, but will work in cases where the text piece -> paragraph mapping is
  194. * broken. Fast too.
  195. */
  196. public String getTextFromPieces() {
  197. String text = doc.getDocumentText();
  198. // Fix line endings (Note - won't get all of them
  199. text = text.replaceAll( "\r\r\r", "\r\n\r\n\r\n" );
  200. text = text.replaceAll( "\r\r", "\r\n\r\n" );
  201. if ( text.endsWith( "\r" )) {
  202. text += "\n";
  203. }
  204. return text;
  205. }
  206. /**
  207. * Grab the text, based on the WordToTextConverter. Shouldn't include any
  208. * crud, but slower than getTextFromPieces().
  209. */
  210. public String getText() {
  211. try {
  212. WordToTextConverter wordToTextConverter = new WordToTextConverter();
  213. HeaderStories hs = new HeaderStories(doc);
  214. if (hs.getFirstHeaderSubrange() != null)
  215. wordToTextConverter.processDocumentPart(doc,
  216. hs.getFirstHeaderSubrange());
  217. if (hs.getEvenHeaderSubrange() != null)
  218. wordToTextConverter.processDocumentPart(doc,
  219. hs.getEvenHeaderSubrange());
  220. if (hs.getOddHeaderSubrange() != null)
  221. wordToTextConverter.processDocumentPart(doc,
  222. hs.getOddHeaderSubrange());
  223. wordToTextConverter.processDocument(doc);
  224. wordToTextConverter.processDocumentPart(doc,
  225. doc.getMainTextboxRange());
  226. if (hs.getFirstFooterSubrange() != null)
  227. wordToTextConverter.processDocumentPart(doc,
  228. hs.getFirstFooterSubrange());
  229. if (hs.getEvenFooterSubrange() != null)
  230. wordToTextConverter.processDocumentPart(doc,
  231. hs.getEvenFooterSubrange());
  232. if (hs.getOddFooterSubrange() != null)
  233. wordToTextConverter.processDocumentPart(doc,
  234. hs.getOddFooterSubrange());
  235. return wordToTextConverter.getText();
  236. } catch (RuntimeException e) {
  237. throw e;
  238. } catch ( Exception exc ) {
  239. throw new RuntimeException( exc );
  240. }
  241. }
  242. /**
  243. * Removes any fields (eg macros, page markers etc) from the string.
  244. */
  245. public static String stripFields( String text )
  246. {
  247. return Range.stripFields( text );
  248. }
  249. }