You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

WordToTextConverter.java 16KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455
  1. /* ====================================================================
  2. Licensed to the Apache Software Foundation (ASF) under one or more
  3. contributor license agreements. See the NOTICE file distributed with
  4. this work for additional information regarding copyright ownership.
  5. The ASF licenses this file to You under the Apache License, Version 2.0
  6. (the "License"); you may not use this file except in compliance with
  7. the License. You may obtain a copy of the License at
  8. http://www.apache.org/licenses/LICENSE-2.0
  9. Unless required by applicable law or agreed to in writing, software
  10. distributed under the License is distributed on an "AS IS" BASIS,
  11. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. See the License for the specific language governing permissions and
  13. limitations under the License.
  14. ==================================================================== */
  15. package org.apache.poi.hwpf.converter;
  16. import java.io.File;
  17. import java.io.IOException;
  18. import java.lang.reflect.Method;
  19. import java.util.List;
  20. import java.util.concurrent.atomic.AtomicInteger;
  21. import javax.xml.transform.OutputKeys;
  22. import javax.xml.transform.Transformer;
  23. import javax.xml.transform.dom.DOMSource;
  24. import javax.xml.transform.stream.StreamResult;
  25. import org.apache.commons.io.output.StringBuilderWriter;
  26. import org.apache.logging.log4j.LogManager;
  27. import org.apache.logging.log4j.Logger;
  28. import org.apache.poi.hpsf.SummaryInformation;
  29. import org.apache.poi.hwpf.HWPFDocument;
  30. import org.apache.poi.hwpf.HWPFDocumentCore;
  31. import org.apache.poi.hwpf.usermodel.Bookmark;
  32. import org.apache.poi.hwpf.usermodel.CharacterRun;
  33. import org.apache.poi.hwpf.usermodel.OfficeDrawing;
  34. import org.apache.poi.hwpf.usermodel.Paragraph;
  35. import org.apache.poi.hwpf.usermodel.Picture;
  36. import org.apache.poi.hwpf.usermodel.Range;
  37. import org.apache.poi.hwpf.usermodel.Section;
  38. import org.apache.poi.hwpf.usermodel.Table;
  39. import org.apache.poi.hwpf.usermodel.TableCell;
  40. import org.apache.poi.hwpf.usermodel.TableRow;
  41. import org.apache.poi.poifs.filesystem.DirectoryNode;
  42. import org.apache.poi.poifs.filesystem.Entry;
  43. import org.apache.poi.util.Beta;
  44. import org.apache.poi.util.XMLHelper;
  45. import org.w3c.dom.Document;
  46. import org.w3c.dom.Element;
  47. @Beta
  48. public class WordToTextConverter extends AbstractWordConverter {
  49. private static final Logger LOG = LogManager.getLogger(WordToTextConverter.class);
  50. private static final int MAX_NESTED_CHILD_NODES = 300;
  51. public static String getText( DirectoryNode root ) throws Exception
  52. {
  53. final HWPFDocumentCore wordDocument = AbstractWordUtils.loadDoc( root );
  54. return getText( wordDocument );
  55. }
  56. public static String getText( File docFile ) throws Exception
  57. {
  58. final HWPFDocumentCore wordDocument = AbstractWordUtils
  59. .loadDoc( docFile );
  60. return getText( wordDocument );
  61. }
  62. public static String getText( final HWPFDocumentCore wordDocument )
  63. throws Exception
  64. {
  65. WordToTextConverter wordToTextConverter = new WordToTextConverter(
  66. XMLHelper.newDocumentBuilder().newDocument() );
  67. wordToTextConverter.processDocument( wordDocument );
  68. return wordToTextConverter.getText();
  69. }
  70. /**
  71. * Java main() interface to interact with {@link WordToTextConverter}
  72. *
  73. * <p>
  74. * Usage: WordToTextConverter infile outfile
  75. * </p>
  76. * Where infile is an input .doc file ( Word 95-2007) which will be rendered
  77. * as plain text into outfile
  78. */
  79. public static void main( String[] args ) throws Exception {
  80. if ( args.length < 2 )
  81. {
  82. System.err.println( "Usage: WordToTextConverter <inputFile.doc> <saveTo.txt>" );
  83. return;
  84. }
  85. System.out.println( "Converting " + args[0] );
  86. System.out.println( "Saving output to " + args[1] );
  87. Document doc = WordToTextConverter.process( new File( args[0] ) );
  88. DOMSource domSource = new DOMSource( doc );
  89. StreamResult streamResult = new StreamResult( new File( args[1] ) );
  90. Transformer serializer = XMLHelper.newTransformer();
  91. // TODO set encoding from a command argument
  92. serializer.setOutputProperty( OutputKeys.METHOD, "text" );
  93. serializer.transform( domSource, streamResult );
  94. }
  95. private static Document process( File docFile ) throws IOException {
  96. try (final HWPFDocumentCore wordDocument = AbstractWordUtils.loadDoc( docFile )) {
  97. WordToTextConverter wordToTextConverter = new WordToTextConverter(
  98. XMLHelper.newDocumentBuilder().newDocument());
  99. wordToTextConverter.processDocument(wordDocument);
  100. return wordToTextConverter.getDocument();
  101. }
  102. }
  103. private final AtomicInteger noteCounters = new AtomicInteger( 1 );
  104. private Element notes;
  105. private boolean outputSummaryInformation;
  106. private final TextDocumentFacade textDocumentFacade;
  /**
   * Creates a new {@link WordToTextConverter} that stores its output in a newly
   * created DOM document. One instance can be used to output several
   * {@link HWPFDocument}s into a single text document.
   */
  public WordToTextConverter() {
    this(XMLHelper.newDocumentBuilder().newDocument());
  }
  /**
   * Creates a new {@link WordToTextConverter} writing into the given DOM
   * document. One instance can be used to output several {@link HWPFDocument}s
   * into a single text document.
   *
   * @param document XML DOM document used as storage for the text pieces
   */
  @SuppressWarnings("WeakerAccess")
  public WordToTextConverter(Document document) {
    // wrap the caller's document in a facade; all output goes through it
    this.textDocumentFacade = new TextDocumentFacade(document);
  }
  /**
   * Creates a new {@link WordToTextConverter} that writes through the supplied facade.
   *
   * @param textDocumentFacade facade used as-is for all generated output
   */
  @SuppressWarnings("unused")
  public WordToTextConverter(TextDocumentFacade textDocumentFacade) {
    this.textDocumentFacade = textDocumentFacade;
  }
  133. @Override
  134. protected void afterProcess()
  135. {
  136. if ( notes != null )
  137. textDocumentFacade.getBody().appendChild( notes );
  138. }
  139. public Document getDocument()
  140. {
  141. return textDocumentFacade.getDocument();
  142. }
  143. public String getText() throws Exception
  144. {
  145. StringBuilderWriter stringWriter = new StringBuilderWriter(1024);
  146. DOMSource domSource = new DOMSource( getDocument() );
  147. StreamResult streamResult = new StreamResult( stringWriter );
  148. Transformer serializer = XMLHelper.newTransformer();
  149. // TODO set encoding from a command argument
  150. serializer.setOutputProperty( OutputKeys.METHOD, "text" );
  151. serializer.transform( domSource, streamResult );
  152. return stringWriter.toString();
  153. }
  154. @SuppressWarnings("WeakerAccess")
  155. public boolean isOutputSummaryInformation()
  156. {
  157. return outputSummaryInformation;
  158. }
  159. @Override
  160. protected void outputCharacters( Element block, CharacterRun characterRun,
  161. String text )
  162. {
  163. block.appendChild( textDocumentFacade.createText( text ) );
  164. }
  165. @Override
  166. protected void processBookmarks( HWPFDocumentCore wordDocument,
  167. Element currentBlock, Range range, int currentTableLevel,
  168. List<Bookmark> rangeBookmarks )
  169. {
  170. processCharacters( wordDocument, currentTableLevel, range, currentBlock );
  171. }
  172. @Override
  173. protected void processDocumentInformation(
  174. SummaryInformation summaryInformation )
  175. {
  176. if ( isOutputSummaryInformation() )
  177. {
  178. if ( AbstractWordUtils.isNotEmpty( summaryInformation.getTitle() ) )
  179. textDocumentFacade.setTitle( summaryInformation.getTitle() );
  180. if ( AbstractWordUtils.isNotEmpty( summaryInformation.getAuthor() ) )
  181. textDocumentFacade.addAuthor( summaryInformation.getAuthor() );
  182. if ( AbstractWordUtils
  183. .isNotEmpty( summaryInformation.getComments() ) )
  184. textDocumentFacade.addDescription( summaryInformation
  185. .getComments() );
  186. if ( AbstractWordUtils
  187. .isNotEmpty( summaryInformation.getKeywords() ) )
  188. textDocumentFacade.addKeywords( summaryInformation
  189. .getKeywords() );
  190. }
  191. }
  192. @Override
  193. public void processDocumentPart( HWPFDocumentCore wordDocument, Range range )
  194. {
  195. super.processDocumentPart( wordDocument, range );
  196. afterProcess();
  197. }
  198. @Override
  199. protected void processDrawnObject( HWPFDocument doc,
  200. CharacterRun characterRun, OfficeDrawing officeDrawing,
  201. String path, Element block )
  202. {
  203. // ignore
  204. }
  205. @Override
  206. protected void processEndnoteAutonumbered( HWPFDocument wordDocument,
  207. int noteIndex, Element block, Range endnoteTextRange )
  208. {
  209. processNote( wordDocument, block, endnoteTextRange );
  210. }
  211. @Override
  212. protected void processFootnoteAutonumbered( HWPFDocument wordDocument,
  213. int noteIndex, Element block, Range footnoteTextRange )
  214. {
  215. processNote( wordDocument, block, footnoteTextRange );
  216. }
  217. @Override
  218. protected void processHyperlink( HWPFDocumentCore wordDocument,
  219. Element currentBlock, Range textRange, int currentTableLevel,
  220. String hyperlink )
  221. {
  222. processCharacters( wordDocument, currentTableLevel, textRange,
  223. currentBlock );
  224. currentBlock.appendChild( textDocumentFacade.createText( " ("
  225. + UNICODECHAR_ZERO_WIDTH_SPACE
  226. + hyperlink.replace( "/", UNICODECHAR_ZERO_WIDTH_SPACE
  227. + "\\/" + UNICODECHAR_ZERO_WIDTH_SPACE )
  228. + UNICODECHAR_ZERO_WIDTH_SPACE + ")" ) );
  229. }
  230. @Override
  231. protected void processImage( Element currentBlock, boolean inlined,
  232. Picture picture )
  233. {
  234. // ignore
  235. }
  236. @Override
  237. protected void processImage( Element currentBlock, boolean inlined,
  238. Picture picture, String url )
  239. {
  240. // ignore
  241. }
  242. @Override
  243. protected void processImageWithoutPicturesManager( Element currentBlock,
  244. boolean inlined, Picture picture )
  245. {
  246. // ignore
  247. }
  248. @Override
  249. protected void processLineBreak( Element block, CharacterRun characterRun )
  250. {
  251. block.appendChild( textDocumentFacade.createText( "\n" ) );
  252. }
  253. private void processNote( HWPFDocument wordDocument, Element block, Range noteTextRange ) {
  254. final int noteIndex = noteCounters.getAndIncrement();
  255. block.appendChild( textDocumentFacade
  256. .createText( UNICODECHAR_ZERO_WIDTH_SPACE + "[" + noteIndex
  257. + "]" + UNICODECHAR_ZERO_WIDTH_SPACE ) );
  258. if ( notes == null )
  259. notes = textDocumentFacade.createBlock();
  260. Element note = textDocumentFacade.createBlock();
  261. notes.appendChild( note );
  262. // avoid StackOverflowException with very deeply nested files (mostly synthetic test files via fuzzing)
  263. // if this limit is reached in real-word documents, we can make this configurable
  264. if (note.getParentNode() != null && note.getParentNode().getChildNodes().getLength() > MAX_NESTED_CHILD_NODES) {
  265. throw new IllegalStateException("Had more than the limit of " + MAX_NESTED_CHILD_NODES + " nested child notes");
  266. }
  267. note.appendChild( textDocumentFacade.createText( "^" + noteIndex
  268. + "\t " ) );
  269. processCharacters( wordDocument, Integer.MIN_VALUE, noteTextRange, note );
  270. note.appendChild( textDocumentFacade.createText( "\n" ) );
  271. }
  272. @Override
  273. protected boolean processOle2( HWPFDocument wordDocument, Element block,
  274. Entry entry ) throws Exception
  275. {
  276. if ( !( entry instanceof DirectoryNode ) )
  277. return false;
  278. DirectoryNode directoryNode = (DirectoryNode) entry;
  279. /*
  280. * even if there is no ExtractorFactory in classpath, still support
  281. * included Word's objects
  282. */
  283. if ( directoryNode.hasEntry( "WordDocument" ) )
  284. {
  285. String text = WordToTextConverter.getText( (DirectoryNode) entry );
  286. block.appendChild( textDocumentFacade
  287. .createText( UNICODECHAR_ZERO_WIDTH_SPACE + text
  288. + UNICODECHAR_ZERO_WIDTH_SPACE ) );
  289. return true;
  290. }
  291. Object extractor;
  292. try
  293. {
  294. Class<?> cls = Class
  295. .forName( "org.apache.poi.extractor.ExtractorFactory" );
  296. Method createExtractor = cls.getMethod( "createExtractor",
  297. DirectoryNode.class );
  298. extractor = createExtractor.invoke( null, directoryNode );
  299. }
  300. catch ( Exception exc )
  301. {
  302. // no extractor in classpath
  303. LOG.atWarn().withThrowable(exc).log("There is an OLE object entry '{}', but there is no text " +
  304. "extractor for this object type or text extractor factory is not available", entry.getName());
  305. return false;
  306. }
  307. try
  308. {
  309. Method getText = extractor.getClass().getMethod( "getText" );
  310. String text = (String) getText.invoke( extractor );
  311. block.appendChild( textDocumentFacade
  312. .createText( UNICODECHAR_ZERO_WIDTH_SPACE + text
  313. + UNICODECHAR_ZERO_WIDTH_SPACE ) );
  314. return true;
  315. }
  316. catch ( Exception exc )
  317. {
  318. LOG.atError().withThrowable(exc).log("Unable to extract text from OLE entry '{}'", entry.getName());
  319. return false;
  320. }
  321. }
  322. @Override
  323. protected void processPageBreak( HWPFDocumentCore wordDocument, Element flow )
  324. {
  325. Element block = textDocumentFacade.createBlock();
  326. block.appendChild( textDocumentFacade.createText( "\n" ) );
  327. flow.appendChild( block );
  328. }
  329. @Override
  330. protected void processPageref( HWPFDocumentCore wordDocument,
  331. Element currentBlock, Range textRange, int currentTableLevel,
  332. String pageref )
  333. {
  334. processCharacters( wordDocument, currentTableLevel, textRange,
  335. currentBlock );
  336. }
  337. @Override
  338. protected void processParagraph( HWPFDocumentCore wordDocument,
  339. Element parentElement, int currentTableLevel, Paragraph paragraph,
  340. String bulletText )
  341. {
  342. Element pElement = textDocumentFacade.createParagraph();
  343. pElement.appendChild( textDocumentFacade.createText( bulletText ) );
  344. processCharacters( wordDocument, currentTableLevel, paragraph, pElement );
  345. pElement.appendChild( textDocumentFacade.createText( "\n" ) );
  346. parentElement.appendChild( pElement );
  347. }
  348. @Override
  349. protected void processSection( HWPFDocumentCore wordDocument,
  350. Section section, int s )
  351. {
  352. Element sectionElement = textDocumentFacade.createBlock();
  353. processParagraphes( wordDocument, sectionElement, section,
  354. Integer.MIN_VALUE );
  355. sectionElement.appendChild( textDocumentFacade.createText( "\n" ) );
  356. textDocumentFacade.body.appendChild( sectionElement );
  357. }
  358. protected void processTable( HWPFDocumentCore wordDocument, Element flow,
  359. Table table )
  360. {
  361. final int tableRows = table.numRows();
  362. for ( int r = 0; r < tableRows; r++ )
  363. {
  364. TableRow tableRow = table.getRow( r );
  365. Element tableRowElement = textDocumentFacade.createTableRow();
  366. final int rowCells = tableRow.numCells();
  367. for ( int c = 0; c < rowCells; c++ )
  368. {
  369. TableCell tableCell = tableRow.getCell( c );
  370. Element tableCellElement = textDocumentFacade.createTableCell();
  371. if ( c != 0 )
  372. tableCellElement.appendChild( textDocumentFacade
  373. .createText( "\t" ) );
  374. processCharacters( wordDocument, table.getTableLevel(),
  375. tableCell, tableCellElement );
  376. tableRowElement.appendChild( tableCellElement );
  377. }
  378. tableRowElement.appendChild( textDocumentFacade.createText( "\n" ) );
  379. flow.appendChild( tableRowElement );
  380. }
  381. }
  382. @SuppressWarnings("unused")
  383. public void setOutputSummaryInformation(boolean outputDocumentInformation )
  384. {
  385. this.outputSummaryInformation = outputDocumentInformation;
  386. }
  387. }