You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

WordToTextConverter.java 17KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467
  1. /* ====================================================================
  2. Licensed to the Apache Software Foundation (ASF) under one or more
  3. contributor license agreements. See the NOTICE file distributed with
  4. this work for additional information regarding copyright ownership.
  5. The ASF licenses this file to You under the Apache License, Version 2.0
  6. (the "License"); you may not use this file except in compliance with
  7. the License. You may obtain a copy of the License at
  8. http://www.apache.org/licenses/LICENSE-2.0
  9. Unless required by applicable law or agreed to in writing, software
  10. distributed under the License is distributed on an "AS IS" BASIS,
  11. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. See the License for the specific language governing permissions and
  13. limitations under the License.
  14. ==================================================================== */
  15. package org.apache.poi.hwpf.converter;
  16. import java.io.File;
  17. import java.io.IOException;
  18. import java.io.StringWriter;
  19. import java.lang.reflect.Method;
  20. import java.util.List;
  21. import java.util.concurrent.atomic.AtomicInteger;
  22. import javax.xml.parsers.DocumentBuilder;
  23. import javax.xml.parsers.DocumentBuilderFactory;
  24. import javax.xml.parsers.ParserConfigurationException;
  25. import javax.xml.transform.OutputKeys;
  26. import javax.xml.transform.Transformer;
  27. import javax.xml.transform.TransformerFactory;
  28. import javax.xml.transform.dom.DOMSource;
  29. import javax.xml.transform.stream.StreamResult;
  30. import org.apache.poi.hpsf.SummaryInformation;
  31. import org.apache.poi.hwpf.HWPFDocument;
  32. import org.apache.poi.hwpf.HWPFDocumentCore;
  33. import org.apache.poi.hwpf.usermodel.Bookmark;
  34. import org.apache.poi.hwpf.usermodel.CharacterRun;
  35. import org.apache.poi.hwpf.usermodel.OfficeDrawing;
  36. import org.apache.poi.hwpf.usermodel.Paragraph;
  37. import org.apache.poi.hwpf.usermodel.Picture;
  38. import org.apache.poi.hwpf.usermodel.Range;
  39. import org.apache.poi.hwpf.usermodel.Section;
  40. import org.apache.poi.hwpf.usermodel.Table;
  41. import org.apache.poi.hwpf.usermodel.TableCell;
  42. import org.apache.poi.hwpf.usermodel.TableRow;
  43. import org.apache.poi.poifs.filesystem.DirectoryNode;
  44. import org.apache.poi.poifs.filesystem.Entry;
  45. import org.apache.poi.util.Beta;
  46. import org.apache.poi.util.POILogFactory;
  47. import org.apache.poi.util.POILogger;
  48. import org.apache.poi.util.XMLHelper;
  49. import org.w3c.dom.Document;
  50. import org.w3c.dom.Element;
  51. @Beta
  52. public class WordToTextConverter extends AbstractWordConverter
  53. {
  54. private static final POILogger logger = POILogFactory
  55. .getLogger( WordToTextConverter.class );
  56. public static String getText( DirectoryNode root ) throws Exception
  57. {
  58. final HWPFDocumentCore wordDocument = AbstractWordUtils.loadDoc( root );
  59. return getText( wordDocument );
  60. }
  61. public static String getText( File docFile ) throws Exception
  62. {
  63. final HWPFDocumentCore wordDocument = AbstractWordUtils
  64. .loadDoc( docFile );
  65. return getText( wordDocument );
  66. }
  67. public static String getText( final HWPFDocumentCore wordDocument )
  68. throws Exception
  69. {
  70. WordToTextConverter wordToTextConverter = new WordToTextConverter(
  71. XMLHelper.getDocumentBuilderFactory().newDocumentBuilder().newDocument() );
  72. wordToTextConverter.processDocument( wordDocument );
  73. return wordToTextConverter.getText();
  74. }
  75. /**
  76. * Java main() interface to interact with {@link WordToTextConverter}
  77. *
  78. * <p>
  79. * Usage: WordToTextConverter infile outfile
  80. * </p>
  81. * Where infile is an input .doc file ( Word 95-2007) which will be rendered
  82. * as plain text into outfile
  83. */
  84. public static void main( String[] args ) throws Exception {
  85. if ( args.length < 2 )
  86. {
  87. System.err.println( "Usage: WordToTextConverter <inputFile.doc> <saveTo.txt>" );
  88. return;
  89. }
  90. System.out.println( "Converting " + args[0] );
  91. System.out.println( "Saving output to " + args[1] );
  92. Document doc = WordToTextConverter.process( new File( args[0] ) );
  93. DOMSource domSource = new DOMSource( doc );
  94. StreamResult streamResult = new StreamResult( new File( args[1] ) );
  95. TransformerFactory tf = TransformerFactory.newInstance();
  96. Transformer serializer = tf.newTransformer();
  97. // TODO set encoding from a command argument
  98. serializer.setOutputProperty( OutputKeys.ENCODING, "UTF-8" );
  99. serializer.setOutputProperty( OutputKeys.INDENT, "no" );
  100. serializer.setOutputProperty( OutputKeys.METHOD, "text" );
  101. serializer.transform( domSource, streamResult );
  102. }
  103. private static Document process( File docFile ) throws IOException, ParserConfigurationException {
  104. try (final HWPFDocumentCore wordDocument = AbstractWordUtils.loadDoc( docFile )) {
  105. WordToTextConverter wordToTextConverter = new WordToTextConverter(
  106. XMLHelper.getDocumentBuilderFactory().newDocumentBuilder().newDocument());
  107. wordToTextConverter.processDocument(wordDocument);
  108. return wordToTextConverter.getDocument();
  109. }
  110. }
  111. private AtomicInteger noteCounters = new AtomicInteger( 1 );
  112. private Element notes;
  113. private boolean outputSummaryInformation;
  114. private final TextDocumentFacade textDocumentFacade;
  /**
   * Creates new instance of {@link WordToTextConverter}. Can be used for
   * output several {@link HWPFDocument}s into single text document.
   *
   * <p>The backing DOM document is created through {@link XMLHelper}, the same factory the
   * static helpers of this class use.
   *
   * @throws ParserConfigurationException
   *             if an internal {@link DocumentBuilder} cannot be created
   */
  public WordToTextConverter() throws ParserConfigurationException {
    this.textDocumentFacade = new TextDocumentFacade(
        XMLHelper.getDocumentBuilderFactory().newDocumentBuilder().newDocument());
  }
  /**
   * Creates a converter that writes into the supplied DOM document. One converter can be
   * used to output several {@link HWPFDocument}s into a single text document.
   *
   * @param document
   *            XML DOM Document used as storage for text pieces
   */
  @SuppressWarnings("WeakerAccess")
  public WordToTextConverter(Document document) {
    TextDocumentFacade facade = new TextDocumentFacade(document);
    this.textDocumentFacade = facade;
  }
  /**
   * Creates a converter that writes through an existing facade.
   *
   * @param textDocumentFacade facade that receives the generated text pieces
   */
  @SuppressWarnings("unused")
  public WordToTextConverter(TextDocumentFacade textDocumentFacade) {
    this.textDocumentFacade = textDocumentFacade;
  }
  145. @Override
  146. protected void afterProcess()
  147. {
  148. if ( notes != null )
  149. textDocumentFacade.getBody().appendChild( notes );
  150. }
  151. public Document getDocument()
  152. {
  153. return textDocumentFacade.getDocument();
  154. }
  155. public String getText() throws Exception
  156. {
  157. StringWriter stringWriter = new StringWriter();
  158. DOMSource domSource = new DOMSource( getDocument() );
  159. StreamResult streamResult = new StreamResult( stringWriter );
  160. TransformerFactory tf = TransformerFactory.newInstance();
  161. Transformer serializer = tf.newTransformer();
  162. // TODO set encoding from a command argument
  163. serializer.setOutputProperty( OutputKeys.ENCODING, "UTF-8" );
  164. serializer.setOutputProperty( OutputKeys.INDENT, "no" );
  165. serializer.setOutputProperty( OutputKeys.METHOD, "text" );
  166. serializer.transform( domSource, streamResult );
  167. return stringWriter.toString();
  168. }
  169. @SuppressWarnings("WeakerAccess")
  170. public boolean isOutputSummaryInformation()
  171. {
  172. return outputSummaryInformation;
  173. }
  174. @Override
  175. protected void outputCharacters( Element block, CharacterRun characterRun,
  176. String text )
  177. {
  178. block.appendChild( textDocumentFacade.createText( text ) );
  179. }
  180. @Override
  181. protected void processBookmarks( HWPFDocumentCore wordDocument,
  182. Element currentBlock, Range range, int currentTableLevel,
  183. List<Bookmark> rangeBookmarks )
  184. {
  185. processCharacters( wordDocument, currentTableLevel, range, currentBlock );
  186. }
  187. @Override
  188. protected void processDocumentInformation(
  189. SummaryInformation summaryInformation )
  190. {
  191. if ( isOutputSummaryInformation() )
  192. {
  193. if ( AbstractWordUtils.isNotEmpty( summaryInformation.getTitle() ) )
  194. textDocumentFacade.setTitle( summaryInformation.getTitle() );
  195. if ( AbstractWordUtils.isNotEmpty( summaryInformation.getAuthor() ) )
  196. textDocumentFacade.addAuthor( summaryInformation.getAuthor() );
  197. if ( AbstractWordUtils
  198. .isNotEmpty( summaryInformation.getComments() ) )
  199. textDocumentFacade.addDescription( summaryInformation
  200. .getComments() );
  201. if ( AbstractWordUtils
  202. .isNotEmpty( summaryInformation.getKeywords() ) )
  203. textDocumentFacade.addKeywords( summaryInformation
  204. .getKeywords() );
  205. }
  206. }
  207. @Override
  208. public void processDocumentPart( HWPFDocumentCore wordDocument, Range range )
  209. {
  210. super.processDocumentPart( wordDocument, range );
  211. afterProcess();
  212. }
  213. @Override
  214. protected void processDrawnObject( HWPFDocument doc,
  215. CharacterRun characterRun, OfficeDrawing officeDrawing,
  216. String path, Element block )
  217. {
  218. // ignore
  219. }
  220. @Override
  221. protected void processEndnoteAutonumbered( HWPFDocument wordDocument,
  222. int noteIndex, Element block, Range endnoteTextRange )
  223. {
  224. processNote( wordDocument, block, endnoteTextRange );
  225. }
  226. @Override
  227. protected void processFootnoteAutonumbered( HWPFDocument wordDocument,
  228. int noteIndex, Element block, Range footnoteTextRange )
  229. {
  230. processNote( wordDocument, block, footnoteTextRange );
  231. }
  232. @Override
  233. protected void processHyperlink( HWPFDocumentCore wordDocument,
  234. Element currentBlock, Range textRange, int currentTableLevel,
  235. String hyperlink )
  236. {
  237. processCharacters( wordDocument, currentTableLevel, textRange,
  238. currentBlock );
  239. currentBlock.appendChild( textDocumentFacade.createText( " ("
  240. + UNICODECHAR_ZERO_WIDTH_SPACE
  241. + hyperlink.replaceAll( "/", UNICODECHAR_ZERO_WIDTH_SPACE
  242. + "\\/" + UNICODECHAR_ZERO_WIDTH_SPACE )
  243. + UNICODECHAR_ZERO_WIDTH_SPACE + ")" ) );
  244. }
  245. @Override
  246. protected void processImage( Element currentBlock, boolean inlined,
  247. Picture picture )
  248. {
  249. // ignore
  250. }
  251. @Override
  252. protected void processImage( Element currentBlock, boolean inlined,
  253. Picture picture, String url )
  254. {
  255. // ignore
  256. }
  257. @Override
  258. protected void processImageWithoutPicturesManager( Element currentBlock,
  259. boolean inlined, Picture picture )
  260. {
  261. // ignore
  262. }
  263. @Override
  264. protected void processLineBreak( Element block, CharacterRun characterRun )
  265. {
  266. block.appendChild( textDocumentFacade.createText( "\n" ) );
  267. }
  268. private void processNote( HWPFDocument wordDocument, Element block, Range noteTextRange ) {
  269. final int noteIndex = noteCounters.getAndIncrement();
  270. block.appendChild( textDocumentFacade
  271. .createText( UNICODECHAR_ZERO_WIDTH_SPACE + "[" + noteIndex
  272. + "]" + UNICODECHAR_ZERO_WIDTH_SPACE ) );
  273. if ( notes == null )
  274. notes = textDocumentFacade.createBlock();
  275. Element note = textDocumentFacade.createBlock();
  276. notes.appendChild( note );
  277. note.appendChild( textDocumentFacade.createText( "^" + noteIndex
  278. + "\t " ) );
  279. processCharacters( wordDocument, Integer.MIN_VALUE, noteTextRange, note );
  280. note.appendChild( textDocumentFacade.createText( "\n" ) );
  281. }
  282. @Override
  283. protected boolean processOle2( HWPFDocument wordDocument, Element block,
  284. Entry entry ) throws Exception
  285. {
  286. if ( !( entry instanceof DirectoryNode ) )
  287. return false;
  288. DirectoryNode directoryNode = (DirectoryNode) entry;
  289. /*
  290. * even if there is no ExtractorFactory in classpath, still support
  291. * included Word's objects
  292. */
  293. if ( directoryNode.hasEntry( "WordDocument" ) )
  294. {
  295. String text = WordToTextConverter.getText( (DirectoryNode) entry );
  296. block.appendChild( textDocumentFacade
  297. .createText( UNICODECHAR_ZERO_WIDTH_SPACE + text
  298. + UNICODECHAR_ZERO_WIDTH_SPACE ) );
  299. return true;
  300. }
  301. Object extractor;
  302. try
  303. {
  304. Class<?> cls = Class
  305. .forName( "org.apache.poi.extractor.ExtractorFactory" );
  306. Method createExtractor = cls.getMethod( "createExtractor",
  307. DirectoryNode.class );
  308. extractor = createExtractor.invoke( null, directoryNode );
  309. }
  310. catch ( Exception exc )
  311. {
  312. // no extractor in classpath
  313. logger.log( POILogger.WARN, "There is an OLE object entry '",
  314. entry.getName(),
  315. "', but there is no text extractor for this object type ",
  316. "or text extractor factory is not available: ", "" + exc );
  317. return false;
  318. }
  319. try
  320. {
  321. Method getText = extractor.getClass().getMethod( "getText" );
  322. String text = (String) getText.invoke( extractor );
  323. block.appendChild( textDocumentFacade
  324. .createText( UNICODECHAR_ZERO_WIDTH_SPACE + text
  325. + UNICODECHAR_ZERO_WIDTH_SPACE ) );
  326. return true;
  327. }
  328. catch ( Exception exc )
  329. {
  330. logger.log( POILogger.ERROR,
  331. "Unable to extract text from OLE entry '", entry.getName(),
  332. "': ", exc, exc );
  333. return false;
  334. }
  335. }
  336. @Override
  337. protected void processPageBreak( HWPFDocumentCore wordDocument, Element flow )
  338. {
  339. Element block = textDocumentFacade.createBlock();
  340. block.appendChild( textDocumentFacade.createText( "\n" ) );
  341. flow.appendChild( block );
  342. }
  343. @Override
  344. protected void processPageref( HWPFDocumentCore wordDocument,
  345. Element currentBlock, Range textRange, int currentTableLevel,
  346. String pageref )
  347. {
  348. processCharacters( wordDocument, currentTableLevel, textRange,
  349. currentBlock );
  350. }
  351. @Override
  352. protected void processParagraph( HWPFDocumentCore wordDocument,
  353. Element parentElement, int currentTableLevel, Paragraph paragraph,
  354. String bulletText )
  355. {
  356. Element pElement = textDocumentFacade.createParagraph();
  357. pElement.appendChild( textDocumentFacade.createText( bulletText ) );
  358. processCharacters( wordDocument, currentTableLevel, paragraph, pElement );
  359. pElement.appendChild( textDocumentFacade.createText( "\n" ) );
  360. parentElement.appendChild( pElement );
  361. }
  362. @Override
  363. protected void processSection( HWPFDocumentCore wordDocument,
  364. Section section, int s )
  365. {
  366. Element sectionElement = textDocumentFacade.createBlock();
  367. processParagraphes( wordDocument, sectionElement, section,
  368. Integer.MIN_VALUE );
  369. sectionElement.appendChild( textDocumentFacade.createText( "\n" ) );
  370. textDocumentFacade.body.appendChild( sectionElement );
  371. }
  372. protected void processTable( HWPFDocumentCore wordDocument, Element flow,
  373. Table table )
  374. {
  375. final int tableRows = table.numRows();
  376. for ( int r = 0; r < tableRows; r++ )
  377. {
  378. TableRow tableRow = table.getRow( r );
  379. Element tableRowElement = textDocumentFacade.createTableRow();
  380. final int rowCells = tableRow.numCells();
  381. for ( int c = 0; c < rowCells; c++ )
  382. {
  383. TableCell tableCell = tableRow.getCell( c );
  384. Element tableCellElement = textDocumentFacade.createTableCell();
  385. if ( c != 0 )
  386. tableCellElement.appendChild( textDocumentFacade
  387. .createText( "\t" ) );
  388. processCharacters( wordDocument, table.getTableLevel(),
  389. tableCell, tableCellElement );
  390. tableRowElement.appendChild( tableCellElement );
  391. }
  392. tableRowElement.appendChild( textDocumentFacade.createText( "\n" ) );
  393. flow.appendChild( tableRowElement );
  394. }
  395. }
  396. @SuppressWarnings("unused")
  397. public void setOutputSummaryInformation(boolean outputDocumentInformation )
  398. {
  399. this.outputSummaryInformation = outputDocumentInformation;
  400. }
  401. }