You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

WordToFoConverter.java 22KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606
  1. /* ====================================================================
  2. Licensed to the Apache Software Foundation (ASF) under one or more
  3. contributor license agreements. See the NOTICE file distributed with
  4. this work for additional information regarding copyright ownership.
  5. The ASF licenses this file to You under the Apache License, Version 2.0
  6. (the "License"); you may not use this file except in compliance with
  7. the License. You may obtain a copy of the License at
  8. http://www.apache.org/licenses/LICENSE-2.0
  9. Unless required by applicable law or agreed to in writing, software
  10. distributed under the License is distributed on an "AS IS" BASIS,
  11. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. See the License for the specific language governing permissions and
  13. limitations under the License.
  14. ==================================================================== */
  15. package org.apache.poi.hwpf.converter;
  16. import static org.apache.poi.hwpf.converter.AbstractWordUtils.TWIPS_PER_INCH;
  17. import static org.apache.poi.hwpf.converter.AbstractWordUtils.isNotEmpty;
  18. import static org.apache.poi.hwpf.converter.AbstractWordUtils.loadDoc;
  19. import static org.apache.poi.hwpf.converter.WordToFoUtils.*;
  20. import java.io.File;
  21. import java.util.ArrayList;
  22. import java.util.LinkedHashSet;
  23. import java.util.List;
  24. import java.util.Set;
  25. import java.util.concurrent.atomic.AtomicInteger;
  26. import javax.xml.parsers.DocumentBuilder;
  27. import javax.xml.transform.Transformer;
  28. import javax.xml.transform.dom.DOMSource;
  29. import javax.xml.transform.stream.StreamResult;
  30. import org.apache.poi.hpsf.SummaryInformation;
  31. import org.apache.poi.hwpf.HWPFDocument;
  32. import org.apache.poi.hwpf.HWPFDocumentCore;
  33. import org.apache.poi.hwpf.converter.FontReplacer.Triplet;
  34. import org.apache.poi.hwpf.usermodel.Bookmark;
  35. import org.apache.poi.hwpf.usermodel.CharacterRun;
  36. import org.apache.poi.hwpf.usermodel.OfficeDrawing;
  37. import org.apache.poi.hwpf.usermodel.Paragraph;
  38. import org.apache.poi.hwpf.usermodel.Picture;
  39. import org.apache.poi.hwpf.usermodel.Range;
  40. import org.apache.poi.hwpf.usermodel.Section;
  41. import org.apache.poi.hwpf.usermodel.Table;
  42. import org.apache.poi.hwpf.usermodel.TableCell;
  43. import org.apache.poi.hwpf.usermodel.TableRow;
  44. import org.apache.poi.util.Beta;
  45. import org.apache.poi.util.POILogFactory;
  46. import org.apache.poi.util.POILogger;
  47. import org.apache.poi.util.XMLHelper;
  48. import org.w3c.dom.Document;
  49. import org.w3c.dom.Element;
  50. import org.w3c.dom.Node;
  51. import org.w3c.dom.NodeList;
  52. import org.w3c.dom.Text;
  53. /**
  54. * @author Sergey Vladimirov (vlsergey {at} gmail {dot} com)
  55. */
  56. @Beta
  57. public class WordToFoConverter extends AbstractWordConverter
  58. {
  59. private static final POILogger logger = POILogFactory
  60. .getLogger( WordToFoConverter.class );
  61. /**
  62. * Java main() interface to interact with {@link WordToFoConverter}
  63. *
  64. * <p>
  65. * Usage: WordToFoConverter infile outfile
  66. * </p>
  67. * Where infile is an input .doc file ( Word 97-2007) which will be rendered
  68. * as XSL-FO into outfile
  69. */
  70. public static void main( String[] args ) throws Exception {
  71. if ( args.length < 2 )
  72. {
  73. System.err.println( "Usage: WordToFoConverter <inputFile.doc> <saveTo.fo>" );
  74. return;
  75. }
  76. System.out.println( "Converting " + args[0] );
  77. System.out.println( "Saving output to " + args[1] );
  78. Document doc = WordToFoConverter.process( new File( args[0] ) );
  79. DOMSource domSource = new DOMSource( doc );
  80. StreamResult streamResult = new StreamResult( new File( args[1] ) );
  81. // TODO set encoding from a command argument
  82. Transformer serializer = XMLHelper.newTransformer();
  83. serializer.transform( domSource, streamResult );
  84. }
  85. static Document process( File docFile ) throws Exception
  86. {
  87. final DocumentBuilder docBuild = XMLHelper.newDocumentBuilder();
  88. try (final HWPFDocumentCore hwpfDocument = loadDoc( docFile )) {
  89. WordToFoConverter wordToFoConverter = new WordToFoConverter(docBuild.newDocument());
  90. wordToFoConverter.processDocument(hwpfDocument);
  91. return wordToFoConverter.getDocument();
  92. }
  93. }
  94. private List<Element> endnotes = new ArrayList<>(0);
  95. protected final FoDocumentFacade foDocumentFacade;
  96. private AtomicInteger internalLinkCounter = new AtomicInteger( 0 );
  97. private boolean outputCharactersLanguage;
  98. private Set<String> usedIds = new LinkedHashSet<>();
  99. /**
  100. * Creates new instance of {@link WordToFoConverter}. Can be used for output
  101. * several {@link HWPFDocument}s into single FO document.
  102. *
  103. * @param document
  104. * XML DOM Document used as XSL FO document. Shall support
  105. * namespaces
  106. */
  107. public WordToFoConverter( Document document )
  108. {
  109. this.foDocumentFacade = new FoDocumentFacade( document );
  110. }
  111. public WordToFoConverter( FoDocumentFacade foDocumentFacade )
  112. {
  113. this.foDocumentFacade = foDocumentFacade;
  114. }
  115. protected Element createNoteInline( String noteIndexText )
  116. {
  117. Element inline = foDocumentFacade.createInline();
  118. inline.setTextContent( noteIndexText );
  119. inline.setAttribute( "baseline-shift", "super" );
  120. inline.setAttribute( "font-size", "smaller" );
  121. return inline;
  122. }
  123. protected String createPageMaster( Section section, String type, int sectionIndex ) {
  124. float height = section.getPageHeight() / TWIPS_PER_INCH;
  125. float width = section.getPageWidth() / TWIPS_PER_INCH;
  126. float leftMargin = section.getMarginLeft() / TWIPS_PER_INCH;
  127. float rightMargin = section.getMarginRight() / TWIPS_PER_INCH;
  128. float topMargin = section.getMarginTop() / TWIPS_PER_INCH;
  129. float bottomMargin = section.getMarginBottom() / TWIPS_PER_INCH;
  130. // add these to the header
  131. String pageMasterName = type + "-page" + sectionIndex;
  132. Element pageMaster = foDocumentFacade
  133. .addSimplePageMaster( pageMasterName );
  134. pageMaster.setAttribute( "page-height", height + "in" );
  135. pageMaster.setAttribute( "page-width", width + "in" );
  136. Element regionBody = foDocumentFacade.addRegionBody( pageMaster );
  137. regionBody.setAttribute( "margin", topMargin + "in " + rightMargin
  138. + "in " + bottomMargin + "in " + leftMargin + "in" );
  139. /*
  140. * 6.4.14 fo:region-body
  141. *
  142. * The values of the padding and border-width traits must be "0".
  143. */
  144. // WordToFoUtils.setBorder(regionBody, sep.getBrcTop(), "top");
  145. // WordToFoUtils.setBorder(regionBody, sep.getBrcBottom(), "bottom");
  146. // WordToFoUtils.setBorder(regionBody, sep.getBrcLeft(), "left");
  147. // WordToFoUtils.setBorder(regionBody, sep.getBrcRight(), "right");
  148. if ( section.getNumColumns() > 1 )
  149. {
  150. regionBody.setAttribute( "column-count",
  151. "" + ( section.getNumColumns() ) );
  152. if ( section.isColumnsEvenlySpaced() )
  153. {
  154. float distance = section.getDistanceBetweenColumns() / TWIPS_PER_INCH;
  155. regionBody.setAttribute( "column-gap", distance + "in" );
  156. }
  157. else
  158. {
  159. regionBody.setAttribute( "column-gap", "0.25in" );
  160. }
  161. }
  162. return pageMasterName;
  163. }
  164. public Document getDocument()
  165. {
  166. return foDocumentFacade.getDocument();
  167. }
  168. public boolean isOutputCharactersLanguage()
  169. {
  170. return outputCharactersLanguage;
  171. }
  172. @Override
  173. protected void outputCharacters( Element block, CharacterRun characterRun,
  174. String text )
  175. {
  176. Element inline = foDocumentFacade.createInline();
  177. Triplet triplet = getCharacterRunTriplet( characterRun );
  178. if ( isNotEmpty( triplet.fontName ) )
  179. setFontFamily( inline, triplet.fontName );
  180. setBold( inline, triplet.bold );
  181. setItalic( inline, triplet.italic );
  182. setFontSize( inline, characterRun.getFontSize() / 2 );
  183. setCharactersProperties( characterRun, inline );
  184. if ( isOutputCharactersLanguage() )
  185. setLanguage( characterRun, inline );
  186. block.appendChild( inline );
  187. Text textNode = foDocumentFacade.createText( text );
  188. inline.appendChild( textNode );
  189. }
  190. @Override
  191. protected void processBookmarks( HWPFDocumentCore wordDocument,
  192. Element currentBlock, Range range, int currentTableLevel,
  193. List<Bookmark> rangeBookmarks )
  194. {
  195. Element parent = currentBlock;
  196. for ( Bookmark bookmark : rangeBookmarks )
  197. {
  198. Element bookmarkElement = foDocumentFacade.createInline();
  199. final String idName = "bookmark_" + bookmark.getName();
  200. // make sure ID used once
  201. if ( setId( bookmarkElement, idName ) )
  202. {
  203. /*
  204. * if it just empty fo:inline without "id" attribute doesn't
  205. * making sense to add it to DOM
  206. */
  207. parent.appendChild( bookmarkElement );
  208. parent = bookmarkElement;
  209. }
  210. }
  211. if ( range != null )
  212. processCharacters( wordDocument, currentTableLevel, range, parent );
  213. }
  214. @Override
  215. protected void processDocumentInformation(
  216. SummaryInformation summaryInformation )
  217. {
  218. if ( isNotEmpty( summaryInformation.getTitle() ) )
  219. foDocumentFacade.setTitle( summaryInformation.getTitle() );
  220. if ( isNotEmpty( summaryInformation.getAuthor() ) )
  221. foDocumentFacade.setCreator( summaryInformation.getAuthor() );
  222. if ( isNotEmpty( summaryInformation.getKeywords() ) )
  223. foDocumentFacade.setKeywords( summaryInformation.getKeywords() );
  224. if ( isNotEmpty( summaryInformation.getComments() ) )
  225. foDocumentFacade.setDescription( summaryInformation.getComments() );
  226. }
  227. @Override
  228. protected void processDrawnObject( HWPFDocument doc,
  229. CharacterRun characterRun, OfficeDrawing officeDrawing,
  230. String path, Element block )
  231. {
  232. final Element externalGraphic = foDocumentFacade
  233. .createExternalGraphic( path );
  234. block.appendChild( externalGraphic );
  235. }
  236. @Override
  237. protected void processEndnoteAutonumbered( HWPFDocument wordDocument,
  238. int noteIndex, Element block, Range endnoteTextRange )
  239. {
  240. final String textIndex = String.valueOf( internalLinkCounter
  241. .incrementAndGet() );
  242. final String forwardLinkName = "endnote_" + textIndex;
  243. final String backwardLinkName = "endnote_back_" + textIndex;
  244. Element forwardLink = foDocumentFacade
  245. .createBasicLinkInternal( forwardLinkName );
  246. forwardLink.appendChild( createNoteInline( textIndex ) );
  247. setId( forwardLink, backwardLinkName );
  248. block.appendChild( forwardLink );
  249. Element endnote = foDocumentFacade.createBlock();
  250. Element backwardLink = foDocumentFacade
  251. .createBasicLinkInternal( backwardLinkName );
  252. backwardLink.appendChild( createNoteInline( textIndex + " " ) );
  253. setId( backwardLink, forwardLinkName );
  254. endnote.appendChild( backwardLink );
  255. processCharacters( wordDocument, Integer.MIN_VALUE, endnoteTextRange,
  256. endnote );
  257. compactInlines( endnote );
  258. this.endnotes.add( endnote );
  259. }
  260. @Override
  261. protected void processFootnoteAutonumbered( HWPFDocument wordDocument,
  262. int noteIndex, Element block, Range footnoteTextRange )
  263. {
  264. final String textIndex = String.valueOf( internalLinkCounter
  265. .incrementAndGet() );
  266. final String forwardLinkName = "footnote_" + textIndex;
  267. final String backwardLinkName = "footnote_back_" + textIndex;
  268. Element footNote = foDocumentFacade.createFootnote();
  269. block.appendChild( footNote );
  270. Element inline = foDocumentFacade.createInline();
  271. Element forwardLink = foDocumentFacade
  272. .createBasicLinkInternal( forwardLinkName );
  273. forwardLink.appendChild( createNoteInline( textIndex ) );
  274. setId( forwardLink, backwardLinkName );
  275. inline.appendChild( forwardLink );
  276. footNote.appendChild( inline );
  277. Element footnoteBody = foDocumentFacade.createFootnoteBody();
  278. Element footnoteBlock = foDocumentFacade.createBlock();
  279. Element backwardLink = foDocumentFacade
  280. .createBasicLinkInternal( backwardLinkName );
  281. backwardLink.appendChild( createNoteInline( textIndex + " " ) );
  282. setId( backwardLink, forwardLinkName );
  283. footnoteBlock.appendChild( backwardLink );
  284. footnoteBody.appendChild( footnoteBlock );
  285. footNote.appendChild( footnoteBody );
  286. processCharacters( wordDocument, Integer.MIN_VALUE, footnoteTextRange,
  287. footnoteBlock );
  288. compactInlines( footnoteBlock );
  289. }
  290. protected void processHyperlink( HWPFDocumentCore wordDocument,
  291. Element currentBlock, Range textRange, int currentTableLevel,
  292. String hyperlink )
  293. {
  294. Element basicLink = foDocumentFacade
  295. .createBasicLinkExternal( hyperlink );
  296. currentBlock.appendChild( basicLink );
  297. if ( textRange != null )
  298. processCharacters( wordDocument, currentTableLevel, textRange,
  299. basicLink );
  300. }
  301. protected void processImage( Element currentBlock, boolean inlined,
  302. Picture picture, String url )
  303. {
  304. final Element externalGraphic = foDocumentFacade
  305. .createExternalGraphic( url );
  306. setPictureProperties( picture, externalGraphic );
  307. currentBlock.appendChild( externalGraphic );
  308. }
  309. @Override
  310. protected void processImageWithoutPicturesManager( Element currentBlock,
  311. boolean inlined, Picture picture )
  312. {
  313. // no default implementation -- skip
  314. currentBlock.appendChild( foDocumentFacade.getDocument()
  315. .createComment( "Image link to '"
  316. + picture.suggestFullFileName() + "' can be here" ) );
  317. }
  318. @Override
  319. protected void processLineBreak( Element block, CharacterRun characterRun )
  320. {
  321. block.appendChild( foDocumentFacade.createBlock() );
  322. }
  323. @Override
  324. protected void processPageBreak( HWPFDocumentCore wordDocument, Element flow )
  325. {
  326. Element block = null;
  327. NodeList childNodes = flow.getChildNodes();
  328. if ( childNodes.getLength() > 0 )
  329. {
  330. Node lastChild = childNodes.item( childNodes.getLength() - 1 );
  331. if ( lastChild instanceof Element )
  332. {
  333. Element lastElement = (Element) lastChild;
  334. if ( !lastElement.hasAttribute( "break-after" ) )
  335. {
  336. block = lastElement;
  337. }
  338. }
  339. }
  340. if ( block == null )
  341. {
  342. block = foDocumentFacade.createBlock();
  343. flow.appendChild( block );
  344. }
  345. block.setAttribute( "break-after", "page" );
  346. }
  347. protected void processPageref( HWPFDocumentCore hwpfDocument,
  348. Element currentBlock, Range textRange, int currentTableLevel,
  349. String pageref )
  350. {
  351. Element basicLink = foDocumentFacade
  352. .createBasicLinkInternal( "bookmark_" + pageref );
  353. currentBlock.appendChild( basicLink );
  354. if ( textRange != null )
  355. processCharacters( hwpfDocument, currentTableLevel, textRange,
  356. basicLink );
  357. }
  358. protected void processParagraph( HWPFDocumentCore hwpfDocument,
  359. Element parentFopElement, int currentTableLevel,
  360. Paragraph paragraph, String bulletText )
  361. {
  362. final Element block = foDocumentFacade.createBlock();
  363. parentFopElement.appendChild( block );
  364. setParagraphProperties( paragraph, block );
  365. final int charRuns = paragraph.numCharacterRuns();
  366. if ( charRuns == 0 )
  367. {
  368. return;
  369. }
  370. boolean haveAnyText = false;
  371. if ( isNotEmpty( bulletText ) )
  372. {
  373. Element inline = foDocumentFacade.createInline();
  374. block.appendChild( inline );
  375. Text textNode = foDocumentFacade.createText( bulletText );
  376. inline.appendChild( textNode );
  377. haveAnyText |= bulletText.trim().length() != 0;
  378. }
  379. haveAnyText = processCharacters( hwpfDocument, currentTableLevel,
  380. paragraph, block );
  381. if ( !haveAnyText )
  382. {
  383. Element leader = foDocumentFacade.createLeader();
  384. block.appendChild( leader );
  385. }
  386. compactInlines( block );
  387. }
  388. protected void processSection( HWPFDocumentCore wordDocument,
  389. Section section, int sectionCounter )
  390. {
  391. String regularPage = createPageMaster( section, "page", sectionCounter );
  392. Element pageSequence = foDocumentFacade.addPageSequence( regularPage );
  393. Element flow = foDocumentFacade.addFlowToPageSequence( pageSequence,
  394. "xsl-region-body" );
  395. processParagraphes( wordDocument, flow, section, Integer.MIN_VALUE );
  396. if ( endnotes != null && !endnotes.isEmpty() )
  397. {
  398. for ( Element endnote : endnotes )
  399. flow.appendChild( endnote );
  400. endnotes.clear();
  401. }
  402. }
  403. protected void processTable( HWPFDocumentCore wordDocument, Element flow,
  404. Table table )
  405. {
  406. Element tableHeader = foDocumentFacade.createTableHeader();
  407. Element tableBody = foDocumentFacade.createTableBody();
  408. final int[] tableCellEdges = buildTableCellEdgesArray( table );
  409. final int tableRows = table.numRows();
  410. int maxColumns = Integer.MIN_VALUE;
  411. for ( int r = 0; r < tableRows; r++ )
  412. {
  413. maxColumns = Math.max( maxColumns, table.getRow( r ).numCells() );
  414. }
  415. for ( int r = 0; r < tableRows; r++ )
  416. {
  417. TableRow tableRow = table.getRow( r );
  418. Element tableRowElement = foDocumentFacade.createTableRow();
  419. setTableRowProperties( tableRow, tableRowElement );
  420. // index of current element in tableCellEdges[]
  421. int currentEdgeIndex = 0;
  422. final int rowCells = tableRow.numCells();
  423. for ( int c = 0; c < rowCells; c++ )
  424. {
  425. TableCell tableCell = tableRow.getCell( c );
  426. if ( tableCell.isVerticallyMerged()
  427. && !tableCell.isFirstVerticallyMerged() )
  428. {
  429. currentEdgeIndex += getNumberColumnsSpanned(
  430. tableCellEdges, currentEdgeIndex, tableCell );
  431. continue;
  432. }
  433. Element tableCellElement = foDocumentFacade.createTableCell();
  434. setTableCellProperties( tableRow, tableCell,
  435. tableCellElement, r == 0, r == tableRows - 1, c == 0,
  436. c == rowCells - 1 );
  437. int colSpan = getNumberColumnsSpanned( tableCellEdges,
  438. currentEdgeIndex, tableCell );
  439. currentEdgeIndex += colSpan;
  440. if ( colSpan == 0 )
  441. continue;
  442. if ( colSpan != 1 )
  443. tableCellElement.setAttribute( "number-columns-spanned",
  444. String.valueOf( colSpan ) );
  445. final int rowSpan = getNumberRowsSpanned( table,
  446. tableCellEdges, r, c, tableCell );
  447. if ( rowSpan > 1 )
  448. tableCellElement.setAttribute( "number-rows-spanned",
  449. String.valueOf( rowSpan ) );
  450. processParagraphes( wordDocument, tableCellElement, tableCell,
  451. table.getTableLevel() );
  452. if ( !tableCellElement.hasChildNodes() )
  453. {
  454. tableCellElement.appendChild( foDocumentFacade
  455. .createBlock() );
  456. }
  457. tableRowElement.appendChild( tableCellElement );
  458. }
  459. if ( tableRowElement.hasChildNodes() )
  460. {
  461. if ( tableRow.isTableHeader() )
  462. {
  463. tableHeader.appendChild( tableRowElement );
  464. }
  465. else
  466. {
  467. tableBody.appendChild( tableRowElement );
  468. }
  469. }
  470. }
  471. final Element tableElement = foDocumentFacade.createTable();
  472. tableElement.setAttribute( "table-layout", "fixed" );
  473. if ( tableHeader.hasChildNodes() )
  474. {
  475. tableElement.appendChild( tableHeader );
  476. }
  477. if ( tableBody.hasChildNodes() )
  478. {
  479. tableElement.appendChild( tableBody );
  480. flow.appendChild( tableElement );
  481. }
  482. else
  483. {
  484. logger.log(
  485. POILogger.WARN,
  486. "Table without body starting on offset "
  487. + table.getStartOffset() + " -- "
  488. + table.getEndOffset() );
  489. }
  490. }
  491. protected boolean setId( Element element, final String id )
  492. {
  493. // making sure ID used once
  494. if ( usedIds.contains( id ) )
  495. {
  496. logger.log( POILogger.WARN,
  497. "Tried to create element with same ID '", id, "'. Skipped" );
  498. return false;
  499. }
  500. element.setAttribute( "id", id );
  501. usedIds.add( id );
  502. return true;
  503. }
  504. public void setOutputCharactersLanguage( boolean outputCharactersLanguage )
  505. {
  506. this.outputCharactersLanguage = outputCharactersLanguage;
  507. }
  508. }