From: Sergey Vladimirov Date: Tue, 9 Aug 2011 09:25:59 +0000 (+0000) Subject: add Word-to-Text converter X-Git-Tag: REL_3_8_BETA4~25 X-Git-Url: https://source.dussan.org/?a=commitdiff_plain;h=888f51c566faabe689cc7f3e110fc6fa67514a59;p=poi.git add Word-to-Text converter git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1155281 13f79535-47bb-0310-9956-ffa450edef68 --- diff --git a/src/scratchpad/src/org/apache/poi/hwpf/converter/AbstractWordConverter.java b/src/scratchpad/src/org/apache/poi/hwpf/converter/AbstractWordConverter.java index 3612011dd5..e71aed0ac6 100644 --- a/src/scratchpad/src/org/apache/poi/hwpf/converter/AbstractWordConverter.java +++ b/src/scratchpad/src/org/apache/poi/hwpf/converter/AbstractWordConverter.java @@ -71,9 +71,9 @@ public abstract class AbstractWordConverter private static final byte SPECCHAR_DRAWN_OBJECT = 8; - private static final char UNICODECHAR_NONBREAKING_HYPHEN = '\u2011'; + protected static final char UNICODECHAR_NONBREAKING_HYPHEN = '\u2011'; - private static final char UNICODECHAR_ZERO_WIDTH_SPACE = '\u200b'; + protected static final char UNICODECHAR_ZERO_WIDTH_SPACE = '\u200b'; private static void addToStructures( List structures, Structure structure ) @@ -205,7 +205,7 @@ public abstract class AbstractWordConverter Element currentBlock, Range range, int currentTableLevel, List rangeBookmarks ); - protected boolean processCharacters( final HWPFDocumentCore document, + protected boolean processCharacters( final HWPFDocumentCore wordDocument, final int currentTableLevel, final Range range, final Element block ) { if ( range == null ) @@ -220,9 +220,9 @@ public abstract class AbstractWordConverter * reconstruct the structure of range -- sergey */ List structures = new LinkedList(); - if ( document instanceof HWPFDocument ) + if ( wordDocument instanceof HWPFDocument ) { - final HWPFDocument doc = (HWPFDocument) document; + final HWPFDocument doc = (HWPFDocument) wordDocument; Map> rangeBookmarks = doc.getBookmarks() 
.getBookmarksStartedBetween( range.getStartOffset(), @@ -247,7 +247,7 @@ public abstract class AbstractWordConverter CharacterRun characterRun = range.getCharacterRun( c ); if ( characterRun == null ) throw new AssertionError(); - Field aliveField = ( (HWPFDocument) document ).getFields() + Field aliveField = ( (HWPFDocument) wordDocument ).getFields() .getFieldByStartOffset( FieldsDocumentPart.MAIN, characterRun.getStartOffset() ); if ( aliveField != null ) @@ -273,14 +273,15 @@ public abstract class AbstractWordConverter return "BetweenStructuresSubrange " + super.toString(); } }; - processCharacters( document, currentTableLevel, subrange, block ); + processCharacters( wordDocument, currentTableLevel, subrange, + block ); } if ( structure.structure instanceof Bookmark ) { // other bookmarks with same bundaries List bookmarks = new LinkedList(); - for ( Bookmark bookmark : ( (HWPFDocument) document ) + for ( Bookmark bookmark : ( (HWPFDocument) wordDocument ) .getBookmarks() .getBookmarksStartedBetween( structure.start, structure.start + 1 ).values().iterator() @@ -306,7 +307,7 @@ public abstract class AbstractWordConverter } }; - processBookmarks( document, block, subrange, + processBookmarks( wordDocument, block, subrange, currentTableLevel, bookmarks ); } finally @@ -317,7 +318,7 @@ public abstract class AbstractWordConverter else if ( structure.structure instanceof Field ) { Field field = (Field) structure.structure; - processField( (HWPFDocument) document, range, + processField( (HWPFDocument) wordDocument, range, currentTableLevel, field, block ); } else @@ -349,7 +350,8 @@ public abstract class AbstractWordConverter return "AfterStructureSubrange " + super.toString(); } }; - processCharacters( document, currentTableLevel, subrange, block ); + processCharacters( wordDocument, currentTableLevel, subrange, + block ); } return true; } @@ -361,11 +363,11 @@ public abstract class AbstractWordConverter if ( characterRun == null ) throw new AssertionError(); - if ( 
document instanceof HWPFDocument - && ( (HWPFDocument) document ).getPicturesTable() + if ( wordDocument instanceof HWPFDocument + && ( (HWPFDocument) wordDocument ).getPicturesTable() .hasPicture( characterRun ) ) { - HWPFDocument newFormat = (HWPFDocument) document; + HWPFDocument newFormat = (HWPFDocument) wordDocument; Picture picture = newFormat.getPicturesTable().extractPicture( characterRun, true ); @@ -381,16 +383,16 @@ public abstract class AbstractWordConverter if ( characterRun.isSpecialCharacter() ) { if ( text.charAt( 0 ) == SPECCHAR_AUTONUMBERED_FOOTNOTE_REFERENCE - && ( document instanceof HWPFDocument ) ) + && ( wordDocument instanceof HWPFDocument ) ) { - HWPFDocument doc = (HWPFDocument) document; + HWPFDocument doc = (HWPFDocument) wordDocument; processNoteAnchor( doc, characterRun, block ); continue; } if ( text.charAt( 0 ) == SPECCHAR_DRAWN_OBJECT - && ( document instanceof HWPFDocument ) ) + && ( wordDocument instanceof HWPFDocument ) ) { - HWPFDocument doc = (HWPFDocument) document; + HWPFDocument doc = (HWPFDocument) wordDocument; processDrawnObject( doc, characterRun, block ); continue; } @@ -398,14 +400,15 @@ public abstract class AbstractWordConverter if ( text.getBytes()[0] == FIELD_BEGIN_MARK ) { - if ( document instanceof HWPFDocument ) + if ( wordDocument instanceof HWPFDocument ) { - Field aliveField = ( (HWPFDocument) document ).getFields() - .getFieldByStartOffset( FieldsDocumentPart.MAIN, + Field aliveField = ( (HWPFDocument) wordDocument ) + .getFields().getFieldByStartOffset( + FieldsDocumentPart.MAIN, characterRun.getStartOffset() ); if ( aliveField != null ) { - processField( ( (HWPFDocument) document ), range, + processField( ( (HWPFDocument) wordDocument ), range, currentTableLevel, aliveField, block ); int continueAfter = aliveField.getFieldEndOffset(); @@ -420,8 +423,8 @@ public abstract class AbstractWordConverter } } - int skipTo = tryDeadField( document, range, currentTableLevel, - c, block ); + int skipTo = 
tryDeadField( wordDocument, range, + currentTableLevel, c, block ); if ( skipTo != c ) { @@ -610,7 +613,7 @@ public abstract class AbstractWordConverter CharacterRun characterRun, OfficeDrawing officeDrawing, String path, Element block ); - protected abstract void processEndnoteAutonumbered( HWPFDocument doc, + protected abstract void processEndnoteAutonumbered( HWPFDocument wordDocument, int noteIndex, Element block, Range endnoteTextRange ); protected void processField( HWPFDocument hwpfDocument, Range parentRange, @@ -666,7 +669,7 @@ public abstract class AbstractWordConverter field.secondSubrange( parentRange ), currentBlock ); } - protected abstract void processFootnoteAutonumbered( HWPFDocument doc, + protected abstract void processFootnoteAutonumbered( HWPFDocument wordDocument, int noteIndex, Element block, Range footnoteTextRange ); protected abstract void processHyperlink( HWPFDocumentCore wordDocument, @@ -734,8 +737,8 @@ public abstract class AbstractWordConverter String pageref ); protected abstract void processParagraph( HWPFDocumentCore wordDocument, - Element parentFopElement, int currentTableLevel, - Paragraph paragraph, String bulletText ); + Element parentElement, int currentTableLevel, Paragraph paragraph, + String bulletText ); protected void processParagraphes( HWPFDocumentCore wordDocument, Element flow, Range range, int currentTableLevel ) diff --git a/src/scratchpad/src/org/apache/poi/hwpf/converter/TextDocumentFacade.java b/src/scratchpad/src/org/apache/poi/hwpf/converter/TextDocumentFacade.java new file mode 100644 index 0000000000..4eb352cf56 --- /dev/null +++ b/src/scratchpad/src/org/apache/poi/hwpf/converter/TextDocumentFacade.java @@ -0,0 +1,179 @@ +/* ==================================================================== + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. 
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+==================================================================== */
+package org.apache.poi.hwpf.converter;
+
+import org.apache.poi.util.Beta;
+import org.w3c.dom.Document;
+import org.w3c.dom.Element;
+import org.w3c.dom.Text;
+
+/**
+ * Facade over a DOM {@link Document} that builds an
+ * <code>html / head / body</code> element skeleton and offers factory
+ * methods for the elements and text nodes a converter places into it.
+ * <p>
+ * Factory methods only create nodes; attaching them to the tree is the
+ * caller's job.
+ */
+@Beta
+public class TextDocumentFacade
+{
+    protected final Element body;
+    protected final Document document;
+    protected final Element head;
+    protected final Element root;
+
+    /** Title element inside {@link #head}, or <code>null</code> if removed. */
+    protected Element title;
+
+    /** Text node holding the title, child of {@link #title}. */
+    protected Text titleText;
+
+    /**
+     * Creates the <code>html</code> root with <code>head</code> and
+     * <code>body</code> children and an (initially empty) title.
+     *
+     * @param document
+     *            DOM document to populate; a root element is appended to it,
+     *            so it is expected not to have a document element yet
+     */
+    public TextDocumentFacade( Document document )
+    {
+        this.document = document;
+
+        root = document.createElement( "html" );
+        document.appendChild( root );
+
+        body = document.createElement( "body" );
+        head = document.createElement( "head" );
+
+        root.appendChild( head );
+        root.appendChild( body );
+
+        title = document.createElement( "title" );
+        titleText = document.createTextNode( "" );
+        // the text node has to be part of the title element, otherwise
+        // setTitle() would update a node that is not in the document
+        title.appendChild( titleText );
+        head.appendChild( title );
+    }
+
+    /** Adds an "Author" meta entry, see {@link #addMeta(String, String)}. */
+    public void addAuthor( String value )
+    {
+        addMeta( "Author", value );
+    }
+
+    /** Adds a "Description" meta entry, see {@link #addMeta(String, String)}. */
+    public void addDescription( String value )
+    {
+        addMeta( "Description", value );
+    }
+
+    /** Adds a "Keywords" meta entry, see {@link #addMeta(String, String)}. */
+    public void addKeywords( String value )
+    {
+        addMeta( "Keywords", value );
+    }
+
+    /**
+     * Appends a <code>meta</code> element to the head. Its text content is
+     * <code>name + ": "</code> followed by <code>value + "\n"</code>, each
+     * wrapped in its own child element.
+     *
+     * @param name
+     *            label of the entry
+     * @param value
+     *            value of the entry
+     */
+    public void addMeta( final String name, String value )
+    {
+        Element meta = document.createElement( "meta" );
+
+        Element metaName = document.createElement( "name" );
+        metaName.appendChild( document.createTextNode( name + ": " ) );
+        meta.appendChild( metaName );
+
+        Element metaValue = document.createElement( "value" );
+        metaValue.appendChild( document.createTextNode( value + "\n" ) );
+        meta.appendChild( metaValue );
+
+        head.appendChild( meta );
+    }
+
+    /** @return new detached <code>div</code> element */
+    public Element createBlock()
+    {
+        return document.createElement( "div" );
+    }
+
+    /** @return new detached <code>h1</code> element holding a single space */
+    public Element createHeader1()
+    {
+        Element result = document.createElement( "h1" );
+        result.appendChild( document.createTextNode( " " ) );
+        return result;
+    }
+
+    /** @return new detached <code>h2</code> element holding a single space */
+    public Element createHeader2()
+    {
+        Element result = document.createElement( "h2" );
+        result.appendChild( document.createTextNode( " " ) );
+        return result;
+    }
+
+    /** @return new detached <code>p</code> element */
+    public Element createParagraph()
+    {
+        return document.createElement( "p" );
+    }
+
+    /** @return new detached <code>table</code> element */
+    public Element createTable()
+    {
+        return document.createElement( "table" );
+    }
+
+    /** @return new detached <code>tbody</code> element */
+    public Element createTableBody()
+    {
+        return document.createElement( "tbody" );
+    }
+
+    /** @return new detached <code>td</code> element */
+    public Element createTableCell()
+    {
+        return document.createElement( "td" );
+    }
+
+    /** @return new detached <code>tr</code> element */
+    public Element createTableRow()
+    {
+        return document.createElement( "tr" );
+    }
+
+    /**
+     * @param data
+     *            character data of the node
+     * @return new detached text node
+     */
+    public Text createText( String data )
+    {
+        return document.createTextNode( data );
+    }
+
+    /** @return new detached <code>ul</code> element */
+    public Element createUnorderedList()
+    {
+        return document.createElement( "ul" );
+    }
+
+    public Element getBody()
+    {
+        return body;
+    }
+
+    public Document getDocument()
+    {
+        return document;
+    }
+
+    public Element getHead()
+    {
+        return head;
+    }
+
+    /**
+     * @return current title text, or <code>null</code> if the title element
+     *         has been removed by {@link #setTitle(String)}
+     */
+    public String getTitle()
+    {
+        if ( title == null )
+            return null;
+
+        return titleText.getTextContent();
+    }
+
+    /**
+     * Sets the document title. An empty title (as decided by
+     * <code>WordToHtmlUtils.isEmpty</code>) removes the title element from
+     * the head; otherwise the title element is created if necessary and its
+     * text is replaced.
+     *
+     * @param titleText
+     *            new title
+     */
+    public void setTitle( String titleText )
+    {
+        if ( WordToHtmlUtils.isEmpty( titleText ) )
+        {
+            // nothing to show: drop the element and stop here, so that it is
+            // not re-created right away with empty content
+            if ( this.title != null )
+            {
+                this.head.removeChild( this.title );
+                this.title = null;
+                this.titleText = null;
+            }
+            return;
+        }
+
+        if ( this.title == null )
+        {
+            this.title = document.createElement( "title" );
+            this.titleText = document.createTextNode( titleText );
+            this.title.appendChild( this.titleText );
+            this.head.appendChild( this.title );
+            return;
+        }
+
+        this.titleText.setData( titleText );
+    }
+}
diff --git 
a/src/scratchpad/src/org/apache/poi/hwpf/converter/WordToFoConverter.java b/src/scratchpad/src/org/apache/poi/hwpf/converter/WordToFoConverter.java index 2bf9dc55fe..dd9cfe8824 100644 --- a/src/scratchpad/src/org/apache/poi/hwpf/converter/WordToFoConverter.java +++ b/src/scratchpad/src/org/apache/poi/hwpf/converter/WordToFoConverter.java @@ -276,7 +276,7 @@ public class WordToFoConverter extends AbstractWordConverter } @Override - protected void processEndnoteAutonumbered( HWPFDocument doc, int noteIndex, + protected void processEndnoteAutonumbered( HWPFDocument wordDocument, int noteIndex, Element block, Range endnoteTextRange ) { final String textIndex = String.valueOf( internalLinkCounter @@ -297,14 +297,14 @@ public class WordToFoConverter extends AbstractWordConverter setId( backwardLink, forwardLinkName ); endnote.appendChild( backwardLink ); - processCharacters( doc, Integer.MIN_VALUE, endnoteTextRange, endnote ); + processCharacters( wordDocument, Integer.MIN_VALUE, endnoteTextRange, endnote ); WordToFoUtils.compactInlines( endnote ); this.endnotes.add( endnote ); } @Override - protected void processFootnoteAutonumbered( HWPFDocument doc, + protected void processFootnoteAutonumbered( HWPFDocument wordDocument, int noteIndex, Element block, Range footnoteTextRange ) { final String textIndex = String.valueOf( internalLinkCounter @@ -333,7 +333,7 @@ public class WordToFoConverter extends AbstractWordConverter footnoteBody.appendChild( footnoteBlock ); footNote.appendChild( footnoteBody ); - processCharacters( doc, Integer.MIN_VALUE, footnoteTextRange, + processCharacters( wordDocument, Integer.MIN_VALUE, footnoteTextRange, footnoteBlock ); WordToFoUtils.compactInlines( footnoteBlock ); diff --git a/src/scratchpad/src/org/apache/poi/hwpf/converter/WordToHtmlConverter.java b/src/scratchpad/src/org/apache/poi/hwpf/converter/WordToHtmlConverter.java index a0af195590..040d32a879 100644 --- a/src/scratchpad/src/org/apache/poi/hwpf/converter/WordToHtmlConverter.java 
+++ b/src/scratchpad/src/org/apache/poi/hwpf/converter/WordToHtmlConverter.java @@ -282,17 +282,17 @@ public class WordToHtmlConverter extends AbstractWordConverter } @Override - protected void processEndnoteAutonumbered( HWPFDocument doc, int noteIndex, + protected void processEndnoteAutonumbered( HWPFDocument wordDocument, int noteIndex, Element block, Range endnoteTextRange ) { - processNoteAutonumbered( doc, "end", noteIndex, block, endnoteTextRange ); + processNoteAutonumbered( wordDocument, "end", noteIndex, block, endnoteTextRange ); } @Override - protected void processFootnoteAutonumbered( HWPFDocument doc, + protected void processFootnoteAutonumbered( HWPFDocument wordDocument, int noteIndex, Element block, Range footnoteTextRange ) { - processNoteAutonumbered( doc, "foot", noteIndex, block, + processNoteAutonumbered( wordDocument, "foot", noteIndex, block, footnoteTextRange ); } @@ -508,11 +508,11 @@ public class WordToHtmlConverter extends AbstractWordConverter } protected void processParagraph( HWPFDocumentCore hwpfDocument, - Element parentFopElement, int currentTableLevel, - Paragraph paragraph, String bulletText ) + Element parentElement, int currentTableLevel, Paragraph paragraph, + String bulletText ) { final Element pElement = htmlDocumentFacade.createParagraph(); - parentFopElement.appendChild( pElement ); + parentElement.appendChild( pElement ); StringBuilder style = new StringBuilder(); WordToHtmlUtils.addParagraphProperties( paragraph, style ); diff --git a/src/scratchpad/src/org/apache/poi/hwpf/converter/WordToTextConverter.java b/src/scratchpad/src/org/apache/poi/hwpf/converter/WordToTextConverter.java new file mode 100644 index 0000000000..edea5dab6c --- /dev/null +++ b/src/scratchpad/src/org/apache/poi/hwpf/converter/WordToTextConverter.java @@ -0,0 +1,288 @@ +package org.apache.poi.hwpf.converter; + +import java.io.File; +import java.io.FileWriter; +import java.util.List; +import java.util.concurrent.atomic.AtomicInteger; + +import 
javax.xml.parsers.DocumentBuilderFactory;
+import javax.xml.transform.OutputKeys;
+import javax.xml.transform.Transformer;
+import javax.xml.transform.TransformerFactory;
+import javax.xml.transform.dom.DOMSource;
+import javax.xml.transform.stream.StreamResult;
+
+import org.apache.poi.hpsf.SummaryInformation;
+import org.apache.poi.hwpf.HWPFDocument;
+import org.apache.poi.hwpf.HWPFDocumentCore;
+import org.apache.poi.hwpf.usermodel.Bookmark;
+import org.apache.poi.hwpf.usermodel.CharacterRun;
+import org.apache.poi.hwpf.usermodel.OfficeDrawing;
+import org.apache.poi.hwpf.usermodel.Paragraph;
+import org.apache.poi.hwpf.usermodel.Picture;
+import org.apache.poi.hwpf.usermodel.Range;
+import org.apache.poi.hwpf.usermodel.Section;
+import org.apache.poi.hwpf.usermodel.Table;
+import org.apache.poi.hwpf.usermodel.TableCell;
+import org.apache.poi.hwpf.usermodel.TableRow;
+import org.apache.poi.util.Beta;
+import org.w3c.dom.Document;
+import org.w3c.dom.Element;
+
+/**
+ * Converts a Word document into a DOM tree whose text nodes carry the plain
+ * text of the document. The tree is meant to be written out with a
+ * {@link Transformer} using the "text" output method, as {@link #main} does.
+ * Images and drawn objects are ignored; footnotes and endnotes are collected
+ * and appended after the document body.
+ */
+@Beta
+public class WordToTextConverter extends AbstractWordConverter
+{
+
+    /**
+     * Java main() interface to interact with {@link WordToTextConverter}
+     *
+     * <p>
+     * Usage: WordToTextConverter infile outfile
+     * </p>
+     * Where infile is an input .doc file ( Word 95-2007) which will be rendered
+     * as plain text into outfile
+     */
+    public static void main( String[] args )
+    {
+        if ( args.length < 2 )
+        {
+            System.err
+                    .println( "Usage: WordToTextConverter <inputFile.doc> <saveTo.txt>" );
+            return;
+        }
+
+        System.out.println( "Converting " + args[0] );
+        System.out.println( "Saving output to " + args[1] );
+        try
+        {
+            Document doc = WordToTextConverter.process( new File( args[0] ) );
+
+            TransformerFactory tf = TransformerFactory.newInstance();
+            Transformer serializer = tf.newTransformer();
+            // TODO set encoding from a command argument
+            serializer.setOutputProperty( OutputKeys.ENCODING, "UTF-8" );
+            serializer.setOutputProperty( OutputKeys.INDENT, "no" );
+            serializer.setOutputProperty( OutputKeys.METHOD, "text" );
+
+            // Hand the serializer a byte stream so that it performs the
+            // UTF-8 encoding itself; a FileWriter would re-encode the
+            // characters with the platform default charset.
+            // Fully qualified to keep the import list of the file untouched.
+            java.io.OutputStream out = new java.io.FileOutputStream( args[1] );
+            try
+            {
+                serializer.transform( new DOMSource( doc ), new StreamResult(
+                        out ) );
+            }
+            finally
+            {
+                // release the file even if the transformation fails
+                out.close();
+            }
+        }
+        catch ( Exception e )
+        {
+            e.printStackTrace();
+        }
+    }
+
+    /**
+     * Loads a Word file and converts it into a freshly created DOM document.
+     *
+     * @param docFile
+     *            Word file to load
+     * @return DOM document filled by the converter
+     * @throws Exception
+     *             if the file cannot be loaded or the DOM document cannot be
+     *             created
+     */
+    static Document process( File docFile ) throws Exception
+    {
+        final HWPFDocumentCore wordDocument = AbstractWordUtils
+                .loadDoc( docFile );
+        WordToTextConverter wordToTextConverter = new WordToTextConverter(
+                DocumentBuilderFactory.newInstance().newDocumentBuilder()
+                        .newDocument() );
+        wordToTextConverter.processDocument( wordDocument );
+        return wordToTextConverter.getDocument();
+    }
+
+    /** Source of the numbers shown for footnotes and endnotes, starts at 1. */
+    private final AtomicInteger noteCounters = new AtomicInteger( 1 );
+
+    /** Block collecting all notes; created when the first note is met. */
+    private Element notes = null;
+
+    private final TextDocumentFacade textDocumentFacade;
+
+    /**
+     * Creates new instance of {@link WordToTextConverter}. Can be used to
+     * output several {@link HWPFDocument}s into a single text document.
+     *
+     * @param document
+     *            XML DOM Document used as storage for text pieces
+     */
+    public WordToTextConverter( Document document )
+    {
+        this.textDocumentFacade = new TextDocumentFacade( document );
+    }
+
+    /** @return DOM document that receives the converted text */
+    public Document getDocument()
+    {
+        return textDocumentFacade.getDocument();
+    }
+
+    /** Appends the text as a text node; the run's formatting is not used. */
+    @Override
+    protected void outputCharacters( Element block, CharacterRun characterRun,
+            String text )
+    {
+        block.appendChild( textDocumentFacade.createText( text ) );
+    }
+
+    /** Bookmarks leave no trace in the output; only their text is processed. */
+    @Override
+    protected void processBookmarks( HWPFDocumentCore wordDocument,
+            Element currentBlock, Range range, int currentTableLevel,
+            List rangeBookmarks )
+    {
+        processCharacters( wordDocument, currentTableLevel, range, currentBlock );
+    }
+
+    /**
+     * Converts the document and then appends the collected notes, if any,
+     * at the end of the body.
+     */
+    @Override
+    public void processDocument( HWPFDocumentCore wordDocument )
+    {
+        super.processDocument( wordDocument );
+
+        if ( notes != null )
+            textDocumentFacade.getBody().appendChild( notes );
+    }
+
+    /** Copies the non-empty title, author, comments and keywords. */
+    @Override
+    protected void processDocumentInformation(
+            SummaryInformation summaryInformation )
+    {
+        if ( AbstractWordUtils.isNotEmpty( summaryInformation.getTitle() ) )
+            textDocumentFacade.setTitle( summaryInformation.getTitle() );
+
+        if ( AbstractWordUtils.isNotEmpty( summaryInformation.getAuthor() ) )
+            textDocumentFacade.addAuthor( summaryInformation.getAuthor() );
+
+        if ( AbstractWordUtils.isNotEmpty( summaryInformation.getComments() ) )
+            textDocumentFacade
+                    .addDescription( summaryInformation.getComments() );
+
+        if ( AbstractWordUtils.isNotEmpty( summaryInformation.getKeywords() ) )
+            textDocumentFacade.addKeywords( summaryInformation.getKeywords() );
+    }
+
+    @Override
+    protected void processDrawnObject( HWPFDocument doc,
+            CharacterRun characterRun, OfficeDrawing officeDrawing,
+            String path, Element block )
+    {
+        // ignore
+    }
+
+    /** Endnotes are rendered like any other note, see {@link #processNote}. */
+    @Override
+    protected void processEndnoteAutonumbered( HWPFDocument wordDocument,
+            int noteIndex, Element block, Range endnoteTextRange )
+    {
+        processNote( wordDocument, block, endnoteTextRange );
+    }
+
+    /** Footnotes are rendered like any other note, see {@link #processNote}. */
+    @Override
+    protected void processFootnoteAutonumbered( HWPFDocument wordDocument,
+            int noteIndex, Element block, Range footnoteTextRange )
+    {
+        processNote( wordDocument, block, footnoteTextRange );
+    }
+
+    /**
+     * Outputs the link text followed by the target in parentheses. Zero
+     * width spaces are put around the target and around each slash in it.
+     */
+    @Override
+    protected void processHyperlink( HWPFDocumentCore wordDocument,
+            Element currentBlock, Range textRange, int currentTableLevel,
+            String hyperlink )
+    {
+        processCharacters( wordDocument, currentTableLevel, textRange,
+                currentBlock );
+
+        // plain character replacement, no regular expression required
+        final String slash = UNICODECHAR_ZERO_WIDTH_SPACE + "/"
+                + UNICODECHAR_ZERO_WIDTH_SPACE;
+
+        currentBlock.appendChild( textDocumentFacade.createText( " ("
+                + UNICODECHAR_ZERO_WIDTH_SPACE
+                + hyperlink.replace( "/", slash )
+                + UNICODECHAR_ZERO_WIDTH_SPACE + ")" ) );
+    }
+
+    @Override
+    protected void processImage( Element currentBlock, boolean inlined,
+            Picture picture )
+    {
+        // ignore
+    }
+
+    @Override
+    protected void processLineBreak( Element block, CharacterRun characterRun )
+    {
+        block.appendChild( textDocumentFacade.createText( "\n" ) );
+    }
+
+    /**
+     * Puts a <code>[n]</code> marker at the place of the note reference and
+     * stores the note text, prefixed with <code>^n</code>, in the notes
+     * block.
+     *
+     * @param wordDocument
+     *            document being converted
+     * @param block
+     *            element that receives the reference marker
+     * @param noteTextRange
+     *            range holding the text of the note
+     */
+    protected void processNote( HWPFDocument wordDocument, Element block,
+            Range noteTextRange )
+    {
+        final int noteIndex = noteCounters.getAndIncrement();
+        block.appendChild( textDocumentFacade
+                .createText( UNICODECHAR_ZERO_WIDTH_SPACE + "[" + noteIndex
+                        + "]" + UNICODECHAR_ZERO_WIDTH_SPACE ) );
+
+        if ( notes == null )
+            notes = textDocumentFacade.createBlock();
+
+        Element note = textDocumentFacade.createBlock();
+        notes.appendChild( note );
+
+        note.appendChild( textDocumentFacade.createText( "^" + noteIndex
+                + "\t " ) );
+        processCharacters( wordDocument, Integer.MIN_VALUE, noteTextRange, note );
+        note.appendChild( textDocumentFacade.createText( "\n" ) );
+    }
+
+    /** Only the text of the page reference is output. */
+    @Override
+    protected void processPageref( HWPFDocumentCore wordDocument,
+            Element currentBlock, Range textRange, int currentTableLevel,
+            String pageref )
+    {
+        processCharacters( wordDocument, currentTableLevel, textRange,
+                currentBlock );
+    }
+
+    /** Outputs bullet text, paragraph text and a terminating line feed. */
+    @Override
+    protected void processParagraph( HWPFDocumentCore wordDocument,
+            Element parentElement, int currentTableLevel, Paragraph paragraph,
+            String bulletText )
+    {
+        Element pElement = textDocumentFacade.createParagraph();
+        pElement.appendChild( textDocumentFacade.createText( bulletText ) );
+        processCharacters( wordDocument, currentTableLevel, paragraph, pElement );
+        pElement.appendChild( textDocumentFacade.createText( "\n" ) );
+        parentElement.appendChild( pElement );
+    }
+
+    /** Outputs the paragraphs of the section followed by a line feed. */
+    @Override
+    protected void processSection( HWPFDocumentCore wordDocument,
+            Section section, int s )
+    {
+        Element sectionElement = textDocumentFacade.createBlock();
+        processParagraphes( wordDocument, sectionElement, section,
+                Integer.MIN_VALUE );
+        sectionElement.appendChild( textDocumentFacade.createText( "\n" ) );
+        textDocumentFacade.getBody().appendChild( sectionElement );
+    }
+
+    /**
+     * Outputs one line per table row; a tab is put in front of every cell
+     * but the first one.
+     */
+    protected void processTable( HWPFDocumentCore hwpfDocument, Element flow,
+            Table table )
+    {
+        final int tableRows = table.numRows();
+        for ( int r = 0; r < tableRows; r++ )
+        {
+            TableRow tableRow = table.getRow( r );
+
+            Element tableRowElement = textDocumentFacade.createTableRow();
+
+            final int rowCells = tableRow.numCells();
+            for ( int c = 0; c < rowCells; c++ )
+            {
+                TableCell tableCell = tableRow.getCell( c );
+
+                Element tableCellElement = textDocumentFacade.createTableCell();
+
+                if ( c != 0 )
+                    tableCellElement.appendChild( textDocumentFacade
+                            .createText( "\t" ) );
+
+                processParagraphes( hwpfDocument, tableCellElement, tableCell,
+                        table.getTableLevel() );
+                tableRowElement.appendChild( tableCellElement );
+            }
+
+            tableRowElement.appendChild( textDocumentFacade.createText( "\n" ) );
+            flow.appendChild( tableRowElement );
+        }
+    }
+
+}
diff --git a/src/scratchpad/testcases/org/apache/poi/hwpf/converter/TestWordToConverterSuite.java b/src/scratchpad/testcases/org/apache/poi/hwpf/converter/TestWordToConverterSuite.java
index 0de7ab3ef7..52473c824a 100644
--- a/src/scratchpad/testcases/org/apache/poi/hwpf/converter/TestWordToConverterSuite.java
+++ 
b/src/scratchpad/testcases/org/apache/poi/hwpf/converter/TestWordToConverterSuite.java @@ -45,7 +45,8 @@ public class TestWordToConverterSuite public static Test suite() { - TestSuite suite = new TestSuite(TestWordToConverterSuite.class.getName()); + TestSuite suite = new TestSuite( + TestWordToConverterSuite.class.getName() ); File directory = POIDataSamples.getDocumentInstance().getFile( "../document" ); @@ -63,14 +64,21 @@ public class TestWordToConverterSuite { public void runTest() throws Exception { - test( child, false ); + testFo( child ); } } ); suite.addTest( new TestCase( name + " [HTML]" ) { public void runTest() throws Exception { - test( child, true ); + testHtml( child ); + } + } ); + suite.addTest( new TestCase( name + " [TEXT]" ) + { + public void runTest() throws Exception + { + testText( child ); } } ); @@ -79,7 +87,7 @@ public class TestWordToConverterSuite return suite; } - protected static void test( File child, boolean html ) throws Exception + protected static void testFo( File child ) throws Exception { HWPFDocumentCore hwpfDocument; try @@ -88,7 +96,6 @@ public class TestWordToConverterSuite } catch ( Exception exc ) { - // unable to parse file -- not WordToFoConverter fault return; } @@ -102,14 +109,74 @@ public class TestWordToConverterSuite Transformer transformer = TransformerFactory.newInstance() .newTransformer(); transformer.setOutputProperty( OutputKeys.ENCODING, "utf-8" ); - transformer.setOutputProperty( OutputKeys.INDENT, "yes" ); + transformer.setOutputProperty( OutputKeys.INDENT, "false" ); transformer.transform( new DOMSource( wordToFoConverter.getDocument() ), new StreamResult( stringWriter ) ); - if ( html ) - transformer.setOutputProperty( OutputKeys.METHOD, "html" ); + // no exceptions + } + + protected static void testHtml( File child ) throws Exception + { + HWPFDocumentCore hwpfDocument; + try + { + hwpfDocument = AbstractWordUtils.loadDoc( child ); + } + catch ( Exception exc ) + { + return; + } + + 
WordToHtmlConverter wordToHtmlConverter = new WordToHtmlConverter( + DocumentBuilderFactory.newInstance().newDocumentBuilder() + .newDocument() ); + wordToHtmlConverter.processDocument( hwpfDocument ); + + StringWriter stringWriter = new StringWriter(); + + Transformer transformer = TransformerFactory.newInstance() + .newTransformer(); + transformer.setOutputProperty( OutputKeys.ENCODING, "utf-8" ); + transformer.setOutputProperty( OutputKeys.INDENT, "false" ); + transformer.setOutputProperty( OutputKeys.METHOD, "html" ); + transformer.transform( + new DOMSource( wordToHtmlConverter.getDocument() ), + new StreamResult( stringWriter ) ); + + // no exceptions + } + + protected static void testText( File child ) throws Exception + { + HWPFDocumentCore wordDocument; + try + { + wordDocument = AbstractWordUtils.loadDoc( child ); + } + catch ( Exception exc ) + { + return; + } + + WordToTextConverter wordToTextConverter = new WordToTextConverter( + DocumentBuilderFactory.newInstance().newDocumentBuilder() + .newDocument() ); + wordToTextConverter.processDocument( wordDocument ); + + StringWriter stringWriter = new StringWriter(); + + Transformer transformer = TransformerFactory.newInstance() + .newTransformer(); + transformer.setOutputProperty( OutputKeys.ENCODING, "utf-8" ); + transformer.setOutputProperty( OutputKeys.INDENT, "yes" ); + transformer.setOutputProperty( OutputKeys.METHOD, "text" ); + transformer.transform( + new DOMSource( wordToTextConverter.getDocument() ), + new StreamResult( stringWriter ) ); + stringWriter.toString(); // no exceptions } }