source.dussan.org Git - poi.git/commitdiff
add Word-to-Text converter
author: Sergey Vladimirov <sergey@apache.org>
Tue, 9 Aug 2011 09:25:59 +0000 (09:25 +0000)
committer: Sergey Vladimirov <sergey@apache.org>
Tue, 9 Aug 2011 09:25:59 +0000 (09:25 +0000)
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1155281 13f79535-47bb-0310-9956-ffa450edef68

src/scratchpad/src/org/apache/poi/hwpf/converter/AbstractWordConverter.java
src/scratchpad/src/org/apache/poi/hwpf/converter/TextDocumentFacade.java [new file with mode: 0644]
src/scratchpad/src/org/apache/poi/hwpf/converter/WordToFoConverter.java
src/scratchpad/src/org/apache/poi/hwpf/converter/WordToHtmlConverter.java
src/scratchpad/src/org/apache/poi/hwpf/converter/WordToTextConverter.java [new file with mode: 0644]
src/scratchpad/testcases/org/apache/poi/hwpf/converter/TestWordToConverterSuite.java

index 3612011dd53bf58fc395b2ed853e262360485240..e71aed0ac69f49ca7210a669888f87808de07758 100644 (file)
@@ -71,9 +71,9 @@ public abstract class AbstractWordConverter
 
     private static final byte SPECCHAR_DRAWN_OBJECT = 8;
 
-    private static final char UNICODECHAR_NONBREAKING_HYPHEN = '\u2011';
+    protected static final char UNICODECHAR_NONBREAKING_HYPHEN = '\u2011';
 
-    private static final char UNICODECHAR_ZERO_WIDTH_SPACE = '\u200b';
+    protected static final char UNICODECHAR_ZERO_WIDTH_SPACE = '\u200b';
 
     private static void addToStructures( List<Structure> structures,
             Structure structure )
@@ -205,7 +205,7 @@ public abstract class AbstractWordConverter
             Element currentBlock, Range range, int currentTableLevel,
             List<Bookmark> rangeBookmarks );
 
-    protected boolean processCharacters( final HWPFDocumentCore document,
+    protected boolean processCharacters( final HWPFDocumentCore wordDocument,
             final int currentTableLevel, final Range range, final Element block )
     {
         if ( range == null )
@@ -220,9 +220,9 @@ public abstract class AbstractWordConverter
          * reconstruct the structure of range -- sergey
          */
         List<Structure> structures = new LinkedList<Structure>();
-        if ( document instanceof HWPFDocument )
+        if ( wordDocument instanceof HWPFDocument )
         {
-            final HWPFDocument doc = (HWPFDocument) document;
+            final HWPFDocument doc = (HWPFDocument) wordDocument;
 
             Map<Integer, List<Bookmark>> rangeBookmarks = doc.getBookmarks()
                     .getBookmarksStartedBetween( range.getStartOffset(),
@@ -247,7 +247,7 @@ public abstract class AbstractWordConverter
                 CharacterRun characterRun = range.getCharacterRun( c );
                 if ( characterRun == null )
                     throw new AssertionError();
-                Field aliveField = ( (HWPFDocument) document ).getFields()
+                Field aliveField = ( (HWPFDocument) wordDocument ).getFields()
                         .getFieldByStartOffset( FieldsDocumentPart.MAIN,
                                 characterRun.getStartOffset() );
                 if ( aliveField != null )
@@ -273,14 +273,15 @@ public abstract class AbstractWordConverter
                         return "BetweenStructuresSubrange " + super.toString();
                     }
                 };
-                processCharacters( document, currentTableLevel, subrange, block );
+                processCharacters( wordDocument, currentTableLevel, subrange,
+                        block );
             }
 
             if ( structure.structure instanceof Bookmark )
             {
                 // other bookmarks with same bundaries
                 List<Bookmark> bookmarks = new LinkedList<Bookmark>();
-                for ( Bookmark bookmark : ( (HWPFDocument) document )
+                for ( Bookmark bookmark : ( (HWPFDocument) wordDocument )
                         .getBookmarks()
                         .getBookmarksStartedBetween( structure.start,
                                 structure.start + 1 ).values().iterator()
@@ -306,7 +307,7 @@ public abstract class AbstractWordConverter
                         }
                     };
 
-                    processBookmarks( document, block, subrange,
+                    processBookmarks( wordDocument, block, subrange,
                             currentTableLevel, bookmarks );
                 }
                 finally
@@ -317,7 +318,7 @@ public abstract class AbstractWordConverter
             else if ( structure.structure instanceof Field )
             {
                 Field field = (Field) structure.structure;
-                processField( (HWPFDocument) document, range,
+                processField( (HWPFDocument) wordDocument, range,
                         currentTableLevel, field, block );
             }
             else
@@ -349,7 +350,8 @@ public abstract class AbstractWordConverter
                         return "AfterStructureSubrange " + super.toString();
                     }
                 };
-                processCharacters( document, currentTableLevel, subrange, block );
+                processCharacters( wordDocument, currentTableLevel, subrange,
+                        block );
             }
             return true;
         }
@@ -361,11 +363,11 @@ public abstract class AbstractWordConverter
             if ( characterRun == null )
                 throw new AssertionError();
 
-            if ( document instanceof HWPFDocument
-                    && ( (HWPFDocument) document ).getPicturesTable()
+            if ( wordDocument instanceof HWPFDocument
+                    && ( (HWPFDocument) wordDocument ).getPicturesTable()
                             .hasPicture( characterRun ) )
             {
-                HWPFDocument newFormat = (HWPFDocument) document;
+                HWPFDocument newFormat = (HWPFDocument) wordDocument;
                 Picture picture = newFormat.getPicturesTable().extractPicture(
                         characterRun, true );
 
@@ -381,16 +383,16 @@ public abstract class AbstractWordConverter
             if ( characterRun.isSpecialCharacter() )
             {
                 if ( text.charAt( 0 ) == SPECCHAR_AUTONUMBERED_FOOTNOTE_REFERENCE
-                        && ( document instanceof HWPFDocument ) )
+                        && ( wordDocument instanceof HWPFDocument ) )
                 {
-                    HWPFDocument doc = (HWPFDocument) document;
+                    HWPFDocument doc = (HWPFDocument) wordDocument;
                     processNoteAnchor( doc, characterRun, block );
                     continue;
                 }
                 if ( text.charAt( 0 ) == SPECCHAR_DRAWN_OBJECT
-                        && ( document instanceof HWPFDocument ) )
+                        && ( wordDocument instanceof HWPFDocument ) )
                 {
-                    HWPFDocument doc = (HWPFDocument) document;
+                    HWPFDocument doc = (HWPFDocument) wordDocument;
                     processDrawnObject( doc, characterRun, block );
                     continue;
                 }
@@ -398,14 +400,15 @@ public abstract class AbstractWordConverter
 
             if ( text.getBytes()[0] == FIELD_BEGIN_MARK )
             {
-                if ( document instanceof HWPFDocument )
+                if ( wordDocument instanceof HWPFDocument )
                 {
-                    Field aliveField = ( (HWPFDocument) document ).getFields()
-                            .getFieldByStartOffset( FieldsDocumentPart.MAIN,
+                    Field aliveField = ( (HWPFDocument) wordDocument )
+                            .getFields().getFieldByStartOffset(
+                                    FieldsDocumentPart.MAIN,
                                     characterRun.getStartOffset() );
                     if ( aliveField != null )
                     {
-                        processField( ( (HWPFDocument) document ), range,
+                        processField( ( (HWPFDocument) wordDocument ), range,
                                 currentTableLevel, aliveField, block );
 
                         int continueAfter = aliveField.getFieldEndOffset();
@@ -420,8 +423,8 @@ public abstract class AbstractWordConverter
                     }
                 }
 
-                int skipTo = tryDeadField( document, range, currentTableLevel,
-                        c, block );
+                int skipTo = tryDeadField( wordDocument, range,
+                        currentTableLevel, c, block );
 
                 if ( skipTo != c )
                 {
@@ -610,7 +613,7 @@ public abstract class AbstractWordConverter
             CharacterRun characterRun, OfficeDrawing officeDrawing,
             String path, Element block );
 
-    protected abstract void processEndnoteAutonumbered( HWPFDocument doc,
+    protected abstract void processEndnoteAutonumbered( HWPFDocument wordDocument,
             int noteIndex, Element block, Range endnoteTextRange );
 
     protected void processField( HWPFDocument hwpfDocument, Range parentRange,
@@ -666,7 +669,7 @@ public abstract class AbstractWordConverter
                 field.secondSubrange( parentRange ), currentBlock );
     }
 
-    protected abstract void processFootnoteAutonumbered( HWPFDocument doc,
+    protected abstract void processFootnoteAutonumbered( HWPFDocument wordDocument,
             int noteIndex, Element block, Range footnoteTextRange );
 
     protected abstract void processHyperlink( HWPFDocumentCore wordDocument,
@@ -734,8 +737,8 @@ public abstract class AbstractWordConverter
             String pageref );
 
     protected abstract void processParagraph( HWPFDocumentCore wordDocument,
-            Element parentFopElement, int currentTableLevel,
-            Paragraph paragraph, String bulletText );
+            Element parentElement, int currentTableLevel, Paragraph paragraph,
+            String bulletText );
 
     protected void processParagraphes( HWPFDocumentCore wordDocument,
             Element flow, Range range, int currentTableLevel )
diff --git a/src/scratchpad/src/org/apache/poi/hwpf/converter/TextDocumentFacade.java b/src/scratchpad/src/org/apache/poi/hwpf/converter/TextDocumentFacade.java
new file mode 100644 (file)
index 0000000..4eb352c
--- /dev/null
@@ -0,0 +1,179 @@
+/* ====================================================================
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+==================================================================== */
+package org.apache.poi.hwpf.converter;
+
+import org.apache.poi.util.Beta;
+import org.w3c.dom.Document;
+import org.w3c.dom.Element;
+import org.w3c.dom.Text;
+
+/**
+ * Thin wrapper around a DOM {@link Document} that builds the element tree
+ * used for plain-text output.
+ * <p>
+ * The constructor creates an <tt>html</tt> root element with <tt>head</tt>
+ * and <tt>body</tt> children. The <tt>create*</tt> methods only create
+ * detached nodes; attaching them to the tree is up to the caller.
+ * </p>
+ */
+@Beta
+public class TextDocumentFacade
+{
+    protected final Element body;
+    protected final Document document;
+    protected final Element head;
+    protected final Element root;
+
+    /** Current title element, or <tt>null</tt> if the title was removed */
+    protected Element title;
+
+    /** Text node inside {@link #title}; <tt>null</tt> when title is null */
+    protected Text titleText;
+
+    /**
+     * Creates the <tt>html</tt> / <tt>head</tt> / <tt>body</tt> skeleton
+     * inside the specified document together with an empty title.
+     * 
+     * @param document
+     *            DOM document that will own all created nodes; the root
+     *            element is appended directly to it
+     */
+    public TextDocumentFacade( Document document )
+    {
+        this.document = document;
+
+        root = document.createElement( "html" );
+        document.appendChild( root );
+
+        body = document.createElement( "body" );
+        head = document.createElement( "head" );
+
+        root.appendChild( head );
+        root.appendChild( body );
+
+        title = document.createElement( "title" );
+        titleText = document.createTextNode( "" );
+        // the text node has to be a child of the title element, otherwise
+        // setTitle() updates a detached node and the title is never output
+        title.appendChild( titleText );
+        head.appendChild( title );
+    }
+
+    /** Adds an "Author" meta entry to the head. */
+    public void addAuthor( String value )
+    {
+        addMeta( "Author", value );
+    }
+
+    /** Adds a "Description" meta entry to the head. */
+    public void addDescription( String value )
+    {
+        addMeta( "Description", value );
+    }
+
+    /** Adds a "Keywords" meta entry to the head. */
+    public void addKeywords( String value )
+    {
+        addMeta( "Keywords", value );
+    }
+
+    /**
+     * Appends a <tt>meta</tt> element to the head. Its text content is the
+     * name followed by ": " and the value followed by a line feed, so each
+     * entry occupies one line when the tree is serialized as text.
+     * 
+     * @param name
+     *            entry name
+     * @param value
+     *            entry value
+     */
+    public void addMeta( final String name, String value )
+    {
+        Element meta = document.createElement( "meta" );
+
+        Element metaName = document.createElement( "name" );
+        metaName.appendChild( document.createTextNode( name + ": " ) );
+        meta.appendChild( metaName );
+
+        Element metaValue = document.createElement( "value" );
+        metaValue.appendChild( document.createTextNode( value + "\n" ) );
+        meta.appendChild( metaValue );
+
+        head.appendChild( meta );
+    }
+
+    /** @return new detached <tt>div</tt> element */
+    public Element createBlock()
+    {
+        return document.createElement( "div" );
+    }
+
+    /** @return new detached <tt>h1</tt> element starting with eight spaces */
+    public Element createHeader1()
+    {
+        Element result = document.createElement( "h1" );
+        result.appendChild( document.createTextNode( "        " ) );
+        return result;
+    }
+
+    /** @return new detached <tt>h2</tt> element starting with four spaces */
+    public Element createHeader2()
+    {
+        Element result = document.createElement( "h2" );
+        result.appendChild( document.createTextNode( "    " ) );
+        return result;
+    }
+
+    /** @return new detached <tt>p</tt> element */
+    public Element createParagraph()
+    {
+        return document.createElement( "p" );
+    }
+
+    /** @return new detached <tt>table</tt> element */
+    public Element createTable()
+    {
+        return document.createElement( "table" );
+    }
+
+    /** @return new detached <tt>tbody</tt> element */
+    public Element createTableBody()
+    {
+        return document.createElement( "tbody" );
+    }
+
+    /** @return new detached <tt>td</tt> element */
+    public Element createTableCell()
+    {
+        return document.createElement( "td" );
+    }
+
+    /** @return new detached <tt>tr</tt> element */
+    public Element createTableRow()
+    {
+        return document.createElement( "tr" );
+    }
+
+    /** @return new detached text node with the specified content */
+    public Text createText( String data )
+    {
+        return document.createTextNode( data );
+    }
+
+    /** @return new detached <tt>ul</tt> element */
+    public Element createUnorderedList()
+    {
+        return document.createElement( "ul" );
+    }
+
+    public Element getBody()
+    {
+        return body;
+    }
+
+    public Document getDocument()
+    {
+        return document;
+    }
+
+    public Element getHead()
+    {
+        return head;
+    }
+
+    /**
+     * @return current title text, or <tt>null</tt> if the title element was
+     *         removed by {@link #setTitle(String)}
+     */
+    public String getTitle()
+    {
+        if ( title == null )
+            return null;
+
+        return titleText.getTextContent();
+    }
+
+    /**
+     * Sets the document title.
+     * <p>
+     * A value that {@link WordToHtmlUtils#isEmpty(String)} reports as empty
+     * removes the title element from the head (if present) and nothing else
+     * happens. Otherwise the title element is created on demand and its
+     * text is replaced.
+     * </p>
+     * 
+     * @param titleText
+     *            new title text
+     */
+    public void setTitle( String titleText )
+    {
+        if ( WordToHtmlUtils.isEmpty( titleText ) )
+        {
+            if ( this.title != null )
+            {
+                this.head.removeChild( this.title );
+                this.title = null;
+                this.titleText = null;
+            }
+            // do not fall through: an empty title must not be re-created
+            return;
+        }
+
+        if ( this.title == null )
+        {
+            this.title = document.createElement( "title" );
+            this.titleText = document.createTextNode( titleText );
+            this.title.appendChild( this.titleText );
+            this.head.appendChild( title );
+            return;
+        }
+
+        this.titleText.setData( titleText );
+    }
+}
index 2bf9dc55fe42840acab240cb551b116daf4137b3..dd9cfe8824055376b2d6f9b1e4d527d6ac759fc7 100644 (file)
@@ -276,7 +276,7 @@ public class WordToFoConverter extends AbstractWordConverter
     }
 
     @Override
-    protected void processEndnoteAutonumbered( HWPFDocument doc, int noteIndex,
+    protected void processEndnoteAutonumbered( HWPFDocument wordDocument, int noteIndex,
             Element block, Range endnoteTextRange )
     {
         final String textIndex = String.valueOf( internalLinkCounter
@@ -297,14 +297,14 @@ public class WordToFoConverter extends AbstractWordConverter
         setId( backwardLink, forwardLinkName );
         endnote.appendChild( backwardLink );
 
-        processCharacters( doc, Integer.MIN_VALUE, endnoteTextRange, endnote );
+        processCharacters( wordDocument, Integer.MIN_VALUE, endnoteTextRange, endnote );
 
         WordToFoUtils.compactInlines( endnote );
         this.endnotes.add( endnote );
     }
 
     @Override
-    protected void processFootnoteAutonumbered( HWPFDocument doc,
+    protected void processFootnoteAutonumbered( HWPFDocument wordDocument,
             int noteIndex, Element block, Range footnoteTextRange )
     {
         final String textIndex = String.valueOf( internalLinkCounter
@@ -333,7 +333,7 @@ public class WordToFoConverter extends AbstractWordConverter
         footnoteBody.appendChild( footnoteBlock );
         footNote.appendChild( footnoteBody );
 
-        processCharacters( doc, Integer.MIN_VALUE, footnoteTextRange,
+        processCharacters( wordDocument, Integer.MIN_VALUE, footnoteTextRange,
                 footnoteBlock );
 
         WordToFoUtils.compactInlines( footnoteBlock );
index a0af1955909fd77b38a0592ba339c1b89cb4b049..040d32a8793331b829819a7b10ac0018411a5eb4 100644 (file)
@@ -282,17 +282,17 @@ public class WordToHtmlConverter extends AbstractWordConverter
     }
 
     @Override
-    protected void processEndnoteAutonumbered( HWPFDocument doc, int noteIndex,
+    protected void processEndnoteAutonumbered( HWPFDocument wordDocument, int noteIndex,
             Element block, Range endnoteTextRange )
     {
-        processNoteAutonumbered( doc, "end", noteIndex, block, endnoteTextRange );
+        processNoteAutonumbered( wordDocument, "end", noteIndex, block, endnoteTextRange );
     }
 
     @Override
-    protected void processFootnoteAutonumbered( HWPFDocument doc,
+    protected void processFootnoteAutonumbered( HWPFDocument wordDocument,
             int noteIndex, Element block, Range footnoteTextRange )
     {
-        processNoteAutonumbered( doc, "foot", noteIndex, block,
+        processNoteAutonumbered( wordDocument, "foot", noteIndex, block,
                 footnoteTextRange );
     }
 
@@ -508,11 +508,11 @@ public class WordToHtmlConverter extends AbstractWordConverter
     }
 
     protected void processParagraph( HWPFDocumentCore hwpfDocument,
-            Element parentFopElement, int currentTableLevel,
-            Paragraph paragraph, String bulletText )
+            Element parentElement, int currentTableLevel, Paragraph paragraph,
+            String bulletText )
     {
         final Element pElement = htmlDocumentFacade.createParagraph();
-        parentFopElement.appendChild( pElement );
+        parentElement.appendChild( pElement );
 
         StringBuilder style = new StringBuilder();
         WordToHtmlUtils.addParagraphProperties( paragraph, style );
diff --git a/src/scratchpad/src/org/apache/poi/hwpf/converter/WordToTextConverter.java b/src/scratchpad/src/org/apache/poi/hwpf/converter/WordToTextConverter.java
new file mode 100644 (file)
index 0000000..edea5da
--- /dev/null
@@ -0,0 +1,288 @@
+package org.apache.poi.hwpf.converter;
+
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.FileWriter;
+import java.io.OutputStreamWriter;
+import java.util.List;
+import java.util.concurrent.atomic.AtomicInteger;
+
+import javax.xml.parsers.DocumentBuilderFactory;
+import javax.xml.transform.OutputKeys;
+import javax.xml.transform.Transformer;
+import javax.xml.transform.TransformerFactory;
+import javax.xml.transform.dom.DOMSource;
+import javax.xml.transform.stream.StreamResult;
+
+import org.apache.poi.hpsf.SummaryInformation;
+import org.apache.poi.hwpf.HWPFDocument;
+import org.apache.poi.hwpf.HWPFDocumentCore;
+import org.apache.poi.hwpf.usermodel.Bookmark;
+import org.apache.poi.hwpf.usermodel.CharacterRun;
+import org.apache.poi.hwpf.usermodel.OfficeDrawing;
+import org.apache.poi.hwpf.usermodel.Paragraph;
+import org.apache.poi.hwpf.usermodel.Picture;
+import org.apache.poi.hwpf.usermodel.Range;
+import org.apache.poi.hwpf.usermodel.Section;
+import org.apache.poi.hwpf.usermodel.Table;
+import org.apache.poi.hwpf.usermodel.TableCell;
+import org.apache.poi.hwpf.usermodel.TableRow;
+import org.apache.poi.util.Beta;
+import org.w3c.dom.Document;
+import org.w3c.dom.Element;
+
+/**
+ * Converts a Word document into a DOM tree that yields plain text when it
+ * is serialized with the <tt>text</tt> output method. Pictures and drawn
+ * objects are ignored; footnotes and endnotes are collected and appended
+ * after the document body.
+ */
+@Beta
+public class WordToTextConverter extends AbstractWordConverter
+{
+
+    /**
+     * Java main() interface to interact with {@link WordToTextConverter}
+     * 
+     * <p>
+     * Usage: WordToTextConverter infile outfile
+     * </p>
+     * Where infile is an input .doc file (Word 95-2007) which will be
+     * rendered as plain text into outfile. The output file is written in
+     * UTF-8.
+     */
+    public static void main( String[] args )
+    {
+        if ( args.length < 2 )
+        {
+            System.err
+                    .println( "Usage: WordToTextConverter <inputFile.doc> <saveTo.txt>" );
+            return;
+        }
+
+        System.out.println( "Converting " + args[0] );
+        System.out.println( "Saving output to " + args[1] );
+        try
+        {
+            Document doc = WordToTextConverter.process( new File( args[0] ) );
+
+            // TODO set encoding from a command argument
+            final String encoding = "UTF-8";
+
+            TransformerFactory tf = TransformerFactory.newInstance();
+            Transformer serializer = tf.newTransformer();
+            serializer.setOutputProperty( OutputKeys.ENCODING, encoding );
+            serializer.setOutputProperty( OutputKeys.INDENT, "no" );
+            serializer.setOutputProperty( OutputKeys.METHOD, "text" );
+
+            // The result wraps a Writer, so the bytes on disk are produced
+            // by the writer and not by the serializer. The writer therefore
+            // has to use the same encoding instead of the platform default.
+            OutputStreamWriter out = new OutputStreamWriter(
+                    new FileOutputStream( args[1] ), encoding );
+            try
+            {
+                serializer.transform( new DOMSource( doc ),
+                        new StreamResult( out ) );
+            }
+            finally
+            {
+                // close the file even if the transformation fails
+                out.close();
+            }
+        }
+        catch ( Exception e )
+        {
+            e.printStackTrace();
+        }
+    }
+
+    /**
+     * Loads the specified Word file and converts it using a freshly created
+     * DOM document.
+     * 
+     * @param docFile
+     *            Word file to convert
+     * @return DOM document holding the conversion result
+     */
+    static Document process( File docFile ) throws Exception
+    {
+        final HWPFDocumentCore wordDocument = AbstractWordUtils
+                .loadDoc( docFile );
+        WordToTextConverter wordToTextConverter = new WordToTextConverter(
+                DocumentBuilderFactory.newInstance().newDocumentBuilder()
+                        .newDocument() );
+        wordToTextConverter.processDocument( wordDocument );
+        return wordToTextConverter.getDocument();
+    }
+
+    /** Next note number; shared by footnotes and endnotes, starts at 1 */
+    private final AtomicInteger noteCounters = new AtomicInteger( 1 );
+
+    /** Block collecting note texts; created when the first note is found */
+    private Element notes = null;
+
+    private final TextDocumentFacade textDocumentFacade;
+
+    /**
+     * Creates new instance of {@link WordToTextConverter}. Can be used to
+     * output several {@link HWPFDocument}s into a single text document.
+     * 
+     * @param document
+     *            XML DOM Document used as storage for text pieces
+     */
+    public WordToTextConverter( Document document )
+    {
+        this.textDocumentFacade = new TextDocumentFacade( document );
+    }
+
+    /** @return DOM document that receives the converted content */
+    public Document getDocument()
+    {
+        return textDocumentFacade.getDocument();
+    }
+
+    /** Appends the text to the block as is; run formatting is not used. */
+    @Override
+    protected void outputCharacters( Element block, CharacterRun characterRun,
+            String text )
+    {
+        block.appendChild( textDocumentFacade.createText( text ) );
+    }
+
+    /** Bookmarks leave no trace in the output; only their text is processed. */
+    @Override
+    protected void processBookmarks( HWPFDocumentCore wordDocument,
+            Element currentBlock, Range range, int currentTableLevel,
+            List<Bookmark> rangeBookmarks )
+    {
+        processCharacters( wordDocument, currentTableLevel, range, currentBlock );
+    }
+
+    /**
+     * Converts the document and then appends the collected notes (if any)
+     * to the end of the body.
+     */
+    @Override
+    public void processDocument( HWPFDocumentCore wordDocument )
+    {
+        super.processDocument( wordDocument );
+
+        if ( notes != null )
+            textDocumentFacade.getBody().appendChild( notes );
+    }
+
+    /**
+     * Copies title, author, comments (as description) and keywords into the
+     * head of the output; empty values are skipped.
+     */
+    @Override
+    protected void processDocumentInformation(
+            SummaryInformation summaryInformation )
+    {
+        if ( AbstractWordUtils.isNotEmpty( summaryInformation.getTitle() ) )
+            textDocumentFacade.setTitle( summaryInformation.getTitle() );
+
+        if ( AbstractWordUtils.isNotEmpty( summaryInformation.getAuthor() ) )
+            textDocumentFacade.addAuthor( summaryInformation.getAuthor() );
+
+        if ( AbstractWordUtils.isNotEmpty( summaryInformation.getComments() ) )
+            textDocumentFacade
+                    .addDescription( summaryInformation.getComments() );
+
+        if ( AbstractWordUtils.isNotEmpty( summaryInformation.getKeywords() ) )
+            textDocumentFacade.addKeywords( summaryInformation.getKeywords() );
+    }
+
+    @Override
+    protected void processDrawnObject( HWPFDocument doc,
+            CharacterRun characterRun, OfficeDrawing officeDrawing,
+            String path, Element block )
+    {
+        // ignore -- drawings have no text representation
+    }
+
+    /**
+     * Handles an endnote like any other note; the supplied noteIndex is not
+     * used because notes are renumbered by {@link #processNote}.
+     */
+    @Override
+    protected void processEndnoteAutonumbered( HWPFDocument wordDocument,
+            int noteIndex, Element block, Range endnoteTextRange )
+    {
+        processNote( wordDocument, block, endnoteTextRange );
+    }
+
+    /**
+     * Handles a footnote like any other note; the supplied noteIndex is not
+     * used because notes are renumbered by {@link #processNote}.
+     */
+    @Override
+    protected void processFootnoteAutonumbered( HWPFDocument wordDocument,
+            int noteIndex, Element block, Range footnoteTextRange )
+    {
+        processNote( wordDocument, block, footnoteTextRange );
+    }
+
+    /**
+     * Outputs the link text followed by the target in parentheses. Zero
+     * width spaces are placed around the target and around each slash in it.
+     */
+    @Override
+    protected void processHyperlink( HWPFDocumentCore wordDocument,
+            Element currentBlock, Range textRange, int currentTableLevel,
+            String hyperlink )
+    {
+        processCharacters( wordDocument, currentTableLevel, textRange,
+                currentBlock );
+
+        currentBlock.appendChild( textDocumentFacade.createText( " ("
+                + UNICODECHAR_ZERO_WIDTH_SPACE
+                + hyperlink.replaceAll( "\\/", UNICODECHAR_ZERO_WIDTH_SPACE
+                        + "\\/" + UNICODECHAR_ZERO_WIDTH_SPACE )
+                + UNICODECHAR_ZERO_WIDTH_SPACE + ")" ) );
+    }
+
+    @Override
+    protected void processImage( Element currentBlock, boolean inlined,
+            Picture picture )
+    {
+        // ignore -- pictures have no text representation
+    }
+
+    /** Outputs a line break as a single line feed character. */
+    @Override
+    protected void processLineBreak( Element block, CharacterRun characterRun )
+    {
+        block.appendChild( textDocumentFacade.createText( "\n" ) );
+    }
+
+    /**
+     * Outputs a note reference of the form <tt>[N]</tt> into the current
+     * block and stores the note text, prefixed with <tt>^N</tt>, in the
+     * notes block.
+     * 
+     * @param wordDocument
+     *            source document
+     * @param block
+     *            block that receives the reference mark
+     * @param noteTextRange
+     *            range holding the text of the note
+     */
+    protected void processNote( HWPFDocument wordDocument, Element block,
+            Range noteTextRange )
+    {
+        final int noteIndex = noteCounters.getAndIncrement();
+        block.appendChild( textDocumentFacade
+                .createText( UNICODECHAR_ZERO_WIDTH_SPACE + "[" + noteIndex
+                        + "]" + UNICODECHAR_ZERO_WIDTH_SPACE ) );
+
+        if ( notes == null )
+            notes = textDocumentFacade.createBlock();
+
+        Element note = textDocumentFacade.createBlock();
+        notes.appendChild( note );
+
+        note.appendChild( textDocumentFacade.createText( "^" + noteIndex
+                + "\t " ) );
+        processCharacters( wordDocument, Integer.MIN_VALUE, noteTextRange, note );
+        note.appendChild( textDocumentFacade.createText( "\n" ) );
+    }
+
+    /** Page references are output as their plain text. */
+    @Override
+    protected void processPageref( HWPFDocumentCore wordDocument,
+            Element currentBlock, Range textRange, int currentTableLevel,
+            String pageref )
+    {
+        processCharacters( wordDocument, currentTableLevel, textRange,
+                currentBlock );
+    }
+
+    /**
+     * Outputs a paragraph as the bullet text followed by the paragraph
+     * characters and a line feed.
+     */
+    @Override
+    protected void processParagraph( HWPFDocumentCore wordDocument,
+            Element parentElement, int currentTableLevel, Paragraph paragraph,
+            String bulletText )
+    {
+        Element pElement = textDocumentFacade.createParagraph();
+        pElement.appendChild( textDocumentFacade.createText( bulletText ) );
+        processCharacters( wordDocument, currentTableLevel, paragraph, pElement );
+        pElement.appendChild( textDocumentFacade.createText( "\n" ) );
+        parentElement.appendChild( pElement );
+    }
+
+    /**
+     * Outputs a section as a block appended to the body; the section
+     * content is followed by a line feed.
+     */
+    @Override
+    protected void processSection( HWPFDocumentCore wordDocument,
+            Section section, int s )
+    {
+        Element sectionElement = textDocumentFacade.createBlock();
+        processParagraphes( wordDocument, sectionElement, section,
+                Integer.MIN_VALUE );
+        sectionElement.appendChild( textDocumentFacade.createText( "\n" ) );
+        textDocumentFacade.getBody().appendChild( sectionElement );
+    }
+
+    /**
+     * Outputs a table row by row. Cells after the first one in a row start
+     * with a tab character and every row ends with a line feed.
+     */
+    protected void processTable( HWPFDocumentCore hwpfDocument, Element flow,
+            Table table )
+    {
+        final int tableRows = table.numRows();
+        for ( int r = 0; r < tableRows; r++ )
+        {
+            TableRow tableRow = table.getRow( r );
+
+            Element tableRowElement = textDocumentFacade.createTableRow();
+
+            final int rowCells = tableRow.numCells();
+            for ( int c = 0; c < rowCells; c++ )
+            {
+                TableCell tableCell = tableRow.getCell( c );
+
+                Element tableCellElement = textDocumentFacade.createTableCell();
+
+                if ( c != 0 )
+                    tableCellElement.appendChild( textDocumentFacade
+                            .createText( "\t" ) );
+
+                processParagraphes( hwpfDocument, tableCellElement, tableCell,
+                        table.getTableLevel() );
+                tableRowElement.appendChild( tableCellElement );
+            }
+
+            tableRowElement.appendChild( textDocumentFacade.createText( "\n" ) );
+            flow.appendChild( tableRowElement );
+        }
+    }
+
+}
index 0de7ab3ef71272f375b9b99f520c7cc9f11e1526..52473c824a00128e2de5bea06e98133317aef5f4 100644 (file)
@@ -45,7 +45,8 @@ public class TestWordToConverterSuite
 
     public static Test suite()
     {
-        TestSuite suite = new TestSuite(TestWordToConverterSuite.class.getName());
+        TestSuite suite = new TestSuite(
+                TestWordToConverterSuite.class.getName() );
 
         File directory = POIDataSamples.getDocumentInstance().getFile(
                 "../document" );
@@ -63,14 +64,21 @@ public class TestWordToConverterSuite
             {
                 public void runTest() throws Exception
                 {
-                    test( child, false );
+                    testFo( child );
                 }
             } );
             suite.addTest( new TestCase( name + " [HTML]" )
             {
                 public void runTest() throws Exception
                 {
-                    test( child, true );
+                    testHtml( child );
+                }
+            } );
+            suite.addTest( new TestCase( name + " [TEXT]" )
+            {
+                public void runTest() throws Exception
+                {
+                    testText( child );
                 }
             } );
 
@@ -79,7 +87,7 @@ public class TestWordToConverterSuite
         return suite;
     }
 
-    protected static void test( File child, boolean html ) throws Exception
+    protected static void testFo( File child ) throws Exception
     {
         HWPFDocumentCore hwpfDocument;
         try
@@ -88,7 +96,6 @@ public class TestWordToConverterSuite
         }
         catch ( Exception exc )
         {
-            // unable to parse file -- not WordToFoConverter fault
             return;
         }
 
@@ -102,14 +109,74 @@ public class TestWordToConverterSuite
         Transformer transformer = TransformerFactory.newInstance()
                 .newTransformer();
         transformer.setOutputProperty( OutputKeys.ENCODING, "utf-8" );
-        transformer.setOutputProperty( OutputKeys.INDENT, "yes" );
+        transformer.setOutputProperty( OutputKeys.INDENT, "false" );
         transformer.transform(
                 new DOMSource( wordToFoConverter.getDocument() ),
                 new StreamResult( stringWriter ) );
 
-        if ( html )
-            transformer.setOutputProperty( OutputKeys.METHOD, "html" );
+        // no exceptions
+    }
+
+    /**
+     * Converts the specified Word file to HTML and serializes the result
+     * into memory. The test passes if no exception is thrown; files that
+     * can not be loaded are skipped.
+     */
+    protected static void testHtml( File child ) throws Exception
+    {
+        HWPFDocumentCore hwpfDocument;
+        try
+        {
+            hwpfDocument = AbstractWordUtils.loadDoc( child );
+        }
+        catch ( Exception exc )
+        {
+            // unable to load file -- not a converter fault
+            return;
+        }
+
+        WordToHtmlConverter wordToHtmlConverter = new WordToHtmlConverter(
+                DocumentBuilderFactory.newInstance().newDocumentBuilder()
+                        .newDocument() );
+        wordToHtmlConverter.processDocument( hwpfDocument );
+
+        StringWriter stringWriter = new StringWriter();
+
+        Transformer transformer = TransformerFactory.newInstance()
+                .newTransformer();
+        transformer.setOutputProperty( OutputKeys.ENCODING, "utf-8" );
+        // the only legal values of the indent property are "yes" and "no"
+        transformer.setOutputProperty( OutputKeys.INDENT, "no" );
+        transformer.setOutputProperty( OutputKeys.METHOD, "html" );
+        transformer.transform(
+                new DOMSource( wordToHtmlConverter.getDocument() ),
+                new StreamResult( stringWriter ) );
+
+        // no exceptions
+    }
+
+    /**
+     * Converts the specified Word file with {@link WordToTextConverter} and
+     * serializes the result into memory using the "text" output method. The
+     * test passes if no exception is thrown.
+     */
+    protected static void testText( File child ) throws Exception
+    {
+        HWPFDocumentCore wordDocument;
+        try
+        {
+            wordDocument = AbstractWordUtils.loadDoc( child );
+        }
+        catch ( Exception exc )
+        {
+            // file could not be loaded -- skip it, nothing to convert
+            return;
+        }
+
+        WordToTextConverter wordToTextConverter = new WordToTextConverter(
+                DocumentBuilderFactory.newInstance().newDocumentBuilder()
+                        .newDocument() );
+        wordToTextConverter.processDocument( wordDocument );
+
+        StringWriter stringWriter = new StringWriter();
+
+        Transformer transformer = TransformerFactory.newInstance()
+                .newTransformer();
+        transformer.setOutputProperty( OutputKeys.ENCODING, "utf-8" );
+        transformer.setOutputProperty( OutputKeys.INDENT, "yes" );
+        transformer.setOutputProperty( OutputKeys.METHOD, "text" );
+        transformer.transform(
+                new DOMSource( wordToTextConverter.getDocument() ),
+                new StreamResult( stringWriter ) );
 
+        // NOTE(review): the returned string is discarded -- the call has no
+        // effect on the test outcome
+        stringWriter.toString();
         // no exceptions
     }
 }