source.dussan.org Git - poi.git/commitdiff
add Word-to-HTML extractor
author Sergey Vladimirov <sergey@apache.org>
Mon, 4 Jul 2011 19:08:06 +0000 (19:08 +0000)
committer Sergey Vladimirov <sergey@apache.org>
Mon, 4 Jul 2011 19:08:06 +0000 (19:08 +0000)
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1142765 13f79535-47bb-0310-9956-ffa450edef68

12 files changed:
src/scratchpad/src/org/apache/poi/hwpf/extractor/AbstractToFoExtractor.java [deleted file]
src/scratchpad/src/org/apache/poi/hwpf/extractor/AbstractWordExtractor.java [new file with mode: 0644]
src/scratchpad/src/org/apache/poi/hwpf/extractor/AbstractWordUtils.java [new file with mode: 0644]
src/scratchpad/src/org/apache/poi/hwpf/extractor/FoDocumentFacade.java [new file with mode: 0644]
src/scratchpad/src/org/apache/poi/hwpf/extractor/HtmlDocumentFacade.java [new file with mode: 0644]
src/scratchpad/src/org/apache/poi/hwpf/extractor/WordToFoExtractor.java
src/scratchpad/src/org/apache/poi/hwpf/extractor/WordToFoUtils.java
src/scratchpad/src/org/apache/poi/hwpf/extractor/WordToHtmlExtractor.java [new file with mode: 0644]
src/scratchpad/src/org/apache/poi/hwpf/extractor/WordToHtmlUtils.java [new file with mode: 0644]
src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordToExtractorSuite.java [new file with mode: 0644]
src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordToFoExtractorSuite.java [deleted file]
src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordToHtmlExtractor.java [new file with mode: 0644]

diff --git a/src/scratchpad/src/org/apache/poi/hwpf/extractor/AbstractToFoExtractor.java b/src/scratchpad/src/org/apache/poi/hwpf/extractor/AbstractToFoExtractor.java
deleted file mode 100644 (file)
index 19608d8..0000000
+++ /dev/null
@@ -1,204 +0,0 @@
-/*
- *  ====================================================================
- *    Licensed to the Apache Software Foundation (ASF) under one or more
- *    contributor license agreements.  See the NOTICE file distributed with
- *    this work for additional information regarding copyright ownership.
- *    The ASF licenses this file to You under the Apache License, Version 2.0
- *    (the "License"); you may not use this file except in compliance with
- *    the License.  You may obtain a copy of the License at
- *
- *        http://www.apache.org/licenses/LICENSE-2.0
- *
- *    Unless required by applicable law or agreed to in writing, software
- *    distributed under the License is distributed on an "AS IS" BASIS,
- *    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *    See the License for the specific language governing permissions and
- *    limitations under the License.
- * ====================================================================
- */
-package org.apache.poi.hwpf.extractor;
-
-import org.w3c.dom.Document;
-import org.w3c.dom.Element;
-import org.w3c.dom.Text;
-
-public abstract class AbstractToFoExtractor
-{
-
-    private static final String NS_XSLFO = "http://www.w3.org/1999/XSL/Format";
-
-    protected final Document document;
-    protected final Element layoutMasterSet;
-    protected final Element root;
-
-    public AbstractToFoExtractor( Document document )
-    {
-        this.document = document;
-
-        root = document.createElementNS( NS_XSLFO, "fo:root" );
-        document.appendChild( root );
-
-        layoutMasterSet = document.createElementNS( NS_XSLFO,
-                "fo:layout-master-set" );
-        root.appendChild( layoutMasterSet );
-    }
-
-    protected Element addFlowToPageSequence( final Element pageSequence,
-            String flowName )
-    {
-        final Element flow = document.createElementNS( NS_XSLFO, "fo:flow" );
-        flow.setAttribute( "flow-name", flowName );
-        pageSequence.appendChild( flow );
-
-        return flow;
-    }
-
-    protected Element addListItem( Element listBlock )
-    {
-        Element result = createListItem();
-        listBlock.appendChild( result );
-        return result;
-    }
-
-    protected Element addListItemBody( Element listItem )
-    {
-        Element result = createListItemBody();
-        listItem.appendChild( result );
-        return result;
-    }
-
-    protected Element addListItemLabel( Element listItem, String text )
-    {
-        Element result = createListItemLabel( text );
-        listItem.appendChild( result );
-        return result;
-    }
-
-    protected Element addPageSequence( String pageMaster )
-    {
-        final Element pageSequence = document.createElementNS( NS_XSLFO,
-                "fo:page-sequence" );
-        pageSequence.setAttribute( "master-reference", pageMaster );
-        root.appendChild( pageSequence );
-        return pageSequence;
-    }
-
-    protected Element addRegionBody( Element pageMaster )
-    {
-        final Element regionBody = document.createElementNS( NS_XSLFO,
-                "fo:region-body" );
-        pageMaster.appendChild( regionBody );
-
-        return regionBody;
-    }
-
-    protected Element addSimplePageMaster( String masterName )
-    {
-        final Element simplePageMaster = document.createElementNS( NS_XSLFO,
-                "fo:simple-page-master" );
-        simplePageMaster.setAttribute( "master-name", masterName );
-        layoutMasterSet.appendChild( simplePageMaster );
-
-        return simplePageMaster;
-    }
-
-    protected Element createBasicLinkExternal( String externalDestination )
-    {
-        final Element basicLink = document.createElementNS( NS_XSLFO,
-                "fo:basic-link" );
-        basicLink.setAttribute( "external-destination", externalDestination );
-        return basicLink;
-    }
-
-    protected Element createBasicLinkInternal( String internalDestination )
-    {
-        final Element basicLink = document.createElementNS( NS_XSLFO,
-                "fo:basic-link" );
-        basicLink.setAttribute( "internal-destination", internalDestination );
-        return basicLink;
-    }
-
-    protected Element createBlock()
-    {
-        return document.createElementNS( NS_XSLFO, "fo:block" );
-    }
-
-    protected Element createExternalGraphic( String source )
-    {
-        Element result = document.createElementNS( NS_XSLFO,
-                "fo:external-graphic" );
-        result.setAttribute( "src", "url('" + source + "')" );
-        return result;
-    }
-
-    protected Element createInline()
-    {
-        return document.createElementNS( NS_XSLFO, "fo:inline" );
-    }
-
-    protected Element createLeader()
-    {
-        return document.createElementNS( NS_XSLFO, "fo:leader" );
-    }
-
-    protected Element createListBlock()
-    {
-        return document.createElementNS( NS_XSLFO, "fo:list-block" );
-    }
-
-    protected Element createListItem()
-    {
-        return document.createElementNS( NS_XSLFO, "fo:list-item" );
-    }
-
-    protected Element createListItemBody()
-    {
-        return document.createElementNS( NS_XSLFO, "fo:list-item-body" );
-    }
-
-    protected Element createListItemLabel( String text )
-    {
-        Element result = document.createElementNS( NS_XSLFO,
-                "fo:list-item-label" );
-        Element block = createBlock();
-        block.appendChild( document.createTextNode( text ) );
-        result.appendChild( block );
-        return result;
-    }
-
-    protected Element createTable()
-    {
-        return document.createElementNS( NS_XSLFO, "fo:table" );
-    }
-
-    protected Element createTableBody()
-    {
-        return document.createElementNS( NS_XSLFO, "fo:table-body" );
-    }
-
-    protected Element createTableCell()
-    {
-        return document.createElementNS( NS_XSLFO, "fo:table-cell" );
-    }
-
-    protected Element createTableHeader()
-    {
-        return document.createElementNS( NS_XSLFO, "fo:table-header" );
-    }
-
-    protected Element createTableRow()
-    {
-        return document.createElementNS( NS_XSLFO, "fo:table-row" );
-    }
-
-    protected Text createText( String data )
-    {
-        return document.createTextNode( data );
-    }
-
-    public Document getDocument()
-    {
-        return document;
-    }
-
-}
diff --git a/src/scratchpad/src/org/apache/poi/hwpf/extractor/AbstractWordExtractor.java b/src/scratchpad/src/org/apache/poi/hwpf/extractor/AbstractWordExtractor.java
new file mode 100644 (file)
index 0000000..f13d9a1
--- /dev/null
@@ -0,0 +1,365 @@
+/* ====================================================================
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+==================================================================== */
+package org.apache.poi.hwpf.extractor;
+
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.poi.hwpf.HWPFDocument;
+import org.apache.poi.hwpf.HWPFDocumentCore;
+import org.apache.poi.hwpf.model.ListFormatOverride;
+import org.apache.poi.hwpf.model.ListTables;
+import org.apache.poi.hwpf.usermodel.CharacterRun;
+import org.apache.poi.hwpf.usermodel.Paragraph;
+import org.apache.poi.hwpf.usermodel.Picture;
+import org.apache.poi.hwpf.usermodel.Range;
+import org.apache.poi.hwpf.usermodel.Section;
+import org.apache.poi.hwpf.usermodel.Table;
+import org.apache.poi.hwpf.usermodel.TableIterator;
+import org.apache.poi.util.POILogFactory;
+import org.apache.poi.util.POILogger;
+import org.w3c.dom.Document;
+import org.w3c.dom.Element;
+
+public abstract class AbstractWordExtractor
+{
+    // Control character compared against the last character of a run's text;
+    // processCharacters() strips it when currentTableLevel != 0.
+    private static final byte BEL_MARK = 7;
+
+    // Control character that, as the first character of a run, makes
+    // processCharacters() try to parse a field via tryField().
+    private static final byte FIELD_BEGIN_MARK = 19;
+
+    // Control character that terminates the field scan in tryField().
+    private static final byte FIELD_END_MARK = 21;
+
+    // Control character that tryField() records as the separator position
+    // of a field; processField() outputs the runs that follow it.
+    private static final byte FIELD_SEPARATOR_MARK = 20;
+
+    private static final POILogger logger = POILogFactory
+            .getLogger( AbstractWordExtractor.class );
+
+    /**
+     * @return the DOM document of this extractor (presumably the one the
+     *         output is written into -- confirm against implementations)
+     */
+    public abstract Document getDocument();
+
+    /**
+     * Called by {@link #processCharacters} for every character run that was
+     * not skipped or handled as a picture or field.
+     * 
+     * @param block
+     *            element passed to {@link #processCharacters} as the current
+     *            block
+     * @param characterRun
+     *            the run being output
+     * @param text
+     *            text of the run, with a trailing <tt>\r</tt> (or a trailing
+     *            BEL mark inside a table) already removed
+     */
+    protected abstract void outputCharacters( Element block,
+            CharacterRun characterRun, String text );
+
+    /**
+     * Processes the character runs with list indexes
+     * <tt>[start, end)</tt>: pictures are passed to {@link #processImage},
+     * fields are delegated to {@link #tryField}, special/OLE runs are
+     * skipped and every remaining run is passed to
+     * {@link #outputCharacters}.
+     * 
+     * @param hwpfDocument
+     *            document being converted
+     * @param currentTableLevel
+     *            current table nesting level; when it is not 0 a trailing
+     *            BEL mark is stripped from the run text
+     * @param paragraph
+     *            paragraph passed on to {@link #tryField}
+     * @param block
+     *            element receiving the output
+     * @param characterRuns
+     *            runs to process; must not contain <tt>null</tt> in the
+     *            requested interval
+     * @param start
+     *            index of the first run, inclusive
+     * @param end
+     *            index of the last run, exclusive
+     * @return <tt>true</tt> if at least one run with non-blank text was
+     *         output
+     */
+    protected boolean processCharacters( HWPFDocumentCore hwpfDocument,
+            int currentTableLevel, Paragraph paragraph, final Element block,
+            List<CharacterRun> characterRuns, final int start, final int end )
+    {
+        boolean haveAnyText = false;
+
+        for ( int c = start; c < end; c++ )
+        {
+            CharacterRun characterRun = characterRuns.get( c );
+
+            if ( characterRun == null )
+                throw new AssertionError();
+
+            if ( hwpfDocument instanceof HWPFDocument
+                    && ( (HWPFDocument) hwpfDocument ).getPicturesTable()
+                            .hasPicture( characterRun ) )
+            {
+                HWPFDocument newFormat = (HWPFDocument) hwpfDocument;
+                Picture picture = newFormat.getPicturesTable().extractPicture(
+                        characterRun, true );
+
+                processImage( block, characterRun.text().charAt( 0 ) == 0x01,
+                        picture );
+                continue;
+            }
+
+            String text = characterRun.text();
+            if ( text.length() == 0 )
+                continue;
+
+            // Compare the character itself instead of text.getBytes()[0]:
+            // getBytes() depends on the platform default charset and
+            // allocates a new array on every call.
+            final char firstChar = text.charAt( 0 );
+
+            if ( firstChar == FIELD_BEGIN_MARK )
+            {
+                // tryField() returns c itself when no complete field was
+                // found, so the assignment is a no-op in that case
+                c = tryField( hwpfDocument, paragraph, currentTableLevel,
+                        characterRuns, c, block );
+                continue;
+            }
+            if ( firstChar == FIELD_SEPARATOR_MARK
+                    || firstChar == FIELD_END_MARK )
+            {
+                // shall not appear without FIELD_BEGIN_MARK
+                continue;
+            }
+
+            if ( characterRun.isSpecialCharacter() || characterRun.isObj()
+                    || characterRun.isOle2() )
+            {
+                continue;
+            }
+
+            if ( text.endsWith( "\r" )
+                    || ( text.charAt( text.length() - 1 ) == BEL_MARK && currentTableLevel != 0 ) )
+                text = text.substring( 0, text.length() - 1 );
+
+            outputCharacters( block, characterRun, text );
+
+            haveAnyText |= text.trim().length() != 0;
+        }
+
+        return haveAnyText;
+    }
+
+    /**
+     * Passes every section of the document range to
+     * {@link #processSection}, together with its zero-based index.
+     * 
+     * @param wordDocument
+     *            document to process
+     */
+    public void processDocument( HWPFDocumentCore wordDocument )
+    {
+        final Range documentRange = wordDocument.getRange();
+
+        int sectionIndex = 0;
+        while ( sectionIndex < documentRange.numSections() )
+        {
+            final Section section = documentRange.getSection( sectionIndex );
+            processSection( wordDocument, section, sectionIndex );
+            sectionIndex++;
+        }
+    }
+
+    /**
+     * Handles one field delimited by the given mark positions. The first
+     * non-null run between the begin and separator marks is matched against
+     * the HYPERLINK and PAGEREF patterns and dispatched to
+     * {@link #processHyperlink} or {@link #processPageref}. Any other field
+     * is logged as unsupported and only the runs between the separator and
+     * end marks are output.
+     * 
+     * @param beginMark
+     *            index of the run holding the field begin mark
+     * @param separatorMark
+     *            index of the run holding the field separator mark
+     * @param endMark
+     *            index of the run holding the field end mark
+     */
+    protected void processField( HWPFDocumentCore wordDocument,
+            Element currentBlock, Paragraph paragraph, int currentTableLevel,
+            List<CharacterRun> characterRuns, int beginMark, int separatorMark,
+            int endMark )
+    {
+        final Pattern hyperlinkPattern = Pattern
+                .compile( "[ \\t\\r\\n]*HYPERLINK \"(.*)\"[ \\t\\r\\n]*" );
+        final Pattern pagerefPattern = Pattern
+                .compile( "[ \\t\\r\\n]*PAGEREF ([^ ]*)[ \\t\\r\\n]*\\\\h[ \\t\\r\\n]*" );
+
+        final int valueStart = separatorMark + 1;
+
+        // locate the first non-null run between begin and separator marks
+        CharacterRun instructionRun = null;
+        if ( separatorMark - beginMark > 1 )
+        {
+            for ( int index = beginMark + 1; index < separatorMark; index++ )
+            {
+                instructionRun = paragraph.getCharacterRun( index );
+                if ( instructionRun != null )
+                    break;
+
+                logger.log( POILogger.WARN,
+                        "Paragraph " + paragraph.getStartOffset() + "--"
+                                + paragraph.getEndOffset()
+                                + " contains null CharacterRun #" + index );
+            }
+        }
+
+        if ( instructionRun != null )
+        {
+            final Matcher hyperlinkMatcher = hyperlinkPattern
+                    .matcher( instructionRun.text() );
+            if ( hyperlinkMatcher.matches() )
+            {
+                processHyperlink( wordDocument, currentBlock, paragraph,
+                        characterRuns, currentTableLevel,
+                        hyperlinkMatcher.group( 1 ), valueStart, endMark );
+                return;
+            }
+
+            final Matcher pagerefMatcher = pagerefPattern
+                    .matcher( instructionRun.text() );
+            if ( pagerefMatcher.matches() )
+            {
+                processPageref( wordDocument, currentBlock, paragraph,
+                        characterRuns, currentTableLevel,
+                        pagerefMatcher.group( 1 ), valueStart, endMark );
+                return;
+            }
+        }
+
+        // unknown field: dump all of its runs into the log
+        final StringBuilder debug = new StringBuilder(
+                "Unsupported field type: \n" );
+        int run = beginMark;
+        while ( run <= endMark )
+        {
+            debug.append( "\t" );
+            debug.append( paragraph.getCharacterRun( run ) );
+            debug.append( "\n" );
+            run++;
+        }
+        logger.log( POILogger.WARN, debug );
+
+        // just output field value
+        if ( valueStart < endMark )
+        {
+            processCharacters( wordDocument, currentTableLevel, paragraph,
+                    currentBlock, characterRuns, valueStart, endMark );
+        }
+    }
+
+    /**
+     * Called by {@link #processField} for a field whose instruction matches
+     * the HYPERLINK pattern.
+     * 
+     * @param hyperlink
+     *            the text captured between the quotes of the HYPERLINK
+     *            instruction
+     * @param i
+     *            index of the run following the field separator mark
+     * @param endMark
+     *            index of the run holding the field end mark
+     */
+    protected abstract void processHyperlink( HWPFDocumentCore wordDocument,
+            Element currentBlock, Paragraph paragraph,
+            List<CharacterRun> characterRuns, int currentTableLevel,
+            String hyperlink, int i, int endMark );
+
+    /**
+     * Called by {@link #processCharacters} for a character run that the
+     * pictures table reports as a picture.
+     * 
+     * @param inlined
+     *            <tt>true</tt> if the first character of the run text is
+     *            <tt>0x01</tt>
+     * @param picture
+     *            picture extracted from the pictures table
+     */
+    protected abstract void processImage( Element currentBlock,
+            boolean inlined, Picture picture );
+
+    /**
+     * Called by {@link #processField} for a field whose instruction matches
+     * the PAGEREF pattern.
+     * 
+     * @param pageref
+     *            the token captured after <tt>PAGEREF</tt>
+     * @param beginTextInclusive
+     *            index of the run following the field separator mark
+     * @param endTextExclusive
+     *            index of the run holding the field end mark
+     */
+    protected abstract void processPageref( HWPFDocumentCore wordDocument,
+            Element currentBlock, Paragraph paragraph,
+            List<CharacterRun> characterRuns, int currentTableLevel,
+            String pageref, int beginTextInclusive, int endTextExclusive );
+
+    /**
+     * Called by {@link #processSectionParagraphes} for every paragraph that
+     * is not handled as part of a table.
+     * 
+     * @param bulletText
+     *            list label computed by
+     *            {@link AbstractWordUtils#getBulletText}, or an empty string
+     *            when the paragraph has no list info or list tables are
+     *            missing
+     */
+    protected abstract void processParagraph( HWPFDocumentCore wordDocument,
+            Element parentFopElement, int currentTableLevel,
+            Paragraph paragraph, String bulletText );
+
+    /**
+     * Called by {@link #processDocument} once per section.
+     * 
+     * @param s
+     *            zero-based index of the section within the document range
+     */
+    protected abstract void processSection( HWPFDocumentCore wordDocument,
+            Section section, int s );
+
+    /**
+     * Processes all paragraphs of the range. A paragraph whose start offset
+     * equals the start offset of a table of the next nesting level triggers
+     * {@link #processTable}. Paragraphs belonging to a table of another
+     * level are skipped. Every other paragraph goes to
+     * {@link #processParagraph}, with a bullet label when it refers to a
+     * list.
+     * 
+     * @param wordDocument
+     *            document being converted
+     * @param flow
+     *            element receiving the output
+     * @param range
+     *            range whose paragraphs are processed
+     * @param currentTableLevel
+     *            table nesting level of <tt>range</tt>
+     */
+    protected void processSectionParagraphes( HWPFDocumentCore wordDocument,
+            Element flow, Range range, int currentTableLevel )
+    {
+        final int nestedTableLevel = currentTableLevel + 1;
+
+        // index the tables of the next level by their start offset
+        final Map<Integer, Table> tablesByStart = new HashMap<Integer, Table>();
+        final TableIterator tables = AbstractWordUtils.newTableIterator(
+                range, nestedTableLevel );
+        while ( tables.hasNext() )
+        {
+            final Table table = tables.next();
+            tablesByStart.put( Integer.valueOf( table.getStartOffset() ),
+                    table );
+        }
+
+        final ListTables listTables = wordDocument.getListTables();
+
+        final int total = range.numParagraphs();
+        for ( int p = 0; p < total; p++ )
+        {
+            final Paragraph paragraph = range.getParagraph( p );
+            final Integer startOffset = Integer.valueOf( paragraph
+                    .getStartOffset() );
+
+            if ( tablesByStart.containsKey( startOffset ) )
+            {
+                processTable( wordDocument, flow,
+                        tablesByStart.get( startOffset ), nestedTableLevel );
+                continue;
+            }
+
+            final boolean inOtherTable = paragraph.isInTable()
+                    && paragraph.getTableLevel() != currentTableLevel;
+            if ( inOtherTable )
+                continue;
+
+            final int listInfo = paragraph.getIlfo();
+
+            String label = AbstractWordUtils.EMPTY;
+            if ( listInfo != 0 )
+            {
+                if ( listTables == null )
+                {
+                    logger.log( POILogger.WARN,
+                            "Paragraph #" + paragraph.getStartOffset() + "-"
+                                    + paragraph.getEndOffset()
+                                    + " has reference to list structure #"
+                                    + listInfo
+                                    + ", but listTables not defined in file" );
+                }
+                else
+                {
+                    final ListFormatOverride formatOverride = listTables
+                            .getOverride( listInfo );
+                    label = AbstractWordUtils.getBulletText( listTables,
+                            paragraph, formatOverride.getLsid() );
+                }
+            }
+
+            processParagraph( wordDocument, flow, currentTableLevel,
+                    paragraph, label );
+        }
+    }
+
+    /**
+     * Processes the given section as section number 0 by delegating to
+     * {@link #processSection}.
+     */
+    protected void processSingleSection( HWPFDocumentCore wordDocument,
+            Section section )
+    {
+        final int onlySectionIndex = 0;
+        processSection( wordDocument, section, onlySectionIndex );
+    }
+
+    /**
+     * Called by {@link #processSectionParagraphes} when a paragraph starts at
+     * the start offset of a table.
+     * 
+     * @param flow
+     *            element passed to {@link #processSectionParagraphes}
+     * @param table
+     *            the table starting at the current paragraph
+     * @param newTableLevel
+     *            nesting level of <tt>table</tt> (caller's level plus one)
+     */
+    protected abstract void processTable( HWPFDocumentCore wordDocument,
+            Element flow, Table table, int newTableLevel );
+
+    /**
+     * Tries to parse a field that starts at the run with index
+     * <tt>beginMark</tt>. The paragraph runs after it are scanned for one
+     * separator mark followed by an end mark; if both are found the field is
+     * passed to {@link #processField}.
+     * 
+     * @param beginMark
+     *            index of the run holding the field begin mark
+     * @return index of the run holding the field end mark if the field was
+     *         processed, or <tt>beginMark</tt> if no complete field was
+     *         found (missing or duplicated separator, missing end mark)
+     */
+    protected int tryField( HWPFDocumentCore wordDocument, Paragraph paragraph,
+            int currentTableLevel, List<CharacterRun> characterRuns,
+            int beginMark, Element currentBlock )
+    {
+        int separatorMark = -1;
+        int endMark = -1;
+        for ( int c = beginMark + 1; c < paragraph.numCharacterRuns(); c++ )
+        {
+            CharacterRun characterRun = paragraph.getCharacterRun( c );
+
+            // processField() treats a null run from the same call as an
+            // expected case; skip it here too instead of failing with NPE
+            if ( characterRun == null )
+            {
+                logger.log( POILogger.WARN,
+                        "Paragraph " + paragraph.getStartOffset() + "--"
+                                + paragraph.getEndOffset()
+                                + " contains null CharacterRun #" + c );
+                continue;
+            }
+
+            String text = characterRun.text();
+            if ( text.length() == 0 )
+                continue;
+
+            // compare the character itself; getBytes() would depend on the
+            // platform default charset
+            final char firstChar = text.charAt( 0 );
+
+            if ( firstChar == FIELD_SEPARATOR_MARK )
+            {
+                if ( separatorMark != -1 )
+                {
+                    // double;
+                    return beginMark;
+                }
+
+                separatorMark = c;
+                continue;
+            }
+
+            if ( firstChar == FIELD_END_MARK )
+            {
+                // the scan stops at the first end mark, so it cannot be
+                // seen twice
+                endMark = c;
+                break;
+            }
+
+        }
+
+        if ( separatorMark == -1 || endMark == -1 )
+            return beginMark;
+
+        processField( wordDocument, currentBlock, paragraph, currentTableLevel,
+                characterRuns, beginMark, separatorMark, endMark );
+
+        return endMark;
+    }
+
+}
diff --git a/src/scratchpad/src/org/apache/poi/hwpf/extractor/AbstractWordUtils.java b/src/scratchpad/src/org/apache/poi/hwpf/extractor/AbstractWordUtils.java
new file mode 100644 (file)
index 0000000..89849c1
--- /dev/null
@@ -0,0 +1,404 @@
+/* ====================================================================
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+==================================================================== */
+package org.apache.poi.hwpf.extractor;
+
+import java.io.Closeable;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.lang.reflect.Constructor;
+import java.lang.reflect.Field;
+import java.lang.reflect.Method;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.poi.hwpf.HWPFDocument;
+import org.apache.poi.hwpf.HWPFDocumentCore;
+import org.apache.poi.hwpf.HWPFOldDocument;
+import org.apache.poi.hwpf.OldWordFileFormatException;
+import org.apache.poi.hwpf.model.CHPX;
+import org.apache.poi.hwpf.model.ListLevel;
+import org.apache.poi.hwpf.model.ListTables;
+import org.apache.poi.hwpf.usermodel.BorderCode;
+import org.apache.poi.hwpf.usermodel.CharacterRun;
+import org.apache.poi.hwpf.usermodel.Paragraph;
+import org.apache.poi.hwpf.usermodel.Range;
+import org.apache.poi.hwpf.usermodel.Section;
+import org.apache.poi.hwpf.usermodel.SectionProperties;
+import org.apache.poi.hwpf.usermodel.TableIterator;
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;
+import org.apache.poi.util.POILogFactory;
+import org.apache.poi.util.POILogger;
+
+public class AbstractWordUtils
+{
+    // Shared empty string; returned by getBulletText() when a list level has
+    // no number text.
+    static final String EMPTY = "";
+
+    private static final POILogger logger = POILogFactory
+            .getLogger( AbstractWordUtils.class );
+
+    // Number of twips in one inch.
+    public static final float TWIPS_PER_INCH = 1440.0f;
+    // Number of twips in one point.
+    public static final int TWIPS_PER_PT = 20;
+
+    /**
+     * Closes the resource; any exception raised while closing is logged at
+     * ERROR level and not propagated.
+     * 
+     * @param closeable
+     *            resource to close
+     */
+    static void closeQuietly( final Closeable closeable )
+    {
+        try
+        {
+            closeable.close();
+        }
+        catch ( Exception failure )
+        {
+            final String message = "Unable to close resource: " + failure;
+            logger.log( POILogger.ERROR, message, failure );
+        }
+    }
+
+    /**
+     * Null-safe string comparison.
+     * 
+     * @return <tt>true</tt> if both arguments are <tt>null</tt> or both are
+     *         equal strings
+     */
+    static boolean equals( String str1, String str2 )
+    {
+        if ( str1 == null )
+        {
+            return str2 == null;
+        }
+
+        return str1.equals( str2 );
+    }
+
+    // XXX incorporate into Range
+    /**
+     * Collects the character runs of the range. Every non-null CHPX whose
+     * <tt>[start, end]</tt> interval intersects or touches the range offsets
+     * is converted into a {@link CharacterRun}; null conversion results are
+     * dropped.
+     * 
+     * @param range
+     *            range to inspect
+     * @return list of character runs, in CHPX order
+     */
+    static List<CharacterRun> findCharacterRuns( Range range )
+    {
+        final int rangeStart = range.getStartOffset();
+        final int rangeEnd = range.getEndOffset();
+
+        final List<CharacterRun> runs = new ArrayList<CharacterRun>();
+        for ( CHPX chpx : getCharacters( range ) )
+        {
+            if ( chpx == null )
+                continue;
+
+            final int overlapStart = Math.max( rangeStart, chpx.getStart() );
+            final int overlapEnd = Math.min( rangeEnd, chpx.getEnd() );
+            if ( overlapStart > overlapEnd )
+                continue;
+
+            final CharacterRun run = getCharacterRun( range, chpx );
+            if ( run != null )
+            {
+                runs.add( run );
+            }
+        }
+
+        return runs;
+    }
+
+    /**
+     * Maps the border type code of a Word border to a border style keyword.
+     * 
+     * @param borderCode
+     *            border description, must not be <tt>null</tt>
+     * @return one of <tt>solid</tt>, <tt>double</tt>, <tt>dotted</tt>,
+     *         <tt>dashed</tt>, <tt>ridge</tt> or <tt>groove</tt>;
+     *         <tt>solid</tt> for any unlisted code
+     * @throws IllegalArgumentException
+     *             if <tt>borderCode</tt> is <tt>null</tt>
+     */
+    public static String getBorderType( BorderCode borderCode )
+    {
+        if ( borderCode == null )
+            throw new IllegalArgumentException( "borderCode is null" );
+
+        switch ( borderCode.getBorderType() )
+        {
+        case 3:
+        case 10:
+        case 11:
+        case 12:
+        case 13:
+        case 14:
+        case 15:
+        case 16:
+        case 17:
+        case 18:
+        case 19:
+        case 21:
+            return "double";
+        case 6:
+        case 9:
+            return "dotted";
+        case 7:
+        case 8:
+        case 22:
+        case 23:
+            return "dashed";
+        case 24:
+            return "ridge";
+        case 25:
+            // the border-style keyword is "groove"; "grooved" is not a
+            // valid XSL-FO/CSS value
+            return "groove";
+        default:
+            // includes codes 1, 2, 5 and 20
+            return "solid";
+        }
+    }
+
+    /**
+     * Formats the line width of a border as a length in points. The line
+     * width is divided by 8: the quotient becomes the integer part and the
+     * remainder, multiplied by 125, is appended after the decimal point.
+     * 
+     * @param borderCode
+     *            border description
+     * @return width string ending with <tt>pt</tt>, for example
+     *         <tt>1.500pt</tt> for a line width of 12
+     */
+    public static String getBorderWidth( BorderCode borderCode )
+    {
+        final int eighths = borderCode.getLineWidth();
+        final int wholePoints = eighths / 8;
+        final int remainder = eighths % 8;
+
+        return wholePoints + "." + ( 1000 / 8 * remainder ) + "pt";
+    }
+
+    /**
+     * Builds the list label of a paragraph from the number text of its list
+     * level. Characters below 9 in the number text are replaced by the
+     * formatted start value of the list level with that index; all other
+     * characters are copied. A tab or a space is appended depending on the
+     * level's <tt>_ixchFollow</tt> value (0 or 1).
+     * <p>
+     * Side effect: when a placeholder refers to the paragraph's own level,
+     * that level's start value is incremented.
+     * 
+     * @param listTables
+     *            list tables of the document
+     * @param paragraph
+     *            paragraph whose list level index is used
+     * @param listId
+     *            id of the list
+     * @return label text, or an empty string if the level has no number text
+     */
+    public static String getBulletText( ListTables listTables,
+            Paragraph paragraph, int listId )
+    {
+        final ListLevel ownLevel = listTables.getLevel( listId,
+                paragraph.getIlvl() );
+
+        final String numberText = ownLevel.getNumberText();
+        if ( numberText == null )
+            return EMPTY;
+
+        final StringBuilder label = new StringBuilder();
+        for ( int i = 0; i < numberText.length(); i++ )
+        {
+            final char ch = numberText.charAt( i );
+            if ( ch >= 9 )
+            {
+                label.append( ch );
+                continue;
+            }
+
+            // placeholder: ch is the index of the level to take a number from
+            final ListLevel referenced = listTables.getLevel( listId, ch );
+            label.append( NumberFormatter.getNumber( referenced.getStartAt(),
+                    ownLevel.getNumberFormat() ) );
+
+            if ( referenced == ownLevel )
+            {
+                referenced.setStartAt( referenced.getStartAt() + 1 );
+            }
+        }
+
+        final byte follow = getIxchFollow( ownLevel );
+        if ( follow == 0 )
+        {
+            label.append( "\t" );
+        }
+        else if ( follow == 1 )
+        {
+            label.append( " " );
+        }
+
+        return label.toString();
+    }
+
+    /**
+     * Invokes the non-public <tt>Range.getCharacterRun(CHPX)</tt> method via
+     * reflection.
+     * 
+     * @throws Error
+     *             wrapping any exception raised by the reflective call
+     */
+    private static CharacterRun getCharacterRun( Range range, CHPX chpx )
+    {
+        try
+        {
+            final Method accessor = Range.class.getDeclaredMethod(
+                    "getCharacterRun", CHPX.class );
+            accessor.setAccessible( true );
+
+            final Object run = accessor.invoke( range, chpx );
+            return (CharacterRun) run;
+        }
+        catch ( Exception reflectionFailure )
+        {
+            throw new Error( reflectionFailure );
+        }
+    }
+
+    /**
+     * Reads the non-public <tt>_characters</tt> field of the range via
+     * reflection.
+     * 
+     * @throws Error
+     *             wrapping any exception raised by the reflective access
+     */
+    private static List<CHPX> getCharacters( Range range )
+    {
+        try
+        {
+            final Field charactersField = Range.class
+                    .getDeclaredField( "_characters" );
+            charactersField.setAccessible( true );
+
+            final Object characters = charactersField.get( range );
+            return (List<CHPX>) characters;
+        }
+        catch ( Exception reflectionFailure )
+        {
+            throw new Error( reflectionFailure );
+        }
+    }
+
+    /**
+     * Maps a color index to a color name.
+     * 
+     * @param ico
+     *            color index
+     * @return the name for indexes 1 to 16, <tt>black</tt> for any other
+     *         value
+     */
+    public static String getColor( int ico )
+    {
+        // names for ico values 1..16, in that order
+        final String[] names = { "black", "blue", "cyan", "green", "magenta",
+                "red", "yellow", "white", "darkblue", "darkcyan",
+                "darkgreen", "darkmagenta", "darkred", "darkyellow",
+                "darkgray", "lightgray" };
+
+        if ( ico < 1 || ico > names.length )
+            return "black";
+
+        return names[ico - 1];
+    }
+
+    /**
+     * Reads the non-public <tt>_ixchFollow</tt> field of the list level via
+     * reflection.
+     * 
+     * @throws Error
+     *             wrapping any exception raised by the reflective access
+     */
+    public static byte getIxchFollow( ListLevel listLevel )
+    {
+        try
+        {
+            final Field followField = ListLevel.class
+                    .getDeclaredField( "_ixchFollow" );
+            followField.setAccessible( true );
+
+            final Byte follow = (Byte) followField.get( listLevel );
+            return follow.byteValue();
+        }
+        catch ( Exception reflectionFailure )
+        {
+            throw new Error( reflectionFailure );
+        }
+    }
+
+    /**
+     * Maps a justification code to a text alignment keyword.
+     * 
+     * @param js
+     *            justification code
+     * @return the keyword for codes 0 to 9, an empty string for any other
+     *         value
+     */
+    public static String getJustification( int js )
+    {
+        // keywords for js values 0..9, in that order
+        final String[] keywords = { "start", "center", "end", "justify",
+                "justify", "center", "left", "start", "end", "justify" };
+
+        if ( js < 0 || js >= keywords.length )
+            return "";
+
+        return keywords[js];
+    }
+
+    /**
+     * Returns the label of a numbered list item. Only the plain decimal
+     * representation is implemented; for any <tt>format</tt> other than 0 a
+     * warning is logged and the decimal representation is still returned.
+     * 
+     * @param number
+     *            item number
+     * @param format
+     *            number format code
+     * @return <tt>number</tt> as a decimal string
+     */
+    public static String getListItemNumberLabel( int number, int format )
+    {
+        // report through the class logger rather than System.err, as the
+        // rest of this class does
+        if ( format != 0 )
+            logger.log( POILogger.WARN, "NYI: getListItemNumberLabel(): "
+                    + format );
+
+        return String.valueOf( number );
+    }
+
+    /**
+     * Reads the non-public <tt>_props</tt> field of the section via
+     * reflection.
+     * 
+     * @throws Error
+     *             wrapping any exception raised by the reflective access
+     */
+    public static SectionProperties getSectionProperties( Section section )
+    {
+        try
+        {
+            final Field propsField = Section.class
+                    .getDeclaredField( "_props" );
+            propsField.setAccessible( true );
+
+            final Object props = propsField.get( section );
+            return (SectionProperties) props;
+        }
+        catch ( Exception reflectionFailure )
+        {
+            throw new Error( reflectionFailure );
+        }
+    }
+
+    /**
+     * @return <tt>true</tt> if the string is <tt>null</tt> or has zero
+     *         length
+     */
+    static boolean isEmpty( String str )
+    {
+        if ( str == null )
+        {
+            return true;
+        }
+
+        return str.length() == 0;
+    }
+
+    /**
+     * @return <tt>true</tt> if the string is not <tt>null</tt> and has at
+     *         least one character
+     */
+    static boolean isNotEmpty( String str )
+    {
+        return str != null && str.length() != 0;
+    }
+
+    /**
+     * Loads a Word document from a file. The file stream is closed before
+     * returning, whether or not loading succeeded.
+     * 
+     * @param docFile
+     *            file to read
+     * @return the loaded document
+     * @throws IOException
+     *             if the file cannot be opened or read
+     */
+    public static HWPFDocumentCore loadDoc( File docFile ) throws IOException
+    {
+        final FileInputStream fileStream = new FileInputStream( docFile );
+        try
+        {
+            final HWPFDocumentCore loaded = loadDoc( fileStream );
+            return loaded;
+        }
+        finally
+        {
+            closeQuietly( fileStream );
+        }
+    }
+
+    /**
+     * Loads a Word document from a stream. The stream is first opened as a
+     * {@link HWPFDocument}; if that raises
+     * {@link OldWordFileFormatException}, it is opened as a
+     * {@link HWPFOldDocument} instead.
+     * 
+     * @param inputStream
+     *            stream to read; it is not closed by this method
+     * @return the loaded document
+     * @throws IOException
+     *             if the stream cannot be read
+     */
+    public static HWPFDocumentCore loadDoc( InputStream inputStream )
+            throws IOException
+    {
+        final POIFSFileSystem fileSystem = HWPFDocumentCore
+                .verifyAndBuildPOIFS( inputStream );
+
+        HWPFDocumentCore loaded;
+        try
+        {
+            loaded = new HWPFDocument( fileSystem );
+        }
+        catch ( OldWordFileFormatException oldFormat )
+        {
+            loaded = new HWPFOldDocument( fileSystem );
+        }
+        return loaded;
+    }
+
+    /**
+     * Creates a {@link TableIterator} through its non-public
+     * <tt>(Range, int)</tt> constructor via reflection.
+     * 
+     * @param range
+     *            range to iterate over
+     * @param level
+     *            second constructor argument (used by callers as the table
+     *            nesting level)
+     * @throws Error
+     *             wrapping any exception raised by the reflective call
+     */
+    public static TableIterator newTableIterator( Range range, int level )
+    {
+        try
+        {
+            final Constructor<TableIterator> hiddenConstructor = TableIterator.class
+                    .getDeclaredConstructor( Range.class, int.class );
+            hiddenConstructor.setAccessible( true );
+
+            final Integer boxedLevel = Integer.valueOf( level );
+            return hiddenConstructor.newInstance( range, boxedLevel );
+        }
+        catch ( Exception reflectionFailure )
+        {
+            throw new Error( reflectionFailure );
+        }
+    }
+
+}
diff --git a/src/scratchpad/src/org/apache/poi/hwpf/extractor/FoDocumentFacade.java b/src/scratchpad/src/org/apache/poi/hwpf/extractor/FoDocumentFacade.java
new file mode 100644 (file)
index 0000000..5e474bf
--- /dev/null
@@ -0,0 +1,201 @@
+/* ====================================================================
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+==================================================================== */
+package org.apache.poi.hwpf.extractor;
+
+import org.w3c.dom.Document;
+import org.w3c.dom.Element;
+import org.w3c.dom.Text;
+
+/**
+ * Convenience wrapper around a DOM {@link Document} for building XSL-FO
+ * trees. On construction it creates the <tt>fo:root</tt> element with a
+ * <tt>fo:layout-master-set</tt> child; the remaining methods create (and, for
+ * the <tt>add*</tt> variants, also attach) elements in the XSL-FO namespace.
+ * <p>
+ * All factory methods are public so that code holding a facade instance can
+ * use them regardless of its package.
+ */
+public class FoDocumentFacade
+{
+    private static final String NS_XSLFO = "http://www.w3.org/1999/XSL/Format";
+
+    protected final Document document;
+    protected final Element layoutMasterSet;
+    protected final Element root;
+
+    /**
+     * Creates the facade and appends <tt>fo:root</tt> (holding an empty
+     * <tt>fo:layout-master-set</tt>) to the given document.
+     *
+     * @param document
+     *            DOM document that receives the FO tree
+     */
+    public FoDocumentFacade( Document document )
+    {
+        this.document = document;
+
+        root = document.createElementNS( NS_XSLFO, "fo:root" );
+        document.appendChild( root );
+
+        layoutMasterSet = document.createElementNS( NS_XSLFO,
+                "fo:layout-master-set" );
+        root.appendChild( layoutMasterSet );
+    }
+
+    /**
+     * Creates a <tt>fo:flow</tt> with the given <tt>flow-name</tt> and
+     * appends it to the page sequence.
+     *
+     * @return the new flow element
+     */
+    public Element addFlowToPageSequence( final Element pageSequence,
+            String flowName )
+    {
+        final Element flow = document.createElementNS( NS_XSLFO, "fo:flow" );
+        flow.setAttribute( "flow-name", flowName );
+        pageSequence.appendChild( flow );
+
+        return flow;
+    }
+
+    /**
+     * Creates a <tt>fo:list-item</tt> and appends it to the list block.
+     *
+     * @return the new list item element
+     */
+    public Element addListItem( Element listBlock )
+    {
+        Element result = createListItem();
+        listBlock.appendChild( result );
+        return result;
+    }
+
+    /**
+     * Creates a <tt>fo:list-item-body</tt> and appends it to the list item.
+     *
+     * @return the new list item body element
+     */
+    public Element addListItemBody( Element listItem )
+    {
+        Element result = createListItemBody();
+        listItem.appendChild( result );
+        return result;
+    }
+
+    /**
+     * Creates a <tt>fo:list-item-label</tt> showing the given text and
+     * appends it to the list item.
+     *
+     * @return the new list item label element
+     */
+    public Element addListItemLabel( Element listItem, String text )
+    {
+        Element result = createListItemLabel( text );
+        listItem.appendChild( result );
+        return result;
+    }
+
+    /**
+     * Creates a <tt>fo:page-sequence</tt> whose <tt>master-reference</tt> is
+     * the given page master name and appends it to the root element.
+     *
+     * @return the new page sequence element
+     */
+    public Element addPageSequence( String pageMaster )
+    {
+        final Element pageSequence = document.createElementNS( NS_XSLFO,
+                "fo:page-sequence" );
+        pageSequence.setAttribute( "master-reference", pageMaster );
+        root.appendChild( pageSequence );
+        return pageSequence;
+    }
+
+    /**
+     * Creates a <tt>fo:region-body</tt> and appends it to the page master.
+     *
+     * @return the new region body element
+     */
+    public Element addRegionBody( Element pageMaster )
+    {
+        final Element regionBody = document.createElementNS( NS_XSLFO,
+                "fo:region-body" );
+        pageMaster.appendChild( regionBody );
+
+        return regionBody;
+    }
+
+    /**
+     * Creates a <tt>fo:simple-page-master</tt> with the given
+     * <tt>master-name</tt> and appends it to the layout master set.
+     *
+     * @return the new page master element
+     */
+    public Element addSimplePageMaster( String masterName )
+    {
+        final Element simplePageMaster = document.createElementNS( NS_XSLFO,
+                "fo:simple-page-master" );
+        simplePageMaster.setAttribute( "master-name", masterName );
+        layoutMasterSet.appendChild( simplePageMaster );
+
+        return simplePageMaster;
+    }
+
+    /**
+     * Creates a detached <tt>fo:basic-link</tt> whose
+     * <tt>external-destination</tt> is the given value.
+     */
+    public Element createBasicLinkExternal( String externalDestination )
+    {
+        final Element basicLink = document.createElementNS( NS_XSLFO,
+                "fo:basic-link" );
+        basicLink.setAttribute( "external-destination", externalDestination );
+        return basicLink;
+    }
+
+    /**
+     * Creates a detached <tt>fo:basic-link</tt> whose
+     * <tt>internal-destination</tt> is the given value.
+     */
+    public Element createBasicLinkInternal( String internalDestination )
+    {
+        final Element basicLink = document.createElementNS( NS_XSLFO,
+                "fo:basic-link" );
+        basicLink.setAttribute( "internal-destination", internalDestination );
+        return basicLink;
+    }
+
+    /** Creates a detached <tt>fo:block</tt> element. */
+    public Element createBlock()
+    {
+        return document.createElementNS( NS_XSLFO, "fo:block" );
+    }
+
+    /**
+     * Creates a detached <tt>fo:external-graphic</tt> whose <tt>src</tt> is
+     * <tt>url('source')</tt>. The source is inserted verbatim between the
+     * single quotes; no escaping is applied.
+     */
+    public Element createExternalGraphic( String source )
+    {
+        Element result = document.createElementNS( NS_XSLFO,
+                "fo:external-graphic" );
+        result.setAttribute( "src", "url('" + source + "')" );
+        return result;
+    }
+
+    /** Creates a detached <tt>fo:inline</tt> element. */
+    public Element createInline()
+    {
+        return document.createElementNS( NS_XSLFO, "fo:inline" );
+    }
+
+    /** Creates a detached <tt>fo:leader</tt> element. */
+    public Element createLeader()
+    {
+        return document.createElementNS( NS_XSLFO, "fo:leader" );
+    }
+
+    /** Creates a detached <tt>fo:list-block</tt> element. */
+    public Element createListBlock()
+    {
+        return document.createElementNS( NS_XSLFO, "fo:list-block" );
+    }
+
+    /** Creates a detached <tt>fo:list-item</tt> element. */
+    public Element createListItem()
+    {
+        return document.createElementNS( NS_XSLFO, "fo:list-item" );
+    }
+
+    /** Creates a detached <tt>fo:list-item-body</tt> element. */
+    public Element createListItemBody()
+    {
+        return document.createElementNS( NS_XSLFO, "fo:list-item-body" );
+    }
+
+    /**
+     * Creates a detached <tt>fo:list-item-label</tt> that contains a single
+     * <tt>fo:block</tt> holding the given text.
+     */
+    public Element createListItemLabel( String text )
+    {
+        Element result = document.createElementNS( NS_XSLFO,
+                "fo:list-item-label" );
+        Element block = createBlock();
+        block.appendChild( document.createTextNode( text ) );
+        result.appendChild( block );
+        return result;
+    }
+
+    /** Creates a detached <tt>fo:table</tt> element. */
+    public Element createTable()
+    {
+        return document.createElementNS( NS_XSLFO, "fo:table" );
+    }
+
+    /** Creates a detached <tt>fo:table-body</tt> element. */
+    public Element createTableBody()
+    {
+        return document.createElementNS( NS_XSLFO, "fo:table-body" );
+    }
+
+    /** Creates a detached <tt>fo:table-cell</tt> element. */
+    public Element createTableCell()
+    {
+        return document.createElementNS( NS_XSLFO, "fo:table-cell" );
+    }
+
+    /** Creates a detached <tt>fo:table-header</tt> element. */
+    public Element createTableHeader()
+    {
+        return document.createElementNS( NS_XSLFO, "fo:table-header" );
+    }
+
+    /** Creates a detached <tt>fo:table-row</tt> element. */
+    public Element createTableRow()
+    {
+        return document.createElementNS( NS_XSLFO, "fo:table-row" );
+    }
+
+    /** Creates a detached text node with the given content. */
+    public Text createText( String data )
+    {
+        return document.createTextNode( data );
+    }
+
+    /**
+     * @return the DOM document this facade writes into
+     */
+    public Document getDocument()
+    {
+        return document;
+    }
+
+}
diff --git a/src/scratchpad/src/org/apache/poi/hwpf/extractor/HtmlDocumentFacade.java b/src/scratchpad/src/org/apache/poi/hwpf/extractor/HtmlDocumentFacade.java
new file mode 100644 (file)
index 0000000..5e2b1f0
--- /dev/null
@@ -0,0 +1,107 @@
+/* ====================================================================
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+==================================================================== */
+package org.apache.poi.hwpf.extractor;
+
+import org.w3c.dom.Document;
+import org.w3c.dom.Element;
+import org.w3c.dom.Text;
+
+/**
+ * Convenience wrapper around a DOM {@link Document} for building HTML trees.
+ * On construction it creates the <tt>html</tt> root with <tt>head</tt> and
+ * <tt>body</tt> children (in that order); the remaining methods create
+ * detached elements and text nodes owned by the same document.
+ */
+public class HtmlDocumentFacade
+{
+
+    protected final Element body;
+    protected final Document document;
+    protected final Element head;
+    protected final Element html;
+
+    /**
+     * Creates the facade and appends the <tt>html</tt>/<tt>head</tt>/
+     * <tt>body</tt> skeleton to the given document.
+     *
+     * @param document
+     *            DOM document that receives the HTML tree
+     */
+    public HtmlDocumentFacade( Document document )
+    {
+        this.document = document;
+
+        this.html = document.createElement( "html" );
+        document.appendChild( this.html );
+
+        this.head = document.createElement( "head" );
+        this.body = document.createElement( "body" );
+
+        this.html.appendChild( this.head );
+        this.html.appendChild( this.body );
+    }
+
+    /**
+     * Creates a detached <tt>a</tt> element whose <tt>href</tt> is the given
+     * destination.
+     */
+    public Element createHyperlink( String internalDestination )
+    {
+        final Element anchor = document.createElement( "a" );
+        anchor.setAttribute( "href", internalDestination );
+        return anchor;
+    }
+
+    /** Creates a detached <tt>li</tt> element. */
+    public Element createListItem()
+    {
+        final Element listItem = document.createElement( "li" );
+        return listItem;
+    }
+
+    /** Creates a detached <tt>p</tt> element. */
+    public Element createParagraph()
+    {
+        final Element paragraph = document.createElement( "p" );
+        return paragraph;
+    }
+
+    /** Creates a detached <tt>table</tt> element. */
+    public Element createTable()
+    {
+        final Element table = document.createElement( "table" );
+        return table;
+    }
+
+    /** Creates a detached <tt>tbody</tt> element. */
+    public Element createTableBody()
+    {
+        final Element tableBody = document.createElement( "tbody" );
+        return tableBody;
+    }
+
+    /** Creates a detached <tt>td</tt> element. */
+    public Element createTableCell()
+    {
+        final Element cell = document.createElement( "td" );
+        return cell;
+    }
+
+    /** Creates a detached <tt>thead</tt> element. */
+    public Element createTableHeader()
+    {
+        final Element tableHeader = document.createElement( "thead" );
+        return tableHeader;
+    }
+
+    /** Creates a detached <tt>th</tt> element. */
+    public Element createTableHeaderCell()
+    {
+        final Element headerCell = document.createElement( "th" );
+        return headerCell;
+    }
+
+    /** Creates a detached <tt>tr</tt> element. */
+    public Element createTableRow()
+    {
+        final Element row = document.createElement( "tr" );
+        return row;
+    }
+
+    /** Creates a detached text node with the given content. */
+    public Text createText( String data )
+    {
+        final Text textNode = document.createTextNode( data );
+        return textNode;
+    }
+
+    /** Creates a detached <tt>ul</tt> element. */
+    public Element createUnorderedList()
+    {
+        final Element list = document.createElement( "ul" );
+        return list;
+    }
+
+    /**
+     * @return the DOM document this facade writes into
+     */
+    public Document getDocument()
+    {
+        return document;
+    }
+
+}
index 4189d7c32d330e843178be1a61b6c5d31826629e..67f6bb17d15de262ec89b7afb84db0a2ac9410d3 100644 (file)
@@ -1,32 +1,27 @@
-/*
- *  ====================================================================
- *    Licensed to the Apache Software Foundation (ASF) under one or more
- *    contributor license agreements.  See the NOTICE file distributed with
- *    this work for additional information regarding copyright ownership.
- *    The ASF licenses this file to You under the Apache License, Version 2.0
- *    (the "License"); you may not use this file except in compliance with
- *    the License.  You may obtain a copy of the License at
- *
- *        http://www.apache.org/licenses/LICENSE-2.0
- *
- *    Unless required by applicable law or agreed to in writing, software
- *    distributed under the License is distributed on an "AS IS" BASIS,
- *    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *    See the License for the specific language governing permissions and
- *    limitations under the License.
- * ====================================================================
- */
+/* ====================================================================
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+==================================================================== */
 package org.apache.poi.hwpf.extractor;
 
 import java.io.File;
-import java.io.FileInputStream;
 import java.io.FileWriter;
-import java.io.IOException;
 import java.util.HashMap;
+import java.util.List;
 import java.util.Map;
 import java.util.Stack;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
 
 import javax.xml.parsers.DocumentBuilderFactory;
 import javax.xml.transform.OutputKeys;
@@ -36,8 +31,10 @@ import javax.xml.transform.dom.DOMSource;
 import javax.xml.transform.stream.StreamResult;
 
 import org.apache.poi.hwpf.HWPFDocument;
+import org.apache.poi.hwpf.HWPFDocumentCore;
 import org.apache.poi.hwpf.model.ListFormatOverride;
 import org.apache.poi.hwpf.model.ListTables;
+import org.apache.poi.hwpf.usermodel.BorderCode;
 import org.apache.poi.hwpf.usermodel.CharacterRun;
 import org.apache.poi.hwpf.usermodel.Paragraph;
 import org.apache.poi.hwpf.usermodel.Picture;
@@ -54,12 +51,10 @@ import org.w3c.dom.Document;
 import org.w3c.dom.Element;
 import org.w3c.dom.Text;
 
-import static org.apache.poi.hwpf.extractor.WordToFoUtils.TWIPS_PER_INCH;
-
 /**
  * @author Sergey Vladimirov (vlsergey {at} gmail {dot} com)
  */
-public class WordToFoExtractor extends AbstractToFoExtractor
+public class WordToFoExtractor extends AbstractWordExtractor
 {
 
     /**
@@ -84,35 +79,55 @@ public class WordToFoExtractor extends AbstractToFoExtractor
         }
     }
 
-    private static final byte BEL_MARK = 7;
-
-    private static final byte FIELD_BEGIN_MARK = 19;
-
-    private static final byte FIELD_END_MARK = 21;
-
-    private static final byte FIELD_SEPARATOR_MARK = 20;
-
     private static final POILogger logger = POILogFactory
             .getLogger( WordToFoExtractor.class );
 
-    private static HWPFDocument loadDoc( File docFile ) throws IOException
+    /**
+     * Maps the numeric border type of a {@link BorderCode} onto a
+     * border-style keyword (<tt>solid</tt>, <tt>double</tt>,
+     * <tt>dotted</tt>, <tt>dashed</tt>, <tt>ridge</tt> or <tt>groove</tt>).
+     * Border types without an explicit case fall back to <tt>solid</tt>.
+     *
+     * @param borderCode
+     *            border description, must not be <tt>null</tt>
+     * @return border-style keyword for the border type
+     * @throws IllegalArgumentException
+     *             if <tt>borderCode</tt> is <tt>null</tt>
+     */
+    public static String getBorderType( BorderCode borderCode )
     {
-        final FileInputStream istream = new FileInputStream( docFile );
-        try
+        if ( borderCode == null )
+            throw new IllegalArgumentException( "borderCode is null" );
+
+        switch ( borderCode.getBorderType() )
         {
-            return new HWPFDocument( istream );
-        }
-        finally
-        {
-            try
-            {
-                istream.close();
-            }
-            catch ( Exception exc )
-            {
-                logger.log( POILogger.ERROR,
-                        "Unable to close FileInputStream: " + exc, exc );
-            }
+        case 1:
+        case 2:
+            return "solid";
+        case 3:
+            return "double";
+        case 5:
+            return "solid";
+        case 6:
+            return "dotted";
+        case 7:
+        case 8:
+            return "dashed";
+        case 9:
+            return "dotted";
+        case 10:
+        case 11:
+        case 12:
+        case 13:
+        case 14:
+        case 15:
+        case 16:
+        case 17:
+        case 18:
+        case 19:
+            return "double";
+        case 20:
+            return "solid";
+        case 21:
+            return "double";
+        case 22:
+            return "dashed";
+        case 23:
+            return "dashed";
+        case 24:
+            return "ridge";
+        case 25:
+            // the border-style keyword is "groove"; "grooved" is not a
+            // valid value and would be rejected by FO/CSS processors
+            return "groove";
+        default:
+            return "solid";
         }
     }
 
@@ -160,7 +175,7 @@ public class WordToFoExtractor extends AbstractToFoExtractor
 
     static Document process( File docFile ) throws Exception
     {
-        final HWPFDocument hwpfDocument = loadDoc( docFile );
+        final HWPFDocumentCore hwpfDocument = WordToFoUtils.loadDoc( docFile );
         WordToFoExtractor wordToFoExtractor = new WordToFoExtractor(
                 DocumentBuilderFactory.newInstance().newDocumentBuilder()
                         .newDocument() );
@@ -170,6 +185,8 @@ public class WordToFoExtractor extends AbstractToFoExtractor
 
     private final Stack<BlockProperies> blocksProperies = new Stack<BlockProperies>();
 
+    protected final FoDocumentFacade foDocumentFacade;
+
     /**
      * Creates new instance of {@link WordToFoExtractor}. Can be used for output
      * several {@link HWPFDocument}s into single FO document.
@@ -180,27 +197,28 @@ public class WordToFoExtractor extends AbstractToFoExtractor
      */
     public WordToFoExtractor( Document document )
     {
-        super( document );
+        this.foDocumentFacade = new FoDocumentFacade( document );
     }
 
     protected String createPageMaster( SectionProperties sep, String type,
             int section )
     {
-        float height = sep.getYaPage() / TWIPS_PER_INCH;
-        float width = sep.getXaPage() / TWIPS_PER_INCH;
-        float leftMargin = sep.getDxaLeft() / TWIPS_PER_INCH;
-        float rightMargin = sep.getDxaRight() / TWIPS_PER_INCH;
-        float topMargin = sep.getDyaTop() / TWIPS_PER_INCH;
-        float bottomMargin = sep.getDyaBottom() / TWIPS_PER_INCH;
+        float height = sep.getYaPage() / WordToFoUtils.TWIPS_PER_INCH;
+        float width = sep.getXaPage() / WordToFoUtils.TWIPS_PER_INCH;
+        float leftMargin = sep.getDxaLeft() / WordToFoUtils.TWIPS_PER_INCH;
+        float rightMargin = sep.getDxaRight() / WordToFoUtils.TWIPS_PER_INCH;
+        float topMargin = sep.getDyaTop() / WordToFoUtils.TWIPS_PER_INCH;
+        float bottomMargin = sep.getDyaBottom() / WordToFoUtils.TWIPS_PER_INCH;
 
         // add these to the header
         String pageMasterName = type + "-page" + section;
 
-        Element pageMaster = addSimplePageMaster( pageMasterName );
+        Element pageMaster = foDocumentFacade
+                .addSimplePageMaster( pageMasterName );
         pageMaster.setAttribute( "page-height", height + "in" );
         pageMaster.setAttribute( "page-width", width + "in" );
 
-        Element regionBody = addRegionBody( pageMaster );
+        Element regionBody = foDocumentFacade.addRegionBody( pageMaster );
         regionBody.setAttribute( "margin", topMargin + "in " + rightMargin
                 + "in " + bottomMargin + "in " + leftMargin + "in" );
 
@@ -216,12 +234,13 @@ public class WordToFoExtractor extends AbstractToFoExtractor
 
         if ( sep.getCcolM1() > 0 )
         {
-            regionBody
-                    .setAttribute( "column-count", "" + (sep.getCcolM1() + 1) );
+            regionBody.setAttribute( "column-count", ""
+                    + ( sep.getCcolM1() + 1 ) );
             if ( sep.getFEvenlySpaced() )
             {
                 regionBody.setAttribute( "column-gap",
-                        (sep.getDxaColumns() / TWIPS_PER_INCH) + "in" );
+                        ( sep.getDxaColumns() / WordToFoUtils.TWIPS_PER_INCH )
+                                + "in" );
             }
             else
             {
@@ -232,171 +251,55 @@ public class WordToFoExtractor extends AbstractToFoExtractor
         return pageMasterName;
     }
 
-    protected boolean processCharacters( HWPFDocument hwpfDocument,
-            int currentTableLevel, Paragraph paragraph, final Element block,
-            final int start, final int end )
+    public Document getDocument()
     {
-        boolean haveAnyText = false;
-
-        for ( int c = start; c < end; c++ )
-        {
-            CharacterRun characterRun = paragraph.getCharacterRun( c );
-
-            if ( hwpfDocument.getPicturesTable().hasPicture( characterRun ) )
-            {
-                Picture picture = hwpfDocument.getPicturesTable()
-                        .extractPicture( characterRun, true );
-
-                processImage( block, characterRun.text().charAt( 0 ) == 0x01,
-                        picture );
-                continue;
-            }
-
-            String text = characterRun.text();
-            if ( text.getBytes().length == 0 )
-                continue;
-
-            if ( text.getBytes()[0] == FIELD_BEGIN_MARK )
-            {
-                int skipTo = tryField( hwpfDocument, paragraph,
-                        currentTableLevel, c, block );
-
-                if ( skipTo != c )
-                {
-                    c = skipTo;
-                    continue;
-                }
-
-                continue;
-            }
-            if ( text.getBytes()[0] == FIELD_SEPARATOR_MARK )
-            {
-                // shall not appear without FIELD_BEGIN_MARK
-                continue;
-            }
-            if ( text.getBytes()[0] == FIELD_END_MARK )
-            {
-                // shall not appear without FIELD_BEGIN_MARK
-                continue;
-            }
-
-            if ( characterRun.isSpecialCharacter() || characterRun.isObj()
-                    || characterRun.isOle2() )
-            {
-                continue;
-            }
-
-            BlockProperies blockProperies = this.blocksProperies.peek();
-            Element inline = createInline();
-            if ( characterRun.isBold() != blockProperies.pBold )
-            {
-                WordToFoUtils.setBold( inline, characterRun.isBold() );
-            }
-            if ( characterRun.isItalic() != blockProperies.pItalic )
-            {
-                WordToFoUtils.setItalic( inline, characterRun.isItalic() );
-            }
-            if ( !WordToFoUtils.equals( characterRun.getFontName(),
-                    blockProperies.pFontName ) )
-            {
-                WordToFoUtils
-                        .setFontFamily( inline, characterRun.getFontName() );
-            }
-            if ( characterRun.getFontSize() / 2 != blockProperies.pFontSize )
-            {
-                WordToFoUtils.setFontSize( inline,
-                        characterRun.getFontSize() / 2 );
-            }
-            WordToFoUtils.setCharactersProperties( characterRun, inline );
-            block.appendChild( inline );
-
-            if ( text.endsWith( "\r" )
-                    || (text.charAt( text.length() - 1 ) == BEL_MARK && currentTableLevel != 0) )
-                text = text.substring( 0, text.length() - 1 );
-
-            Text textNode = createText( text );
-            inline.appendChild( textNode );
-
-            haveAnyText |= text.trim().length() != 0;
-        }
-
-        return haveAnyText;
+        return foDocumentFacade.getDocument();
     }
 
-    public void processDocument( HWPFDocument hwpfDocument )
+    @Override
+    protected void outputCharacters( Element block, CharacterRun characterRun,
+            String text )
     {
-        final Range range = hwpfDocument.getRange();
-
-        for ( int s = 0; s < range.numSections(); s++ )
+        BlockProperies blockProperies = this.blocksProperies.peek();
+        Element inline = foDocumentFacade.createInline();
+        if ( characterRun.isBold() != blockProperies.pBold )
         {
-            processSection( hwpfDocument, range.getSection( s ), s );
+            WordToFoUtils.setBold( inline, characterRun.isBold() );
         }
-    }
-
-    protected void processField( HWPFDocument hwpfDocument,
-            Element currentBlock, Paragraph paragraph, int currentTableLevel,
-            int beginMark, int separatorMark, int endMark )
-    {
-
-        Pattern hyperlinkPattern = Pattern
-                .compile( "[ \\t\\r\\n]*HYPERLINK \"(.*)\"[ \\t\\r\\n]*" );
-        Pattern pagerefPattern = Pattern
-                .compile( "[ \\t\\r\\n]*PAGEREF ([^ ]*)[ \\t\\r\\n]*\\\\h[ \\t\\r\\n]*" );
-
-        if ( separatorMark - beginMark > 1 )
+        if ( characterRun.isItalic() != blockProperies.pItalic )
         {
-            CharacterRun firstAfterBegin = paragraph
-                    .getCharacterRun( beginMark + 1 );
-
-            final Matcher hyperlinkMatcher = hyperlinkPattern
-                    .matcher( firstAfterBegin.text() );
-            if ( hyperlinkMatcher.matches() )
-            {
-                String hyperlink = hyperlinkMatcher.group( 1 );
-                processHyperlink( hwpfDocument, currentBlock, paragraph,
-                        currentTableLevel, hyperlink, separatorMark + 1,
-                        endMark );
-                return;
-            }
-
-            final Matcher pagerefMatcher = pagerefPattern
-                    .matcher( firstAfterBegin.text() );
-            if ( pagerefMatcher.matches() )
-            {
-                String pageref = pagerefMatcher.group( 1 );
-                processPageref( hwpfDocument, currentBlock, paragraph,
-                        currentTableLevel, pageref, separatorMark + 1, endMark );
-                return;
-            }
+            WordToFoUtils.setItalic( inline, characterRun.isItalic() );
         }
-
-        StringBuilder debug = new StringBuilder( "Unsupported field type: \n" );
-        for ( int i = beginMark; i <= endMark; i++ )
+        if ( characterRun.getFontName() != null
+                && !AbstractWordUtils.equals( characterRun.getFontName(),
+                        blockProperies.pFontName ) )
         {
-            debug.append( "\t" );
-            debug.append( paragraph.getCharacterRun( i ) );
-            debug.append( "\n" );
+            WordToFoUtils.setFontFamily( inline, characterRun.getFontName() );
         }
-        logger.log( POILogger.WARN, debug );
-
-        // just output field value
-        if ( separatorMark + 1 < endMark )
-            processCharacters( hwpfDocument, currentTableLevel, paragraph,
-                    currentBlock, separatorMark + 1, endMark );
+        if ( characterRun.getFontSize() / 2 != blockProperies.pFontSize )
+        {
+            WordToFoUtils.setFontSize( inline, characterRun.getFontSize() / 2 );
+        }
+        WordToFoUtils.setCharactersProperties( characterRun, inline );
+        block.appendChild( inline );
 
-        return;
+        Text textNode = foDocumentFacade.createText( text );
+        inline.appendChild( textNode );
     }
 
+    /**
+     * Emits an external link for a hyperlink target: a basic-link element
+     * is obtained from the FO facade for <tt>hyperlink</tt> and appended to
+     * <tt>currentBlock</tt>. When the run range is non-empty, the runs in
+     * <tt>[beginTextInclusive, endTextExclusive)</tt> are passed to
+     * <tt>processCharacters</tt> with the link element as their output
+     * target.
+     *
+     * @param hyperlink
+     *            value used as the link's external destination
+     * @param beginTextInclusive
+     *            index of the first character run to output inside the link
+     * @param endTextExclusive
+     *            index just past the last character run to output
+     */
-    protected void processHyperlink( HWPFDocument hwpfDocument,
-            Element currentBlock, Paragraph paragraph, int currentTableLevel,
+    protected void processHyperlink( HWPFDocumentCore hwpfDocument,
+            Element currentBlock, Paragraph paragraph,
+            List<CharacterRun> characterRuns, int currentTableLevel,
             String hyperlink, int beginTextInclusive, int endTextExclusive )
     {
-        Element basicLink = createBasicLinkExternal( hyperlink );
+        Element basicLink = foDocumentFacade
+                .createBasicLinkExternal( hyperlink );
         currentBlock.appendChild( basicLink );
 
         if ( beginTextInclusive < endTextExclusive )
             processCharacters( hwpfDocument, currentTableLevel, paragraph,
-                    basicLink, beginTextInclusive, endTextExclusive );
+                    basicLink, characterRuns, beginTextInclusive,
+                    endTextExclusive );
     }
 
     /**
@@ -422,27 +325,30 @@ public class WordToFoExtractor extends AbstractToFoExtractor
             Picture picture )
     {
         // no default implementation -- skip
-        currentBlock.appendChild( document.createComment( "Image link to '"
-                + picture.suggestFullFileName() + "' can be here" ) );
+        currentBlock.appendChild( foDocumentFacade.getDocument().createComment(
+                "Image link to '" + picture.suggestFullFileName()
+                        + "' can be here" ) );
     }
 
+    /**
+     * Emits an internal link for a page reference: a basic-link element is
+     * obtained from the FO facade for <tt>pageref</tt> and appended to
+     * <tt>currentBlock</tt>. When the run range is non-empty, the runs in
+     * <tt>[beginTextInclusive, endTextExclusive)</tt> are passed to
+     * <tt>processCharacters</tt> with the link element as their output
+     * target.
+     *
+     * @param pageref
+     *            value used as the link's internal destination
+     * @param beginTextInclusive
+     *            index of the first character run to output inside the link
+     * @param endTextExclusive
+     *            index just past the last character run to output
+     */
-    protected void processPageref( HWPFDocument hwpfDocument,
-            Element currentBlock, Paragraph paragraph, int currentTableLevel,
+    protected void processPageref( HWPFDocumentCore hwpfDocument,
+            Element currentBlock, Paragraph paragraph,
+            List<CharacterRun> characterRuns, int currentTableLevel,
             String pageref, int beginTextInclusive, int endTextExclusive )
     {
-        Element basicLink = createBasicLinkInternal( pageref );
+        Element basicLink = foDocumentFacade.createBasicLinkInternal( pageref );
         currentBlock.appendChild( basicLink );
 
         if ( beginTextInclusive < endTextExclusive )
             processCharacters( hwpfDocument, currentTableLevel, paragraph,
-                    basicLink, beginTextInclusive, endTextExclusive );
+                    basicLink, characterRuns, beginTextInclusive,
+                    endTextExclusive );
     }
 
-    protected void processParagraph( HWPFDocument hwpfDocument,
+    protected void processParagraph( HWPFDocumentCore hwpfDocument,
             Element parentFopElement, int currentTableLevel,
             Paragraph paragraph, String bulletText )
     {
-        final Element block = createBlock();
+        final Element block = foDocumentFacade.createBlock();
         parentFopElement.appendChild( block );
 
         WordToFoUtils.setParagraphProperties( paragraph, block );
@@ -480,21 +386,23 @@ public class WordToFoExtractor extends AbstractToFoExtractor
 
             if ( WordToFoUtils.isNotEmpty( bulletText ) )
             {
-                Element inline = createInline();
+                Element inline = foDocumentFacade.createInline();
                 block.appendChild( inline );
 
-                Text textNode = createText( bulletText );
+                Text textNode = foDocumentFacade.createText( bulletText );
                 inline.appendChild( textNode );
 
                 haveAnyText |= bulletText.trim().length() != 0;
             }
 
+            List<CharacterRun> characterRuns = WordToFoUtils
+                    .findCharacterRuns( paragraph );
             haveAnyText = processCharacters( hwpfDocument, currentTableLevel,
-                    paragraph, block, 0, charRuns );
+                    paragraph, block, characterRuns, 0, characterRuns.size() );
 
             if ( !haveAnyText )
             {
-                Element leader = createLeader();
+                Element leader = foDocumentFacade.createLeader();
                 block.appendChild( leader );
             }
         }
@@ -506,20 +414,21 @@ public class WordToFoExtractor extends AbstractToFoExtractor
         return;
     }
 
+    /**
+     * Outputs one section: creates a page master named from the prefix
+     * <tt>"page"</tt> and <tt>sectionCounter</tt>, adds a page sequence
+     * referencing it together with an <tt>xsl-region-body</tt> flow, and
+     * then processes the section's paragraphs into that flow at table
+     * level 0.
+     *
+     * @param section
+     *            section whose properties and paragraphs are processed
+     * @param sectionCounter
+     *            number used to build the page master name
+     */
-    protected void processSection( HWPFDocument hwpfDocument, Section section,
-            int sectionCounter )
+    protected void processSection( HWPFDocumentCore wordDocument,
+            Section section, int sectionCounter )
     {
         String regularPage = createPageMaster(
                 WordToFoUtils.getSectionProperties( section ), "page",
                 sectionCounter );
 
-        Element pageSequence = addPageSequence( regularPage );
-        Element flow = addFlowToPageSequence( pageSequence, "xsl-region-body" );
+        Element pageSequence = foDocumentFacade.addPageSequence( regularPage );
+        Element flow = foDocumentFacade.addFlowToPageSequence( pageSequence,
+                "xsl-region-body" );
 
-        processSectionParagraphes( hwpfDocument, flow, section, 0 );
+        processSectionParagraphes( wordDocument, flow, section, 0 );
     }
 
-    protected void processSectionParagraphes( HWPFDocument hwpfDocument,
+    protected void processSectionParagraphes( HWPFDocument wordDocument,
             Element flow, Range range, int currentTableLevel )
     {
         final Map<Integer, Table> allTables = new HashMap<Integer, Table>();
@@ -530,7 +439,7 @@ public class WordToFoExtractor extends AbstractToFoExtractor
             allTables.put( Integer.valueOf( next.getStartOffset() ), next );
         }
 
-        final ListTables listTables = hwpfDocument.getListTables();
+        final ListTables listTables = wordDocument.getListTables();
         int currentListInfo = 0;
 
         final int paragraphs = range.numParagraphs();
@@ -543,7 +452,7 @@ public class WordToFoExtractor extends AbstractToFoExtractor
             {
                 Table table = allTables.get( Integer.valueOf( paragraph
                         .getStartOffset() ) );
-                processTable( hwpfDocument, flow, table, currentTableLevel + 1 );
+                processTable( wordDocument, flow, table, currentTableLevel + 1 );
                 continue;
             }
 
@@ -568,7 +477,7 @@ public class WordToFoExtractor extends AbstractToFoExtractor
                     String label = WordToFoUtils.getBulletText( listTables,
                             paragraph, listFormatOverride.getLsid() );
 
-                    processParagraph( hwpfDocument, flow, currentTableLevel,
+                    processParagraph( wordDocument, flow, currentTableLevel,
                             paragraph, label );
                 }
                 else
@@ -580,24 +489,24 @@ public class WordToFoExtractor extends AbstractToFoExtractor
                                     + currentListInfo
                                     + ", but listTables not defined in file" );
 
-                    processParagraph( hwpfDocument, flow, currentTableLevel,
+                    processParagraph( wordDocument, flow, currentTableLevel,
                             paragraph, WordToFoUtils.EMPTY );
                 }
             }
             else
             {
-                processParagraph( hwpfDocument, flow, currentTableLevel,
+                processParagraph( wordDocument, flow, currentTableLevel,
                         paragraph, WordToFoUtils.EMPTY );
             }
         }
 
     }
 
-    protected void processTable( HWPFDocument hwpfDocument, Element flow,
+    protected void processTable( HWPFDocumentCore wordDocument, Element flow,
             Table table, int thisTableLevel )
     {
-        Element tableHeader = createTableHeader();
-        Element tableBody = createTableBody();
+        Element tableHeader = foDocumentFacade.createTableHeader();
+        Element tableBody = foDocumentFacade.createTableBody();
 
         final int tableRows = table.numRows();
 
@@ -611,7 +520,7 @@ public class WordToFoExtractor extends AbstractToFoExtractor
         {
             TableRow tableRow = table.getRow( r );
 
-            Element tableRowElement = createTableRow();
+            Element tableRowElement = foDocumentFacade.createTableRow();
             WordToFoUtils.setTableRowProperties( tableRow, tableRowElement );
 
             final int rowCells = tableRow.numCells();
@@ -626,7 +535,7 @@ public class WordToFoExtractor extends AbstractToFoExtractor
                         && !tableCell.isFirstVerticallyMerged() )
                     continue;
 
-                Element tableCellElement = createTableCell();
+                Element tableCellElement = foDocumentFacade.createTableCell();
                 WordToFoUtils.setTableCellProperties( tableRow, tableCell,
                         tableCellElement, r == 0, r == tableRows - 1, c == 0,
                         c == rowCells - 1 );
@@ -649,9 +558,9 @@ public class WordToFoExtractor extends AbstractToFoExtractor
                 {
                     if ( c == rowCells - 1 && c != maxColumns - 1 )
                     {
-                        tableCellElement
-                                .setAttribute( "number-columns-spanned", ""
-                                        + (maxColumns - c) );
+                        tableCellElement.setAttribute(
+                                "number-columns-spanned", ""
+                                        + ( maxColumns - c ) );
                     }
                 }
 
@@ -673,12 +582,13 @@ public class WordToFoExtractor extends AbstractToFoExtractor
                             + count );
                 }
 
-                processSectionParagraphes( hwpfDocument, tableCellElement,
+                processSectionParagraphes( wordDocument, tableCellElement,
                         tableCell, thisTableLevel );
 
                 if ( !tableCellElement.hasChildNodes() )
                 {
-                    tableCellElement.appendChild( createBlock() );
+                    tableCellElement.appendChild( foDocumentFacade
+                            .createBlock() );
                 }
 
                 tableRowElement.appendChild( tableCellElement );
@@ -694,7 +604,7 @@ public class WordToFoExtractor extends AbstractToFoExtractor
             }
         }
 
-        final Element tableElement = createTable();
+        final Element tableElement = foDocumentFacade.createTable();
         if ( tableHeader.hasChildNodes() )
         {
             tableElement.appendChild( tableHeader );
@@ -714,51 +624,4 @@ public class WordToFoExtractor extends AbstractToFoExtractor
         }
     }
 
-    protected int tryField( HWPFDocument hwpfDocument, Paragraph paragraph,
-            int currentTableLevel, int beginMark, Element currentBlock )
-    {
-        int separatorMark = -1;
-        int endMark = -1;
-        for ( int c = beginMark + 1; c < paragraph.numCharacterRuns(); c++ )
-        {
-            CharacterRun characterRun = paragraph.getCharacterRun( c );
-
-            String text = characterRun.text();
-            if ( text.getBytes().length == 0 )
-                continue;
-
-            if ( text.getBytes()[0] == FIELD_SEPARATOR_MARK )
-            {
-                if ( separatorMark != -1 )
-                {
-                    // double;
-                    return beginMark;
-                }
-
-                separatorMark = c;
-                continue;
-            }
-
-            if ( text.getBytes()[0] == FIELD_END_MARK )
-            {
-                if ( endMark != -1 )
-                {
-                    // double;
-                    return beginMark;
-                }
-
-                endMark = c;
-                break;
-            }
-
-        }
-
-        if ( separatorMark == -1 || endMark == -1 )
-            return beginMark;
-
-        processField( hwpfDocument, currentBlock, paragraph, currentTableLevel,
-                beginMark, separatorMark, endMark );
-
-        return endMark;
-    }
 }
index 5acd71113851bfb1a30063797d0a0514519ffb0e..1b3447f006f4209b6574d1279d48fd02fb81fdff 100644 (file)
+/* ====================================================================
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+==================================================================== */
 package org.apache.poi.hwpf.extractor;
 
-import java.lang.reflect.Constructor;
-import java.lang.reflect.Field;
-
-import org.apache.poi.hwpf.model.ListLevel;
-import org.apache.poi.hwpf.model.ListTables;
 import org.apache.poi.hwpf.usermodel.BorderCode;
 import org.apache.poi.hwpf.usermodel.CharacterProperties;
 import org.apache.poi.hwpf.usermodel.CharacterRun;
 import org.apache.poi.hwpf.usermodel.Paragraph;
 import org.apache.poi.hwpf.usermodel.Picture;
-import org.apache.poi.hwpf.usermodel.Range;
-import org.apache.poi.hwpf.usermodel.Section;
-import org.apache.poi.hwpf.usermodel.SectionProperties;
 import org.apache.poi.hwpf.usermodel.TableCell;
-import org.apache.poi.hwpf.usermodel.TableIterator;
 import org.apache.poi.hwpf.usermodel.TableRow;
 import org.w3c.dom.Element;
 
-public class WordToFoUtils {
-    static final String EMPTY = "";
-
-    public static final float TWIPS_PER_INCH = 1440.0f;
-
-    public static final int TWIPS_PER_PT = 20;
-
-    static boolean equals(String str1, String str2) {
-       return str1 == null ? str2 == null : str1.equals(str2);
-    }
-
-    public static String getBorderType(BorderCode borderCode) {
-       if (borderCode == null)
-           throw new IllegalArgumentException("borderCode is null");
-
-       switch (borderCode.getBorderType()) {
-       case 1:
-       case 2:
-           return "solid";
-       case 3:
-           return "double";
-       case 5:
-           return "solid";
-       case 6:
-           return "dotted";
-       case 7:
-       case 8:
-           return "dashed";
-       case 9:
-           return "dotted";
-       case 10:
-       case 11:
-       case 12:
-       case 13:
-       case 14:
-       case 15:
-       case 16:
-       case 17:
-       case 18:
-       case 19:
-           return "double";
-       case 20:
-           return "solid";
-       case 21:
-           return "double";
-       case 22:
-           return "dashed";
-       case 23:
-           return "dashed";
-       case 24:
-           return "ridge";
-       case 25:
-           return "grooved";
-       default:
-           return "solid";
-       }
-    }
-
-    public static String getBorderWidth(BorderCode borderCode) {
-       int lineWidth = borderCode.getLineWidth();
-       int pt = lineWidth / 8;
-       int pte = lineWidth - pt * 8;
-
-       StringBuilder stringBuilder = new StringBuilder();
-       stringBuilder.append(pt);
-       stringBuilder.append(".");
-       stringBuilder.append(1000 / 8 * pte);
-       stringBuilder.append("pt");
-       return stringBuilder.toString();
-    }
-
-    public static String getBulletText(ListTables listTables,
-           Paragraph paragraph, int listId) {
-       final ListLevel listLevel = listTables.getLevel(listId,
-               paragraph.getIlvl());
-
-       if (listLevel.getNumberText() == null)
-           return EMPTY;
-
-       StringBuffer bulletBuffer = new StringBuffer();
-       char[] xst = listLevel.getNumberText().toCharArray();
-       for (char element : xst) {
-           if (element < 9) {
-               ListLevel numLevel = listTables.getLevel(listId, element);
-
-               int num = numLevel.getStartAt();
-               bulletBuffer.append(NumberFormatter.getNumber(num,
-                       listLevel.getNumberFormat()));
-
-               if (numLevel == listLevel) {
-                   numLevel.setStartAt(numLevel.getStartAt() + 1);
-               }
-
-           } else {
-               bulletBuffer.append(element);
-           }
-       }
-
-       byte follow = getIxchFollow(listLevel);
-       switch (follow) {
-       case 0:
-           bulletBuffer.append("\t");
-           break;
-       case 1:
-           bulletBuffer.append(" ");
-           break;
-       default:
-           break;
-       }
-
-       return bulletBuffer.toString();
-    }
-
-    public static String getColor(int ico) {
-       switch (ico) {
-       case 1:
-           return "black";
-       case 2:
-           return "blue";
-       case 3:
-           return "cyan";
-       case 4:
-           return "green";
-       case 5:
-           return "magenta";
-       case 6:
-           return "red";
-       case 7:
-           return "yellow";
-       case 8:
-           return "white";
-       case 9:
-           return "darkblue";
-       case 10:
-           return "darkcyan";
-       case 11:
-           return "darkgreen";
-       case 12:
-           return "darkmagenta";
-       case 13:
-           return "darkred";
-       case 14:
-           return "darkyellow";
-       case 15:
-           return "darkgray";
-       case 16:
-           return "lightgray";
-       default:
-           return "black";
-       }
+public class WordToFoUtils extends AbstractWordUtils
+{
+    public static void setBold( final Element element, final boolean bold )
+    {
+        element.setAttribute( "font-weight", bold ? "bold" : "normal" );
     }
 
-    public static byte getIxchFollow(ListLevel listLevel) {
-       try {
-           Field field = ListLevel.class.getDeclaredField("_ixchFollow");
-           field.setAccessible(true);
-           return ((Byte) field.get(listLevel)).byteValue();
-       } catch (Exception exc) {
-           throw new Error(exc);
-       }
-    }
-
-    public static String getJustification(int js) {
-        switch (js) {
-        case 0:
-            return "start";
-        case 1:
-            return "center";
-        case 2:
-            return "end";
-        case 3:
-        case 4:
-            return "justify";
-        case 5:
-            return "center";
-        case 6:
-            return "left";
-        case 7:
-            return "start";
-        case 8:
-            return "end";
-        case 9:
-            return "justify";
+    public static void setBorder( Element element, BorderCode borderCode,
+            String where )
+    {
+        if ( element == null )
+            throw new IllegalArgumentException( "element is null" );
+
+        if ( borderCode == null || borderCode.getBorderType() == 0 )
+            return;
+
+        if ( isEmpty( where ) )
+        {
+            element.setAttribute( "border-style", getBorderType( borderCode ) );
+            element.setAttribute( "border-color",
+                    getColor( borderCode.getColor() ) );
+            element.setAttribute( "border-width", getBorderWidth( borderCode ) );
+        }
+        else
+        {
+            element.setAttribute( "border-" + where + "-style",
+                    getBorderType( borderCode ) );
+            element.setAttribute( "border-" + where + "-color",
+                    getColor( borderCode.getColor() ) );
+            element.setAttribute( "border-" + where + "-width",
+                    getBorderWidth( borderCode ) );
         }
-        return "";
-    }
-
-    public static String getListItemNumberLabel(int number, int format) {
-
-       if (format != 0)
-           System.err.println("NYI: toListItemNumberLabel(): " + format);
-
-       return String.valueOf(number);
-    }
-
-    public static SectionProperties getSectionProperties(Section section) {
-       try {
-           Field field = Section.class.getDeclaredField("_props");
-           field.setAccessible(true);
-           return (SectionProperties) field.get(section);
-       } catch (Exception exc) {
-           throw new Error(exc);
-       }
-    }
-
-    static boolean isEmpty(String str) {
-       return str == null || str.length() == 0;
-    }
-
-    static boolean isNotEmpty(String str) {
-       return !isEmpty(str);
-    }
-
-    public static TableIterator newTableIterator(Range range, int level) {
-       try {
-           Constructor<TableIterator> constructor = TableIterator.class
-                   .getDeclaredConstructor(Range.class, int.class);
-           constructor.setAccessible(true);
-           return constructor.newInstance(range, Integer.valueOf(level));
-       } catch (Exception exc) {
-           throw new Error(exc);
-       }
-    }
-
-    public static void setBold(final Element element, final boolean bold) {
-       element.setAttribute("font-weight", bold ? "bold" : "normal");
-    }
-
-    public static void setBorder(Element element, BorderCode borderCode,
-           String where) {
-       if (element == null)
-           throw new IllegalArgumentException("element is null");
-
-       if (borderCode == null)
-           return;
-
-       if (isEmpty(where)) {
-           element.setAttribute("border-style", getBorderType(borderCode));
-           element.setAttribute("border-color",
-                   getColor(borderCode.getColor()));
-           element.setAttribute("border-width", getBorderWidth(borderCode));
-       } else {
-           element.setAttribute("border-" + where + "-style",
-                   getBorderType(borderCode));
-           element.setAttribute("border-" + where + "-color",
-                   getColor(borderCode.getColor()));
-           element.setAttribute("border-" + where + "-width",
-                   getBorderWidth(borderCode));
-       }
     }
 
-    public static void setCharactersProperties(final CharacterRun characterRun,
-            final Element inline) {
+    public static void setCharactersProperties(
+            final CharacterRun characterRun, final Element inline )
+    {
         final CharacterProperties clonedProperties = characterRun
                 .cloneProperties();
         StringBuilder textDecorations = new StringBuilder();
 
-        setBorder(inline, clonedProperties.getBrc(), EMPTY);
+        setBorder( inline, clonedProperties.getBrc(), EMPTY );
 
-        if (characterRun.isCapitalized()) {
-            inline.setAttribute("text-transform", "uppercase");
+        if ( characterRun.isCapitalized() )
+        {
+            inline.setAttribute( "text-transform", "uppercase" );
         }
-        if (characterRun.isHighlighted()) {
-            inline.setAttribute("background-color",
-                    getColor(clonedProperties.getIcoHighlight()));
+        if ( characterRun.isHighlighted() )
+        {
+            inline.setAttribute( "background-color",
+                    getColor( clonedProperties.getIcoHighlight() ) );
         }
-        if (characterRun.isStrikeThrough()) {
-            if (textDecorations.length() > 0)
-                textDecorations.append(" ");
-            textDecorations.append("line-through");
+        if ( characterRun.isStrikeThrough() )
+        {
+            if ( textDecorations.length() > 0 )
+                textDecorations.append( " " );
+            textDecorations.append( "line-through" );
         }
-        if (characterRun.isShadowed()) {
-            inline.setAttribute("text-shadow", characterRun.getFontSize() / 24
-                    + "pt");
+        if ( characterRun.isShadowed() )
+        {
+            inline.setAttribute( "text-shadow", characterRun.getFontSize() / 24
+                    + "pt" );
         }
-        if (characterRun.isSmallCaps()) {
-            inline.setAttribute("font-variant", "small-caps");
+        if ( characterRun.isSmallCaps() )
+        {
+            inline.setAttribute( "font-variant", "small-caps" );
         }
-        if (characterRun.getSubSuperScriptIndex() == 1) {
-            inline.setAttribute("baseline-shift", "super");
-            inline.setAttribute("font-size", "smaller");
+        if ( characterRun.getSubSuperScriptIndex() == 1 )
+        {
+            inline.setAttribute( "baseline-shift", "super" );
+            inline.setAttribute( "font-size", "smaller" );
         }
-        if (characterRun.getSubSuperScriptIndex() == 2) {
-            inline.setAttribute("baseline-shift", "sub");
-            inline.setAttribute("font-size", "smaller");
+        if ( characterRun.getSubSuperScriptIndex() == 2 )
+        {
+            inline.setAttribute( "baseline-shift", "sub" );
+            inline.setAttribute( "font-size", "smaller" );
         }
-        if (characterRun.getUnderlineCode() > 0) {
-            if (textDecorations.length() > 0)
-                textDecorations.append(" ");
-            textDecorations.append("underline");
+        if ( characterRun.getUnderlineCode() > 0 )
+        {
+            if ( textDecorations.length() > 0 )
+                textDecorations.append( " " );
+            textDecorations.append( "underline" );
         }
-        if (characterRun.isVanished()) {
-            inline.setAttribute("visibility", "hidden");
+        if ( characterRun.isVanished() )
+        {
+            inline.setAttribute( "visibility", "hidden" );
         }
-        if (textDecorations.length() > 0) {
-            inline.setAttribute("text-decoration", textDecorations.toString());
+        if ( textDecorations.length() > 0 )
+        {
+            inline.setAttribute( "text-decoration", textDecorations.toString() );
         }
     }
 
-    public static void setFontFamily(final Element element,
-           final String fontFamily) {
-       element.setAttribute("font-family", fontFamily);
+    public static void setFontFamily( final Element element,
+            final String fontFamily )
+    {
+        if ( isEmpty( fontFamily ) )
+            return;
+
+        element.setAttribute( "font-family", fontFamily );
     }
 
-    public static void setFontSize(final Element element, final int fontSize) {
-       element.setAttribute("font-size", String.valueOf(fontSize));
+    public static void setFontSize( final Element element, final int fontSize )
+    {
+        element.setAttribute( "font-size", String.valueOf( fontSize ) );
     }
 
-    public static void setIndent(Paragraph paragraph, Element block) {
-       if (paragraph.getFirstLineIndent() != 0) {
-           block.setAttribute(
-                   "text-indent",
-                   String.valueOf(paragraph.getFirstLineIndent()
-                           / TWIPS_PER_PT)
-                           + "pt");
-       }
-       if (paragraph.getIndentFromLeft() != 0) {
-           block.setAttribute(
-                   "start-indent",
-                   String.valueOf(paragraph.getIndentFromLeft() / TWIPS_PER_PT)
-                           + "pt");
-       }
-       if (paragraph.getIndentFromRight() != 0) {
-           block.setAttribute(
-                   "end-indent",
-                   String.valueOf(paragraph.getIndentFromRight()
-                           / TWIPS_PER_PT)
-                           + "pt");
-       }
-       if (paragraph.getSpacingBefore() != 0) {
-           block.setAttribute("space-before",
-                   String.valueOf(paragraph.getSpacingBefore() / TWIPS_PER_PT)
-                           + "pt");
-       }
-       if (paragraph.getSpacingAfter() != 0) {
-           block.setAttribute("space-after",
-                   String.valueOf(paragraph.getSpacingAfter() / TWIPS_PER_PT)
-                           + "pt");
-       }
+    public static void setIndent( Paragraph paragraph, Element block )
+    {
+        if ( paragraph.getFirstLineIndent() != 0 )
+        {
+            block.setAttribute(
+                    "text-indent",
+                    String.valueOf( paragraph.getFirstLineIndent()
+                            / TWIPS_PER_PT )
+                            + "pt" );
+        }
+        if ( paragraph.getIndentFromLeft() != 0 )
+        {
+            block.setAttribute(
+                    "start-indent",
+                    String.valueOf( paragraph.getIndentFromLeft()
+                            / TWIPS_PER_PT )
+                            + "pt" );
+        }
+        if ( paragraph.getIndentFromRight() != 0 )
+        {
+            block.setAttribute(
+                    "end-indent",
+                    String.valueOf( paragraph.getIndentFromRight()
+                            / TWIPS_PER_PT )
+                            + "pt" );
+        }
+        if ( paragraph.getSpacingBefore() != 0 )
+        {
+            block.setAttribute(
+                    "space-before",
+                    String.valueOf( paragraph.getSpacingBefore() / TWIPS_PER_PT )
+                            + "pt" );
+        }
+        if ( paragraph.getSpacingAfter() != 0 )
+        {
+            block.setAttribute( "space-after",
+                    String.valueOf( paragraph.getSpacingAfter() / TWIPS_PER_PT )
+                            + "pt" );
+        }
     }
 
-    public static void setItalic(final Element element, final boolean italic) {
-       element.setAttribute("font-style", italic ? "italic" : "normal");
+    public static void setItalic( final Element element, final boolean italic )
+    {
+        element.setAttribute( "font-style", italic ? "italic" : "normal" );
     }
 
-    public static void setJustification(Paragraph paragraph,
-            final Element element) {
-        String justification = getJustification(paragraph.getJustification());
-        if (isNotEmpty(justification))
-            element.setAttribute("text-align", justification);
+    public static void setJustification( Paragraph paragraph,
+            final Element element )
+    {
+        String justification = getJustification( paragraph.getJustification() );
+        if ( isNotEmpty( justification ) )
+            element.setAttribute( "text-align", justification );
     }
 
-    public static void setParagraphProperties(Paragraph paragraph, Element block) {
-       setIndent(paragraph, block);
-       setJustification(paragraph, block);
+    public static void setParagraphProperties( Paragraph paragraph,
+            Element block )
+    {
+        setIndent( paragraph, block );
+        setJustification( paragraph, block );
 
-       setBorder(block, paragraph.getBottomBorder(), "bottom");
-       setBorder(block, paragraph.getLeftBorder(), "left");
-       setBorder(block, paragraph.getRightBorder(), "right");
-       setBorder(block, paragraph.getTopBorder(), "top");
+        setBorder( block, paragraph.getBottomBorder(), "bottom" );
+        setBorder( block, paragraph.getLeftBorder(), "left" );
+        setBorder( block, paragraph.getRightBorder(), "right" );
+        setBorder( block, paragraph.getTopBorder(), "top" );
 
-       if (paragraph.pageBreakBefore()) {
-           block.setAttribute("break-before", "page");
-       }
+        if ( paragraph.pageBreakBefore() )
+        {
+            block.setAttribute( "break-before", "page" );
+        }
 
-       block.setAttribute("hyphenate",
-               String.valueOf(paragraph.isAutoHyphenated()));
+        block.setAttribute( "hyphenate",
+                String.valueOf( paragraph.isAutoHyphenated() ) );
 
-       if (paragraph.keepOnPage()) {
-           block.setAttribute("keep-together.within-page", "always");
-       }
+        if ( paragraph.keepOnPage() )
+        {
+            block.setAttribute( "keep-together.within-page", "always" );
+        }
 
-       if (paragraph.keepWithNext()) {
-           block.setAttribute("keep-with-next.within-page", "always");
-       }
+        if ( paragraph.keepWithNext() )
+        {
+            block.setAttribute( "keep-with-next.within-page", "always" );
+        }
 
-       block.setAttribute("linefeed-treatment", "preserve");
-       block.setAttribute("white-space-collapse", "false");
+        block.setAttribute( "linefeed-treatment", "preserve" );
+        block.setAttribute( "white-space-collapse", "false" );
     }
 
-    public static void setPictureProperties(Picture picture,
-            Element graphicElement) {
+    public static void setPictureProperties( Picture picture,
+            Element graphicElement )
+    {
         final int aspectRatioX = picture.getAspectRatioX();
         final int aspectRatioY = picture.getAspectRatioY();
 
-        if (aspectRatioX > 0) {
-            graphicElement.setAttribute("content-width", ((picture.getDxaGoal()
-                    * aspectRatioX / 100) / WordToFoUtils.TWIPS_PER_PT)
-                    + "pt");
-        } else
-            graphicElement.setAttribute("content-width",
-                    (picture.getDxaGoal() / WordToFoUtils.TWIPS_PER_PT) + "pt");
+        if ( aspectRatioX > 0 )
+        {
+            graphicElement
+                    .setAttribute( "content-width", ( ( picture.getDxaGoal()
+                            * aspectRatioX / 100 ) / TWIPS_PER_PT )
+                            + "pt" );
+        }
+        else
+            graphicElement.setAttribute( "content-width",
+                    ( picture.getDxaGoal() / TWIPS_PER_PT ) + "pt" );
 
-        if (aspectRatioY > 0)
+        if ( aspectRatioY > 0 )
             graphicElement
-                    .setAttribute("content-height", ((picture.getDyaGoal()
-                            * aspectRatioY / 100) / WordToFoUtils.TWIPS_PER_PT)
-                            + "pt");
+                    .setAttribute( "content-height", ( ( picture.getDyaGoal()
+                            * aspectRatioY / 100 ) / TWIPS_PER_PT )
+                            + "pt" );
         else
-            graphicElement.setAttribute("content-height",
-                    (picture.getDyaGoal() / WordToFoUtils.TWIPS_PER_PT) + "pt");
+            graphicElement.setAttribute( "content-height",
+                    ( picture.getDyaGoal() / TWIPS_PER_PT ) + "pt" );
 
-        if (aspectRatioX <= 0 || aspectRatioY <= 0) {
-            graphicElement.setAttribute("scaling", "uniform");
-        } else {
-            graphicElement.setAttribute("scaling", "non-uniform");
+        if ( aspectRatioX <= 0 || aspectRatioY <= 0 )
+        {
+            graphicElement.setAttribute( "scaling", "uniform" );
+        }
+        else
+        {
+            graphicElement.setAttribute( "scaling", "non-uniform" );
         }
 
-        graphicElement.setAttribute("vertical-align", "text-bottom");
+        graphicElement.setAttribute( "vertical-align", "text-bottom" );
 
-        if (picture.getDyaCropTop() != 0 || picture.getDxaCropRight() != 0
+        if ( picture.getDyaCropTop() != 0 || picture.getDxaCropRight() != 0
                 || picture.getDyaCropBottom() != 0
-                || picture.getDxaCropLeft() != 0) {
-            int rectTop = picture.getDyaCropTop() / WordToFoUtils.TWIPS_PER_PT;
-            int rectRight = picture.getDxaCropRight()
-                    / WordToFoUtils.TWIPS_PER_PT;
-            int rectBottom = picture.getDyaCropBottom()
-                    / WordToFoUtils.TWIPS_PER_PT;
-            int rectLeft = picture.getDxaCropLeft()
-                    / WordToFoUtils.TWIPS_PER_PT;
-            graphicElement.setAttribute("clip", "rect(" + rectTop + "pt, "
+                || picture.getDxaCropLeft() != 0 )
+        {
+            int rectTop = picture.getDyaCropTop() / TWIPS_PER_PT;
+            int rectRight = picture.getDxaCropRight() / TWIPS_PER_PT;
+            int rectBottom = picture.getDyaCropBottom() / TWIPS_PER_PT;
+            int rectLeft = picture.getDxaCropLeft() / TWIPS_PER_PT;
+            graphicElement.setAttribute( "clip", "rect(" + rectTop + "pt, "
                     + rectRight + "pt, " + rectBottom + "pt, " + rectLeft
-                    + "pt)");
-            graphicElement.setAttribute("oveerflow", "hidden");
+                    + "pt)" );
+            graphicElement.setAttribute( "oveerflow", "hidden" );
         }
     }
 
-    public static void setTableCellProperties(TableRow tableRow,
-           TableCell tableCell, Element element, boolean toppest,
-           boolean bottomest, boolean leftest, boolean rightest) {
-       element.setAttribute("width", (tableCell.getWidth() / TWIPS_PER_INCH)
-               + "in");
-       element.setAttribute("padding-start",
-               (tableRow.getGapHalf() / TWIPS_PER_INCH) + "in");
-       element.setAttribute("padding-end",
-               (tableRow.getGapHalf() / TWIPS_PER_INCH) + "in");
-
-       BorderCode top = tableCell.getBrcTop() != null ? tableCell.getBrcTop()
-               : toppest ? tableRow.getTopBorder() : tableRow
-                       .getHorizontalBorder();
-       BorderCode bottom = tableCell.getBrcBottom() != null ? tableCell
-               .getBrcBottom() : bottomest ? tableRow.getBottomBorder()
-               : tableRow.getHorizontalBorder();
-
-       BorderCode left = tableCell.getBrcLeft() != null ? tableCell
-               .getBrcLeft() : leftest ? tableRow.getLeftBorder() : tableRow
-               .getVerticalBorder();
-       BorderCode right = tableCell.getBrcRight() != null ? tableCell
-               .getBrcRight() : rightest ? tableRow.getRightBorder()
-               : tableRow.getVerticalBorder();
-
-       setBorder(element, bottom, "bottom");
-       setBorder(element, left, "left");
-       setBorder(element, right, "right");
-       setBorder(element, top, "top");
+    public static void setTableCellProperties( TableRow tableRow,
+            TableCell tableCell, Element element, boolean toppest,
+            boolean bottomest, boolean leftest, boolean rightest )
+    {
+        element.setAttribute( "width", ( tableCell.getWidth() / TWIPS_PER_INCH )
+                + "in" );
+        element.setAttribute( "padding-start",
+                ( tableRow.getGapHalf() / TWIPS_PER_INCH ) + "in" );
+        element.setAttribute( "padding-end",
+                ( tableRow.getGapHalf() / TWIPS_PER_INCH ) + "in" );
+
+        BorderCode top = tableCell.getBrcTop() != null
+                && tableCell.getBrcTop().getBorderType() != 0 ? tableCell
+                .getBrcTop() : toppest ? tableRow.getTopBorder() : tableRow
+                .getHorizontalBorder();
+        BorderCode bottom = tableCell.getBrcBottom() != null
+                && tableCell.getBrcBottom().getBorderType() != 0 ? tableCell
+                .getBrcBottom() : bottomest ? tableRow.getBottomBorder()
+                : tableRow.getHorizontalBorder();
+
+        BorderCode left = tableCell.getBrcLeft() != null
+                && tableCell.getBrcLeft().getBorderType() != 0 ? tableCell
+                .getBrcLeft() : leftest ? tableRow.getLeftBorder() : tableRow
+                .getVerticalBorder();
+        BorderCode right = tableCell.getBrcRight() != null
+                && tableCell.getBrcRight().getBorderType() != 0 ? tableCell
+                .getBrcRight() : rightest ? tableRow.getRightBorder()
+                : tableRow.getVerticalBorder();
+
+        setBorder( element, bottom, "bottom" );
+        setBorder( element, left, "left" );
+        setBorder( element, right, "right" );
+        setBorder( element, top, "top" );
     }
 
-    public static void setTableRowProperties(TableRow tableRow,
-           Element tableRowElement) {
-       if (tableRow.getRowHeight() > 0) {
-           tableRowElement.setAttribute("height",
-                   (tableRow.getRowHeight() / TWIPS_PER_INCH) + "in");
-       }
-       if (!tableRow.cantSplit()) {
-           tableRowElement.setAttribute("keep-together", "always");
-       }
+    public static void setTableRowProperties( TableRow tableRow,
+            Element tableRowElement )
+    {
+        if ( tableRow.getRowHeight() > 0 )
+        {
+            tableRowElement.setAttribute( "height",
+                    ( tableRow.getRowHeight() / TWIPS_PER_INCH ) + "in" );
+        }
+        if ( !tableRow.cantSplit() )
+        {
+            tableRowElement.setAttribute( "keep-together", "always" );
+        }
     }
 
 }
diff --git a/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordToHtmlExtractor.java b/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordToHtmlExtractor.java
new file mode 100644 (file)
index 0000000..6f27e44
--- /dev/null
@@ -0,0 +1,475 @@
+/* ====================================================================
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+==================================================================== */
+package org.apache.poi.hwpf.extractor;
+
+import java.io.File;
+import java.io.FileWriter;
+import java.util.List;
+import java.util.Stack;
+
+import javax.xml.parsers.DocumentBuilderFactory;
+import javax.xml.transform.OutputKeys;
+import javax.xml.transform.Transformer;
+import javax.xml.transform.TransformerFactory;
+import javax.xml.transform.dom.DOMSource;
+import javax.xml.transform.stream.StreamResult;
+
+import org.apache.poi.hwpf.HWPFDocument;
+import org.apache.poi.hwpf.HWPFDocumentCore;
+import org.apache.poi.hwpf.usermodel.CharacterRun;
+import org.apache.poi.hwpf.usermodel.Paragraph;
+import org.apache.poi.hwpf.usermodel.Picture;
+import org.apache.poi.hwpf.usermodel.Section;
+import org.apache.poi.hwpf.usermodel.SectionProperties;
+import org.apache.poi.hwpf.usermodel.Table;
+import org.apache.poi.hwpf.usermodel.TableCell;
+import org.apache.poi.hwpf.usermodel.TableRow;
+import org.apache.poi.util.POILogFactory;
+import org.apache.poi.util.POILogger;
+import org.w3c.dom.Document;
+import org.w3c.dom.Element;
+import org.w3c.dom.Text;
+
+import static org.apache.poi.hwpf.extractor.AbstractWordUtils.TWIPS_PER_INCH;
+
+/**
+ * @author Sergey Vladimirov (vlsergey {at} gmail {dot} com)
+ */
+public class WordToHtmlExtractor extends AbstractWordExtractor
+{
+
+    /**
+     * Holds the property values already applied to the current <tt>p</tt>
+     * element. Child <tt>span</tt> elements shall not repeat these
+     * properties.
+     */
+    private static class BlockProperies
+    {
+        // font name recorded for the enclosing paragraph element
+        final String pFontName;
+        // font size recorded for the enclosing paragraph element
+        final int pFontSize;
+
+        public BlockProperies( String pFontName, int pFontSize )
+        {
+            this.pFontName = pFontName;
+            this.pFontSize = pFontSize;
+        }
+    }
+
+    private static final POILogger logger = POILogFactory
+            .getLogger( WordToHtmlExtractor.class );
+
+    /**
+     * Builds the inline style for a section: the four margins and, when the
+     * section has more than one column, the column count and column gap.
+     * 
+     * @param section
+     *            section whose properties are read
+     * @return style declarations, each terminated by <tt>"; "</tt>
+     */
+    private static String getSectionStyle( Section section )
+    {
+        final SectionProperties props = WordToHtmlUtils
+                .getSectionProperties( section );
+
+        final float left = props.getDxaLeft() / TWIPS_PER_INCH;
+        final float right = props.getDxaRight() / TWIPS_PER_INCH;
+        final float top = props.getDyaTop() / TWIPS_PER_INCH;
+        final float bottom = props.getDyaBottom() / TWIPS_PER_INCH;
+
+        final StringBuilder css = new StringBuilder();
+        // margin shorthand order: top, right, bottom, left
+        css.append( "margin: " );
+        css.append( top ).append( "in " );
+        css.append( right ).append( "in " );
+        css.append( bottom ).append( "in " );
+        css.append( left ).append( "in; " );
+
+        // single-column sections carry no column declarations
+        if ( !( props.getCcolM1() > 0 ) )
+            return css.toString();
+
+        css.append( "column-count: " + ( props.getCcolM1() + 1 ) + "; " );
+        if ( props.getFEvenlySpaced() )
+        {
+            css.append( "column-gap: "
+                    + ( props.getDxaColumns() / TWIPS_PER_INCH ) + "in; " );
+        }
+        else
+        {
+            css.append( "column-gap: 0.25in; " );
+        }
+        return css.toString();
+    }
+
+    /**
+     * Java main() interface to interact with WordToHtmlExtractor
+     * 
+     * <p>
+     * Usage: WordToHtmlExtractor infile outfile
+     * </p>
+     * Where infile is an input .doc file ( Word 95-2007) which will be rendered
+     * as HTML into outfile. The output is serialized as UTF-8.
+     * 
+     * @param args
+     *            <tt>args[0]</tt> is the input file, <tt>args[1]</tt> the file
+     *            to write; with fewer arguments a usage line is printed
+     */
+    public static void main( String[] args )
+    {
+        if ( args.length < 2 )
+        {
+            System.err
+                    .println( "Usage: WordToHtmlExtractor <inputFile.doc> <saveTo.html>" );
+            return;
+        }
+
+        System.out.println( "Converting " + args[0] );
+        System.out.println( "Saving output to " + args[1] );
+        try
+        {
+            Document doc = WordToHtmlExtractor.process( new File( args[0] ) );
+
+            // Give the serializer a byte stream rather than a FileWriter: a
+            // Writer would re-encode the characters with the platform default
+            // charset, which may differ from the encoding requested below.
+            // Fully qualified names keep this method independent of the
+            // file's import list.
+            java.io.OutputStream out = new java.io.FileOutputStream( args[1] );
+            try
+            {
+                DOMSource domSource = new DOMSource( doc );
+                StreamResult streamResult = new StreamResult( out );
+
+                TransformerFactory tf = TransformerFactory.newInstance();
+                Transformer serializer = tf.newTransformer();
+                // TODO set encoding from a command argument
+                serializer.setOutputProperty( OutputKeys.ENCODING, "UTF-8" );
+                serializer.setOutputProperty( OutputKeys.INDENT, "yes" );
+                serializer.setOutputProperty( OutputKeys.METHOD, "html" );
+                serializer.transform( domSource, streamResult );
+            }
+            finally
+            {
+                // release the file handle even when serialization fails
+                out.close();
+            }
+        }
+        catch ( Exception e )
+        {
+            e.printStackTrace();
+        }
+    }
+
+    /**
+     * Loads a Word file and converts it into a freshly created DOM document.
+     * 
+     * @param docFile
+     *            Word file to load
+     * @return the DOM document held by the extractor after processing
+     */
+    static Document process( File docFile ) throws Exception
+    {
+        // load first, so an unreadable file fails before any DOM is built
+        final HWPFDocumentCore loaded = WordToHtmlUtils.loadDoc( docFile );
+
+        final Document target = DocumentBuilderFactory.newInstance()
+                .newDocumentBuilder().newDocument();
+        final WordToHtmlExtractor extractor = new WordToHtmlExtractor( target );
+        extractor.processDocument( loaded );
+        return extractor.getDocument();
+    }
+
+    private final Stack<BlockProperies> blocksProperies = new Stack<BlockProperies>();
+
+    private final HtmlDocumentFacade htmlDocumentFacade;
+
+    /**
+     * Creates a new instance of {@link WordToHtmlExtractor}. One instance can
+     * be used to output several {@link HWPFDocument}s into a single HTML
+     * document.
+     * 
+     * @param document
+     *            XML DOM Document used as HTML document; it is wrapped in a
+     *            new {@link HtmlDocumentFacade}
+     */
+    public WordToHtmlExtractor( Document document )
+    {
+        this.htmlDocumentFacade = new HtmlDocumentFacade( document );
+    }
+
+    /**
+     * @return the DOM document exposed by the underlying
+     *         {@link HtmlDocumentFacade}
+     */
+    public Document getDocument()
+    {
+        return htmlDocumentFacade.getDocument();
+    }
+
+    /**
+     * Outputs one run of text as a <tt>span</tt> appended to
+     * <tt>pElement</tt>.
+     * <p>
+     * Font family and font size are written only when they differ from the
+     * values recorded for the enclosing block (top of the block properties
+     * stack), so the span does not repeat what the paragraph already
+     * declares. The remaining character properties are added by
+     * {@link WordToHtmlUtils#addCharactersProperties(CharacterRun, StringBuilder)}.
+     * 
+     * @param pElement
+     *            parent element of the new <tt>span</tt>
+     * @param characterRun
+     *            run supplying the formatting
+     * @param text
+     *            text to place inside the <tt>span</tt>
+     */
+    @Override
+    protected void outputCharacters( Element pElement,
+            CharacterRun characterRun, String text )
+    {
+        Element span = htmlDocumentFacade.document.createElement( "span" );
+        pElement.appendChild( span );
+
+        StringBuilder style = new StringBuilder();
+        BlockProperies blockProperies = this.blocksProperies.peek();
+        if ( characterRun.getFontName() != null
+                && !WordToHtmlUtils.equals( characterRun.getFontName(),
+                        blockProperies.pFontName ) )
+        {
+            style.append( "font-family: " + characterRun.getFontName() + "; " );
+        }
+        if ( characterRun.getFontSize() / 2 != blockProperies.pFontSize )
+        {
+            // A CSS length needs a unit; a bare number is dropped by
+            // browsers. Presumably getFontSize() is in half-points, hence
+            // the division by two -- confirm against CharacterRun.
+            style.append( "font-size: " + characterRun.getFontSize() / 2
+                    + "pt; " );
+        }
+
+        WordToHtmlUtils.addCharactersProperties( characterRun, style );
+        if ( style.length() != 0 )
+            span.setAttribute( "style", style.toString() );
+
+        Text textNode = htmlDocumentFacade.createText( text );
+        span.appendChild( textNode );
+    }
+
+    /**
+     * Appends a hyperlink element to <tt>currentBlock</tt> and, when the text
+     * range is not empty, renders the character runs of that range inside
+     * the link.
+     */
+    protected void processHyperlink( HWPFDocumentCore wordDocument,
+            Element currentBlock, Paragraph paragraph,
+            List<CharacterRun> characterRuns, int currentTableLevel,
+            String hyperlink, int beginTextInclusive, int endTextExclusive )
+    {
+        final Element anchor = htmlDocumentFacade.createHyperlink( hyperlink );
+        currentBlock.appendChild( anchor );
+
+        // empty range: the link element stays without content
+        if ( beginTextInclusive >= endTextExclusive )
+            return;
+
+        processCharacters( wordDocument, currentTableLevel, paragraph, anchor,
+                characterRuns, beginTextInclusive, endTextExclusive );
+    }
+
+    /**
+     * Extension point for pictures. An overriding implementation shall store
+     * the image bytes in an external file, converting them if necessary.
+     * Images shall be stored using PNG format; other formats may be not
+     * supported by the user's browser.
+     * <p>
+     * This default implementation stores nothing: it only appends a comment
+     * node naming the suggested file of the picture.
+     * <p>
+     * Please note the
+     * {@link WordToHtmlUtils#setPictureProperties(Picture, Element)} method.
+     * 
+     * @param currentBlock
+     *            currently processed HTML element, like <tt>p</tt>. Shall be
+     *            used as parent of newly created <tt>img</tt>
+     * @param inlined
+     *            if image is inlined
+     * @param picture
+     *            HWPF object, contained picture data and properties
+     */
+    protected void processImage( Element currentBlock, boolean inlined,
+            Picture picture )
+    {
+        // no default implementation -- leave a placeholder comment only
+        final String placeholder = "Image link to '"
+                + picture.suggestFullFileName() + "' can be here";
+        currentBlock.appendChild( htmlDocumentFacade.document
+                .createComment( placeholder ) );
+    }
+
+    /**
+     * Appends a link to the in-document target <tt>"#" + pageref</tt> and,
+     * when the text range is not empty, renders the character runs of that
+     * range inside the link.
+     */
+    protected void processPageref( HWPFDocumentCore hwpfDocument,
+            Element currentBlock, Paragraph paragraph,
+            List<CharacterRun> characterRuns, int currentTableLevel,
+            String pageref, int beginTextInclusive, int endTextExclusive )
+    {
+        final Element anchor = htmlDocumentFacade.createHyperlink( "#"
+                + pageref );
+        currentBlock.appendChild( anchor );
+
+        // empty range: the link element stays without content
+        if ( beginTextInclusive >= endTextExclusive )
+            return;
+
+        processCharacters( hwpfDocument, currentTableLevel, paragraph, anchor,
+                characterRuns, beginTextInclusive, endTextExclusive );
+    }
+
+    /**
+     * Renders one paragraph as a paragraph element appended to
+     * <tt>parentFopElement</tt>.
+     * <p>
+     * The font of the first character run is recorded on the block
+     * properties stack while the runs are processed, and is also written to
+     * the paragraph style. A paragraph without character runs is appended
+     * empty and without a style attribute.
+     * 
+     * @param bulletText
+     *            text placed before the runs when it is not empty
+     */
+    protected void processParagraph( HWPFDocumentCore hwpfDocument,
+            Element parentFopElement, int currentTableLevel,
+            Paragraph paragraph, String bulletText )
+    {
+        final Element paragraphElement = htmlDocumentFacade.createParagraph();
+        parentFopElement.appendChild( paragraphElement );
+
+        final StringBuilder css = new StringBuilder();
+        WordToHtmlUtils.addParagraphProperties( paragraph, css );
+
+        if ( paragraph.numCharacterRuns() == 0 )
+            return;
+
+        // defaults used when the first run is unavailable
+        String blockFontName = WordToHtmlUtils.EMPTY;
+        int blockFontSize = -1;
+
+        final CharacterRun firstRun = paragraph.getCharacterRun( 0 );
+        if ( firstRun != null )
+        {
+            blockFontSize = firstRun.getFontSize() / 2;
+            blockFontName = firstRun.getFontName();
+            WordToHtmlUtils.addFontFamily( blockFontName, css );
+            WordToHtmlUtils.addFontSize( blockFontSize, css );
+        }
+
+        blocksProperies.push( new BlockProperies( blockFontName,
+                blockFontSize ) );
+        try
+        {
+            if ( WordToHtmlUtils.isNotEmpty( bulletText ) )
+                paragraphElement.appendChild( htmlDocumentFacade
+                        .createText( bulletText ) );
+
+            final List<CharacterRun> runs = WordToHtmlUtils
+                    .findCharacterRuns( paragraph );
+            processCharacters( hwpfDocument, currentTableLevel, paragraph,
+                    paragraphElement, runs, 0, runs.size() );
+        }
+        finally
+        {
+            // always restore the stack for the enclosing block
+            blocksProperies.pop();
+        }
+
+        if ( css.length() > 0 )
+            paragraphElement.setAttribute( "style", css.toString() );
+    }
+
+    /**
+     * Renders a section as a styled <tt>div</tt> appended to the document
+     * body, then processes the paragraphs of the section inside that
+     * <tt>div</tt>.
+     */
+    protected void processSection( HWPFDocumentCore wordDocument,
+            Section section, int sectionCounter )
+    {
+        final Element sectionDiv = htmlDocumentFacade.document
+                .createElement( "div" );
+        final String sectionCss = getSectionStyle( section );
+        sectionDiv.setAttribute( "style", sectionCss );
+        htmlDocumentFacade.body.appendChild( sectionDiv );
+
+        processSectionParagraphes( wordDocument, sectionDiv, section, 0 );
+    }
+
+    /**
+     * Renders the only section of a document directly into the body element:
+     * the section style goes on the body itself and no wrapping element is
+     * created.
+     */
+    @Override
+    protected void processSingleSection( HWPFDocumentCore wordDocument,
+            Section section )
+    {
+        final Element body = htmlDocumentFacade.body;
+        final String sectionCss = getSectionStyle( section );
+        body.setAttribute( "style", sectionCss );
+
+        processSectionParagraphes( wordDocument, body, section, 0 );
+    }
+
+    /**
+     * Renders a Word table as a table element appended to <tt>flow</tt>.
+     * <p>
+     * Rows flagged as table header go into the header group and use header
+     * cells; all other rows go into the body group. Cells that continue a
+     * horizontal or vertical merge are skipped, and the first cell of a merge
+     * receives the matching <tt>colspan</tt> / <tt>rowspan</tt>. The last
+     * cell of a row shorter than the widest row is stretched with
+     * <tt>colspan</tt>. A cell without content receives an empty paragraph.
+     * <p>
+     * A table without body rows is not appended at all; a warning is logged
+     * instead.
+     * 
+     * @param hwpfDocument
+     *            document being converted
+     * @param flow
+     *            element the table is appended to
+     * @param table
+     *            table to render
+     * @param thisTableLevel
+     *            nesting level passed on when processing cell content
+     */
+    protected void processTable( HWPFDocumentCore hwpfDocument, Element flow,
+            Table table, int thisTableLevel )
+    {
+        Element tableHeader = htmlDocumentFacade.createTableHeader();
+        Element tableBody = htmlDocumentFacade.createTableBody();
+
+        final int tableRows = table.numRows();
+
+        // width of the widest row, used to pad shorter rows
+        int maxColumns = Integer.MIN_VALUE;
+        for ( int r = 0; r < tableRows; r++ )
+        {
+            maxColumns = Math.max( maxColumns, table.getRow( r ).numCells() );
+        }
+
+        for ( int r = 0; r < tableRows; r++ )
+        {
+            TableRow tableRow = table.getRow( r );
+
+            Element tableRowElement = htmlDocumentFacade.createTableRow();
+            StringBuilder tableRowStyle = new StringBuilder();
+            WordToHtmlUtils.addTableRowProperties( tableRow, tableRowStyle );
+
+            final int rowCells = tableRow.numCells();
+            for ( int c = 0; c < rowCells; c++ )
+            {
+                TableCell tableCell = tableRow.getCell( c );
+
+                // continuation cells are covered by the first cell's span
+                if ( tableCell.isMerged() && !tableCell.isFirstMerged() )
+                    continue;
+
+                if ( tableCell.isVerticallyMerged()
+                        && !tableCell.isFirstVerticallyMerged() )
+                    continue;
+
+                Element tableCellElement;
+                if ( tableRow.isTableHeader() )
+                {
+                    tableCellElement = htmlDocumentFacade
+                            .createTableHeaderCell();
+                }
+                else
+                {
+                    tableCellElement = htmlDocumentFacade.createTableCell();
+                }
+                StringBuilder tableCellStyle = new StringBuilder();
+                WordToHtmlUtils.addTableCellProperties( tableRow, tableCell,
+                        r == 0, r == tableRows - 1, c == 0, c == rowCells - 1,
+                        tableCellStyle );
+
+                if ( tableCell.isFirstMerged() )
+                {
+                    tableCellElement.setAttribute( "colspan", ""
+                            + countMergedColumns( tableRow, c ) );
+                }
+                else if ( c == rowCells - 1 && c != maxColumns - 1 )
+                {
+                    // last cell of a short row: stretch to the table width
+                    tableCellElement.setAttribute( "colspan", ""
+                            + ( maxColumns - c ) );
+                }
+
+                if ( tableCell.isFirstVerticallyMerged() )
+                {
+                    tableCellElement.setAttribute( "rowspan", ""
+                            + countMergedRows( table, r, c ) );
+                }
+
+                processSectionParagraphes( hwpfDocument, tableCellElement,
+                        tableCell, thisTableLevel );
+
+                if ( !tableCellElement.hasChildNodes() )
+                {
+                    tableCellElement.appendChild( htmlDocumentFacade
+                            .createParagraph() );
+                }
+                if ( tableCellStyle.length() > 0 )
+                    tableCellElement.setAttribute( "style",
+                            tableCellStyle.toString() );
+
+                tableRowElement.appendChild( tableCellElement );
+            }
+
+            if ( tableRowStyle.length() > 0 )
+                tableRowElement
+                        .setAttribute( "style", tableRowStyle.toString() );
+
+            if ( tableRow.isTableHeader() )
+            {
+                tableHeader.appendChild( tableRowElement );
+            }
+            else
+            {
+                tableBody.appendChild( tableRowElement );
+            }
+
+        }
+
+        final Element tableElement = htmlDocumentFacade.createTable();
+        if ( tableHeader.hasChildNodes() )
+        {
+            tableElement.appendChild( tableHeader );
+        }
+        if ( tableBody.hasChildNodes() )
+        {
+            tableElement.appendChild( tableBody );
+            flow.appendChild( tableElement );
+        }
+        else
+        {
+            logger.log(
+                    POILogger.WARN,
+                    "Table without body starting on offset "
+                            + table.getStartOffset() + " -- "
+                            + table.getEndOffset() );
+        }
+    }
+
+    /**
+     * Counts the consecutive horizontally merged cells of a row, starting at
+     * <tt>firstCell</tt>.
+     */
+    private static int countMergedColumns( TableRow tableRow, int firstCell )
+    {
+        final int rowCells = tableRow.numCells();
+        int count = 0;
+        for ( int c = firstCell; c < rowCells; c++ )
+        {
+            if ( !tableRow.getCell( c ).isMerged() )
+                break;
+            count++;
+        }
+        return count;
+    }
+
+    /**
+     * Counts the consecutive vertically merged cells of a column, starting at
+     * row <tt>firstRow</tt>.
+     */
+    private static int countMergedRows( Table table, int firstRow, int column )
+    {
+        final int tableRows = table.numRows();
+        int count = 0;
+        for ( int r = firstRow; r < tableRows; r++ )
+        {
+            TableRow nextRow = table.getRow( r );
+            // A row with numCells() == column has no cell at that index, so
+            // the guard must be "<=" ("<" let getCell() run out of range).
+            if ( nextRow.numCells() <= column )
+                break;
+            if ( !nextRow.getCell( column ).isVerticallyMerged() )
+                break;
+            count++;
+        }
+        return count;
+    }
+
+}
diff --git a/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordToHtmlUtils.java b/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordToHtmlUtils.java
new file mode 100644 (file)
index 0000000..4417f62
--- /dev/null
@@ -0,0 +1,292 @@
+/* ====================================================================
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+==================================================================== */
+package org.apache.poi.hwpf.extractor;
+
+import org.apache.poi.hwpf.usermodel.BorderCode;
+import org.apache.poi.hwpf.usermodel.CharacterProperties;
+import org.apache.poi.hwpf.usermodel.CharacterRun;
+import org.apache.poi.hwpf.usermodel.Paragraph;
+import org.apache.poi.hwpf.usermodel.Picture;
+import org.apache.poi.hwpf.usermodel.TableCell;
+import org.apache.poi.hwpf.usermodel.TableRow;
+import org.w3c.dom.Element;
+
+public class WordToHtmlUtils extends AbstractWordUtils
+{
+    /**
+     * Appends a <tt>font-weight</tt> declaration: <tt>bold</tt> when the flag
+     * is set, <tt>normal</tt> otherwise.
+     */
+    public static void addBold( final boolean bold, StringBuilder style )
+    {
+        final String weight;
+        if ( bold )
+            weight = "bold";
+        else
+            weight = "normal";
+
+        style.append( "font-weight: " );
+        style.append( weight );
+        style.append( ";" );
+    }
+
+    /**
+     * Appends style, color and width declarations for one border.
+     * <p>
+     * Nothing is appended when the border code is <tt>null</tt> or its
+     * border type is 0.
+     * 
+     * @param borderCode
+     *            border description, may be <tt>null</tt>
+     * @param where
+     *            side name such as <tt>top</tt>; when empty the declarations
+     *            use the plain <tt>border-*</tt> properties
+     * @param style
+     *            buffer the declarations are appended to
+     */
+    public static void addBorder( BorderCode borderCode, String where,
+            StringBuilder style )
+    {
+        if ( borderCode == null || borderCode.getBorderType() == 0 )
+            return;
+
+        // "border" for all sides, "border-<side>" for a single side
+        final String property = isEmpty( where ) ? "border" : "border-"
+                + where;
+
+        style.append( property + "-style: " + getBorderType( borderCode )
+                + "; " );
+        style.append( property + "-color: "
+                + getColor( borderCode.getColor() ) + "; " );
+        style.append( property + "-width: " + getBorderWidth( borderCode )
+                + "; " );
+    }
+
+    /**
+     * Appends the declarations describing the formatting of a character run:
+     * weight, italic, border, capitalization, highlight, decoration, shadow,
+     * small caps, super/subscript and visibility.
+     * <p>
+     * Strike-through and underline are merged into a single
+     * <tt>text-decoration</tt> declaration when both apply; two separate
+     * declarations would make the later one replace the earlier one.
+     * Super/subscript is expressed with <tt>vertical-align</tt>.
+     * <p>
+     * NOTE(review): <tt>text-shadow</tt> is written with a single length,
+     * which CSS does not accept -- confirm the intended offsets.
+     * 
+     * @param characterRun
+     *            run supplying the formatting
+     * @param style
+     *            buffer the declarations are appended to
+     */
+    public static void addCharactersProperties(
+            final CharacterRun characterRun, StringBuilder style )
+    {
+        final CharacterProperties clonedProperties = characterRun
+                .cloneProperties();
+        final boolean strikeThrough = characterRun.isStrikeThrough();
+        final boolean underlined = characterRun.getUnderlineCode() > 0;
+
+        if ( characterRun.isBold() )
+        {
+            style.append( "font-weight: bold; " );
+        }
+        if ( characterRun.isItalic() )
+        {
+            style.append( "font-style: italic; " );
+        }
+
+        addBorder( clonedProperties.getBrc(), EMPTY, style );
+
+        if ( characterRun.isCapitalized() )
+        {
+            style.append( "text-transform: uppercase; " );
+        }
+        if ( characterRun.isHighlighted() )
+        {
+            style.append( "background-color: "
+                    + getColor( clonedProperties.getIcoHighlight() ) + "; " );
+        }
+        if ( strikeThrough )
+        {
+            // carry the underline here, so it cannot override line-through
+            style.append( underlined ? "text-decoration: line-through underline; "
+                    : "text-decoration: line-through; " );
+        }
+        if ( characterRun.isShadowed() )
+        {
+            style.append( "text-shadow: " + characterRun.getFontSize() / 24
+                    + "pt; " );
+        }
+        if ( characterRun.isSmallCaps() )
+        {
+            style.append( "font-variant: small-caps; " );
+        }
+        if ( characterRun.getSubSuperScriptIndex() == 1 )
+        {
+            // baseline-shift is not an HTML CSS property
+            style.append( "vertical-align: super; " );
+            style.append( "font-size: smaller; " );
+        }
+        if ( characterRun.getSubSuperScriptIndex() == 2 )
+        {
+            style.append( "vertical-align: sub; " );
+            style.append( "font-size: smaller; " );
+        }
+        if ( underlined && !strikeThrough )
+        {
+            style.append( "text-decoration: underline; " );
+        }
+        if ( characterRun.isVanished() )
+        {
+            style.append( "visibility: hidden; " );
+        }
+    }
+
+    /**
+     * Appends a <tt>font-family</tt> declaration, terminated by
+     * <tt>"; "</tt> so that following declarations stay separate. Nothing is
+     * appended for an empty font family.
+     * 
+     * @param fontFamily
+     *            font family name, may be empty
+     * @param style
+     *            buffer the declaration is appended to
+     */
+    public static void addFontFamily( final String fontFamily,
+            StringBuilder style )
+    {
+        if ( isEmpty( fontFamily ) )
+            return;
+
+        // without the terminator the next declaration fuses with this value
+        style.append( "font-family: " + fontFamily + "; " );
+    }
+
+    /**
+     * Appends a <tt>font-size</tt> declaration in points, terminated by
+     * <tt>"; "</tt>.
+     * 
+     * @param fontSize
+     *            font size; presumably already in points, as callers halve
+     *            the run's font size first -- confirm
+     * @param style
+     *            buffer the declaration is appended to
+     */
+    public static void addFontSize( final int fontSize, StringBuilder style )
+    {
+        // a CSS length needs a unit, and the declaration needs a terminator
+        style.append( "font-size: " + fontSize + "pt; " );
+    }
+
+    /**
+     * Appends the first-line indent, left/right indents and before/after
+     * spacing of a paragraph; each value is skipped when it is zero.
+     * <p>
+     * NOTE(review): <tt>start-indent</tt>, <tt>end-indent</tt>,
+     * <tt>space-before</tt> and <tt>space-after</tt> look like XSL-FO
+     * property names rather than CSS -- confirm that HTML output should not
+     * use margin properties instead.
+     */
+    public static void addIndent( Paragraph paragraph, StringBuilder style )
+    {
+        addIndent( style, "text-indent", paragraph.getFirstLineIndent() );
+        addIndent( style, "start-indent", paragraph.getIndentFromLeft() );
+        addIndent( style, "end-indent", paragraph.getIndentFromRight() );
+        addIndent( style, "space-before", paragraph.getSpacingBefore() );
+        addIndent( style, "space-after", paragraph.getSpacingAfter() );
+    }
+
+    /**
+     * Appends <tt>cssName: &lt;value&gt;pt; </tt>, where the value is the
+     * twips value divided by <tt>TWIPS_PER_PT</tt>. A zero value appends
+     * nothing.
+     */
+    private static void addIndent( StringBuilder style, final String cssName,
+            final int twipsValue )
+    {
+        if ( twipsValue != 0 )
+        {
+            style.append( cssName );
+            style.append( ": " );
+            style.append( twipsValue / TWIPS_PER_PT );
+            style.append( "pt; " );
+        }
+    }
+
+    /**
+     * Appends a <tt>text-align</tt> declaration when the justification code
+     * of the paragraph maps to a non-empty value.
+     */
+    public static void addJustification( Paragraph paragraph,
+            final StringBuilder style )
+    {
+        final String alignment = getJustification( paragraph
+                .getJustification() );
+        if ( !isNotEmpty( alignment ) )
+            return;
+
+        style.append( "text-align: " );
+        style.append( alignment );
+        style.append( "; " );
+    }
+
+    /**
+     * Appends the paragraph-level declarations: indents and spacing,
+     * justification, the four borders, page break, hyphenation and keep
+     * settings, followed by two whitespace-handling declarations that are
+     * always written.
+     * <p>
+     * NOTE(review): <tt>hyphenate</tt>, <tt>keep-together.within-page</tt>,
+     * <tt>keep-with-next.within-page</tt>, <tt>linefeed-treatment</tt> and
+     * <tt>white-space-collapse</tt> look like XSL-FO properties -- confirm
+     * whether HTML output needs CSS equivalents.
+     * 
+     * @param paragraph
+     *            paragraph supplying the properties
+     * @param style
+     *            buffer the declarations are appended to
+     */
+    public static void addParagraphProperties( Paragraph paragraph,
+            StringBuilder style )
+    {
+        addIndent( paragraph, style );
+        addJustification( paragraph, style );
+
+        addBorder( paragraph.getBottomBorder(), "bottom", style );
+        addBorder( paragraph.getLeftBorder(), "left", style );
+        addBorder( paragraph.getRightBorder(), "right", style );
+        addBorder( paragraph.getTopBorder(), "top", style );
+
+        if ( paragraph.pageBreakBefore() )
+        {
+            style.append( "break-before: page; " );
+        }
+
+        // written unconditionally, with the boolean rendered as true/false
+        style.append( "hyphenate: " + paragraph.isAutoHyphenated() + "; " );
+
+        if ( paragraph.keepOnPage() )
+        {
+            style.append( "keep-together.within-page: always; " );
+        }
+
+        if ( paragraph.keepWithNext() )
+        {
+            style.append( "keep-with-next.within-page: always; " );
+        }
+
+        style.append( "linefeed-treatment: preserve; " );
+        style.append( "white-space-collapse: false; " );
+    }
+
+    /**
+     * Appends width, horizontal padding and the four borders of a table
+     * cell.
+     * <p>
+     * For every side the border of the cell itself is used when it is set
+     * (not <tt>null</tt> and border type other than 0). Otherwise the row
+     * supplies it: the outer border of the row for a cell on that edge of
+     * the table, the inner horizontal/vertical border for any other cell.
+     * 
+     * @param toppest
+     *            cell is in the first row
+     * @param bottomest
+     *            cell is in the last row
+     * @param leftest
+     *            cell is the first one of its row
+     * @param rightest
+     *            cell is the last one of its row
+     */
+    public static void addTableCellProperties( TableRow tableRow,
+            TableCell tableCell, boolean toppest, boolean bottomest,
+            boolean leftest, boolean rightest, StringBuilder style )
+    {
+        style.append( "width: " + ( tableCell.getWidth() / TWIPS_PER_INCH )
+                + "in; " );
+
+        // the same half-gap is used on both sides
+        final String gap = ( tableRow.getGapHalf() / TWIPS_PER_INCH ) + "in; ";
+        style.append( "padding-start: " + gap );
+        style.append( "padding-end: " + gap );
+
+        BorderCode top = tableCell.getBrcTop();
+        if ( top == null || top.getBorderType() == 0 )
+        {
+            top = toppest ? tableRow.getTopBorder() : tableRow
+                    .getHorizontalBorder();
+        }
+
+        BorderCode bottom = tableCell.getBrcBottom();
+        if ( bottom == null || bottom.getBorderType() == 0 )
+        {
+            bottom = bottomest ? tableRow.getBottomBorder() : tableRow
+                    .getHorizontalBorder();
+        }
+
+        BorderCode left = tableCell.getBrcLeft();
+        if ( left == null || left.getBorderType() == 0 )
+        {
+            left = leftest ? tableRow.getLeftBorder() : tableRow
+                    .getVerticalBorder();
+        }
+
+        BorderCode right = tableCell.getBrcRight();
+        if ( right == null || right.getBorderType() == 0 )
+        {
+            right = rightest ? tableRow.getRightBorder() : tableRow
+                    .getVerticalBorder();
+        }
+
+        addBorder( bottom, "bottom", style );
+        addBorder( left, "left", style );
+        addBorder( right, "right", style );
+        addBorder( top, "top", style );
+    }
+
+    /**
+     * Appends the row height (only when positive) and, for rows whose
+     * {@link TableRow#cantSplit()} returns <tt>false</tt>, a
+     * <tt>keep-together</tt> declaration.
+     */
+    public static void addTableRowProperties( TableRow tableRow,
+            StringBuilder style )
+    {
+        if ( tableRow.getRowHeight() > 0 )
+        {
+            style.append( "height: " );
+            style.append( tableRow.getRowHeight() / TWIPS_PER_INCH );
+            style.append( "in; " );
+        }
+
+        if ( tableRow.cantSplit() )
+            return;
+
+        style.append( "keep-together: always; " );
+    }
+
+    /**
+     * Writes size, scaling, alignment and cropping of a picture as
+     * attributes of <tt>graphicElement</tt>.
+     * <p>
+     * Width and height come from the goal size of the picture divided by
+     * <tt>TWIPS_PER_PT</tt>; a positive aspect ratio is applied as a
+     * percentage first. When any crop value is non-zero, a <tt>clip</tt>
+     * rectangle is written together with <tt>overflow="hidden"</tt>.
+     * <p>
+     * NOTE(review): <tt>content-width</tt>, <tt>content-height</tt> and
+     * <tt>scaling</tt> look like XSL-FO attributes -- confirm they are wanted
+     * on HTML elements.
+     * 
+     * @param picture
+     *            picture supplying the properties
+     * @param graphicElement
+     *            element that receives the attributes
+     */
+    public static void setPictureProperties( Picture picture,
+            Element graphicElement )
+    {
+        final int aspectRatioX = picture.getAspectRatioX();
+        final int aspectRatioY = picture.getAspectRatioY();
+
+        if ( aspectRatioX > 0 )
+        {
+            graphicElement
+                    .setAttribute( "content-width", ( ( picture.getDxaGoal()
+                            * aspectRatioX / 100 ) / TWIPS_PER_PT )
+                            + "pt" );
+        }
+        else
+            graphicElement.setAttribute( "content-width",
+                    ( picture.getDxaGoal() / TWIPS_PER_PT ) + "pt" );
+
+        if ( aspectRatioY > 0 )
+            graphicElement
+                    .setAttribute( "content-height", ( ( picture.getDyaGoal()
+                            * aspectRatioY / 100 ) / TWIPS_PER_PT )
+                            + "pt" );
+        else
+            graphicElement.setAttribute( "content-height",
+                    ( picture.getDyaGoal() / TWIPS_PER_PT ) + "pt" );
+
+        if ( aspectRatioX <= 0 || aspectRatioY <= 0 )
+        {
+            graphicElement.setAttribute( "scaling", "uniform" );
+        }
+        else
+        {
+            graphicElement.setAttribute( "scaling", "non-uniform" );
+        }
+
+        graphicElement.setAttribute( "vertical-align", "text-bottom" );
+
+        if ( picture.getDyaCropTop() != 0 || picture.getDxaCropRight() != 0
+                || picture.getDyaCropBottom() != 0
+                || picture.getDxaCropLeft() != 0 )
+        {
+            int rectTop = picture.getDyaCropTop() / TWIPS_PER_PT;
+            int rectRight = picture.getDxaCropRight() / TWIPS_PER_PT;
+            int rectBottom = picture.getDyaCropBottom() / TWIPS_PER_PT;
+            int rectLeft = picture.getDxaCropLeft() / TWIPS_PER_PT;
+            graphicElement.setAttribute( "clip", "rect(" + rectTop + "pt, "
+                    + rectRight + "pt, " + rectBottom + "pt, " + rectLeft
+                    + "pt)" );
+            // attribute name was misspelled "oveerflow"
+            graphicElement.setAttribute( "overflow", "hidden" );
+        }
+    }
+
+}
diff --git a/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordToExtractorSuite.java b/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordToExtractorSuite.java
new file mode 100644 (file)
index 0000000..62cfb99
--- /dev/null
@@ -0,0 +1,114 @@
+/* ====================================================================
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+==================================================================== */
+package org.apache.poi.hwpf.extractor;
+
+import java.io.File;
+import java.io.FilenameFilter;
+import java.io.StringWriter;
+import java.util.Arrays;
+import java.util.List;
+
+import javax.xml.parsers.DocumentBuilderFactory;
+import javax.xml.transform.OutputKeys;
+import javax.xml.transform.Transformer;
+import javax.xml.transform.TransformerFactory;
+import javax.xml.transform.dom.DOMSource;
+import javax.xml.transform.stream.StreamResult;
+
+import junit.framework.Test;
+import junit.framework.TestCase;
+import junit.framework.TestSuite;
+import org.apache.poi.POIDataSamples;
+import org.apache.poi.hwpf.HWPFDocumentCore;
+
+public class TestWordToExtractorSuite
+{
+    /**
+     * YK: a quick hack to exclude failing documents from the suite.
+     */
+    private static List<String> failingFiles = Arrays.asList();
+
+    /**
+     * Builds the suite: for every <tt>.doc</tt> sample that is not listed in
+     * {@link #failingFiles}, one test case tagged <tt>[FO]</tt> and one
+     * tagged <tt>[HTML]</tt>.
+     */
+    public static Test suite()
+    {
+        final TestSuite result = new TestSuite();
+
+        final File samples = POIDataSamples.getDocumentInstance().getFile(
+                "../document" );
+        final FilenameFilter docFilter = new FilenameFilter()
+        {
+            public boolean accept( File dir, String name )
+            {
+                if ( !name.endsWith( ".doc" ) )
+                    return false;
+                return !failingFiles.contains( name );
+            }
+        };
+
+        for ( final File sample : samples.listFiles( docFilter ) )
+        {
+            final String sampleName = sample.getName();
+
+            result.addTest( new TestCase( sampleName + " [FO]" )
+            {
+                public void runTest() throws Exception
+                {
+                    test( sample, false );
+                }
+            } );
+            result.addTest( new TestCase( sampleName + " [HTML]" )
+            {
+                public void runTest() throws Exception
+                {
+                    test( sample, true );
+                }
+            } );
+        }
+
+        return result;
+    }
+
+    /**
+     * Converts one sample file and serializes the result into memory; the
+     * test passes when no exception is thrown.
+     * <p>
+     * A file that cannot be loaded at all is skipped silently, because that
+     * is not a fault of the extractor under test.
+     * 
+     * @param child
+     *            sample Word file
+     * @param html
+     *            <tt>true</tt> to run {@link WordToHtmlExtractor} and
+     *            serialize with the <tt>html</tt> output method,
+     *            <tt>false</tt> to run {@link WordToFoExtractor}
+     */
+    protected static void test( File child, boolean html ) throws Exception
+    {
+        HWPFDocumentCore hwpfDocument;
+        try
+        {
+            hwpfDocument = AbstractWordUtils.loadDoc( child );
+        }
+        catch ( Exception exc )
+        {
+            // unable to parse file -- not an extractor fault
+            return;
+        }
+
+        // pick the extractor the flag asks for; previously the FO extractor
+        // ran for both the [FO] and the [HTML] test case
+        final DOMSource source;
+        if ( html )
+        {
+            WordToHtmlExtractor wordToHtmlExtractor = new WordToHtmlExtractor(
+                    DocumentBuilderFactory.newInstance().newDocumentBuilder()
+                            .newDocument() );
+            wordToHtmlExtractor.processDocument( hwpfDocument );
+            source = new DOMSource( wordToHtmlExtractor.getDocument() );
+        }
+        else
+        {
+            WordToFoExtractor wordToFoExtractor = new WordToFoExtractor(
+                    DocumentBuilderFactory.newInstance().newDocumentBuilder()
+                            .newDocument() );
+            wordToFoExtractor.processDocument( hwpfDocument );
+            source = new DOMSource( wordToFoExtractor.getDocument() );
+        }
+
+        StringWriter stringWriter = new StringWriter();
+
+        Transformer transformer = TransformerFactory.newInstance()
+                .newTransformer();
+        transformer.setOutputProperty( OutputKeys.ENCODING, "utf-8" );
+        transformer.setOutputProperty( OutputKeys.INDENT, "yes" );
+        // output properties only take effect when set before transform()
+        if ( html )
+            transformer.setOutputProperty( OutputKeys.METHOD, "html" );
+
+        transformer.transform( source, new StreamResult( stringWriter ) );
+
+        // no exceptions
+    }
+}
diff --git a/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordToFoExtractorSuite.java b/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordToFoExtractorSuite.java
deleted file mode 100644 (file)
index 4844cf6..0000000
+++ /dev/null
@@ -1,92 +0,0 @@
-package org.apache.poi.hwpf.extractor;
-
-import java.io.File;
-import java.io.FileInputStream;
-import java.io.FilenameFilter;
-import java.io.StringWriter;
-import java.util.Arrays;
-import java.util.Collections;
-import java.util.List;
-import java.util.Set;
-
-import javax.xml.parsers.DocumentBuilderFactory;
-import javax.xml.transform.OutputKeys;
-import javax.xml.transform.Transformer;
-import javax.xml.transform.TransformerFactory;
-import javax.xml.transform.dom.DOMSource;
-import javax.xml.transform.stream.StreamResult;
-
-import org.apache.poi.EncryptedDocumentException;
-
-import org.apache.poi.hwpf.OldWordFileFormatException;
-
-import junit.framework.Test;
-import junit.framework.TestCase;
-import junit.framework.TestSuite;
-import org.apache.poi.POIDataSamples;
-import org.apache.poi.hwpf.HWPFDocument;
-
-public class TestWordToFoExtractorSuite
-{
-    /**
-     * YK: a quick hack to exclude failing documents from the suite.
-     *
-     * WordToFoExtractor stumbles on Bug33519.doc with a NPE
-     */
-    private static List<String> failingFiles = Arrays.asList("Bug33519.doc");
-
-    public static Test suite() {
-        TestSuite suite = new TestSuite();
-
-        File directory = POIDataSamples.getDocumentInstance().getFile(
-                "../document");
-        for (final File child : directory.listFiles(new FilenameFilter() {
-            public boolean accept(File dir, String name) {
-                return name.endsWith(".doc") && !failingFiles.contains(name);
-            }
-        })) {
-            final String name = child.getName();
-            suite.addTest(new TestCase(name) {
-                public void runTest() throws Exception {
-                    test(child);
-                }
-            });
-        }
-
-        return suite;
-    }
-
-    protected static void test( File child ) throws Exception
-    {
-        HWPFDocument hwpfDocument;
-        FileInputStream fileInputStream = new FileInputStream( child );
-        try
-        {
-            hwpfDocument = new HWPFDocument( fileInputStream );
-        }
-        catch ( Exception exc )
-        {
-            // unable to parse file -- not WordToFoExtractor fault
-            return;
-        }
-        finally
-        {
-            fileInputStream.close();
-        }
-
-        WordToFoExtractor wordToFoExtractor = new WordToFoExtractor(
-                DocumentBuilderFactory.newInstance().newDocumentBuilder()
-                        .newDocument() );
-        wordToFoExtractor.processDocument( hwpfDocument );
-
-        StringWriter stringWriter = new StringWriter();
-
-        Transformer transformer = TransformerFactory.newInstance()
-                .newTransformer();
-        transformer.setOutputProperty( OutputKeys.INDENT, "yes" );
-        transformer.transform(
-                new DOMSource( wordToFoExtractor.getDocument() ),
-                new StreamResult( stringWriter ) );
-        // no exceptions
-    }
-}
diff --git a/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordToHtmlExtractor.java b/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordToHtmlExtractor.java
new file mode 100644 (file)
index 0000000..f758e6f
--- /dev/null
@@ -0,0 +1,95 @@
+/* ====================================================================
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+==================================================================== */
+package org.apache.poi.hwpf.extractor;
+
+import java.io.StringWriter;
+
+import javax.xml.parsers.DocumentBuilderFactory;
+import javax.xml.transform.OutputKeys;
+import javax.xml.transform.Transformer;
+import javax.xml.transform.TransformerFactory;
+import javax.xml.transform.dom.DOMSource;
+import javax.xml.transform.stream.StreamResult;
+
+import junit.framework.TestCase;
+import org.apache.poi.POIDataSamples;
+import org.apache.poi.hwpf.HWPFDocument;
+
+/**
+ * Test cases for {@link WordToHtmlExtractor}
+ * 
+ * @author Sergey Vladimirov (vlsergey {at} gmail {dot} com)
+ */
+public class TestWordToHtmlExtractor extends TestCase
+{
+    /**
+     * Converts the named sample document with {@link WordToHtmlExtractor} and
+     * serializes the resulting DOM using the "html" output method.
+     * 
+     * @param sampleFileName
+     *            name of the sample resource to open through
+     *            {@link POIDataSamples}
+     * @return serialized text of the converted document
+     * @throws Exception
+     *             if the sample cannot be read, converted or serialized
+     */
+    private static String getHtmlText( final String sampleFileName )
+            throws Exception
+    {
+        HWPFDocument hwpfDocument;
+        java.io.InputStream sampleStream = POIDataSamples
+                .getDocumentInstance().openResourceAsStream( sampleFileName );
+        try
+        {
+            hwpfDocument = new HWPFDocument( sampleStream );
+        }
+        finally
+        {
+            // release the sample resource even when parsing fails
+            sampleStream.close();
+        }
+
+        WordToHtmlExtractor wordToHtmlExtractor = new WordToHtmlExtractor(
+                DocumentBuilderFactory.newInstance().newDocumentBuilder()
+                        .newDocument() );
+        wordToHtmlExtractor.processDocument( hwpfDocument );
+
+        StringWriter stringWriter = new StringWriter();
+
+        Transformer transformer = TransformerFactory.newInstance()
+                .newTransformer();
+        transformer.setOutputProperty( OutputKeys.INDENT, "yes" );
+        transformer.setOutputProperty( OutputKeys.ENCODING, "utf-8" );
+        transformer.setOutputProperty( OutputKeys.METHOD, "html" );
+        transformer.transform(
+                new DOMSource( wordToHtmlExtractor.getDocument() ),
+                new StreamResult( stringWriter ) );
+
+        return stringWriter.toString();
+    }
+
+    /**
+     * The long digit run in Bug46610_2.doc must appear in the output.
+     */
+    public void testBug46610_2() throws Exception
+    {
+        String result = getHtmlText( "Bug46610_2.doc" );
+        assertTrue( result
+                .contains( "012345678911234567892123456789312345678941234567890123456789112345678921234567893123456789412345678" ) );
+    }
+
+    /**
+     * equation.doc must produce the image placeholder comment for '0.emf'.
+     */
+    public void testEquation() throws Exception
+    {
+        String result = getHtmlText( "equation.doc" );
+
+        assertTrue( result
+                .contains( "<!--Image link to '0.emf' can be here-->" ) );
+    }
+
+    /**
+     * hyperlink.doc must produce an anchor to the external URI and its text.
+     */
+    public void testHyperlink() throws Exception
+    {
+        String result = getHtmlText( "hyperlink.doc" );
+
+        assertTrue( result.contains( "<a href=\"http://testuri.org/\">" ) );
+        assertTrue( result.contains( "Hyperlink text" ) );
+    }
+
+    /**
+     * pageref.doc must produce an anchor to the "#userref" fragment.
+     */
+    public void testPageref() throws Exception
+    {
+        String result = getHtmlText( "pageref.doc" );
+
+        assertTrue( result.contains( "<a href=\"#userref\">" ) );
+        // weak check: any "1" anywhere in the output satisfies it
+        assertTrue( result.contains( "1" ) );
+    }
+}