+++ /dev/null
-/*
- * ====================================================================
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- * ====================================================================
- */
-package org.apache.poi.hwpf.extractor;
-
-import org.w3c.dom.Document;
-import org.w3c.dom.Element;
-import org.w3c.dom.Text;
-
-public abstract class AbstractToFoExtractor
-{
-
- private static final String NS_XSLFO = "http://www.w3.org/1999/XSL/Format";
-
- protected final Document document;
- protected final Element layoutMasterSet;
- protected final Element root;
-
- public AbstractToFoExtractor( Document document )
- {
- this.document = document;
-
- root = document.createElementNS( NS_XSLFO, "fo:root" );
- document.appendChild( root );
-
- layoutMasterSet = document.createElementNS( NS_XSLFO,
- "fo:layout-master-set" );
- root.appendChild( layoutMasterSet );
- }
-
- protected Element addFlowToPageSequence( final Element pageSequence,
- String flowName )
- {
- final Element flow = document.createElementNS( NS_XSLFO, "fo:flow" );
- flow.setAttribute( "flow-name", flowName );
- pageSequence.appendChild( flow );
-
- return flow;
- }
-
- protected Element addListItem( Element listBlock )
- {
- Element result = createListItem();
- listBlock.appendChild( result );
- return result;
- }
-
- protected Element addListItemBody( Element listItem )
- {
- Element result = createListItemBody();
- listItem.appendChild( result );
- return result;
- }
-
- protected Element addListItemLabel( Element listItem, String text )
- {
- Element result = createListItemLabel( text );
- listItem.appendChild( result );
- return result;
- }
-
- protected Element addPageSequence( String pageMaster )
- {
- final Element pageSequence = document.createElementNS( NS_XSLFO,
- "fo:page-sequence" );
- pageSequence.setAttribute( "master-reference", pageMaster );
- root.appendChild( pageSequence );
- return pageSequence;
- }
-
- protected Element addRegionBody( Element pageMaster )
- {
- final Element regionBody = document.createElementNS( NS_XSLFO,
- "fo:region-body" );
- pageMaster.appendChild( regionBody );
-
- return regionBody;
- }
-
- protected Element addSimplePageMaster( String masterName )
- {
- final Element simplePageMaster = document.createElementNS( NS_XSLFO,
- "fo:simple-page-master" );
- simplePageMaster.setAttribute( "master-name", masterName );
- layoutMasterSet.appendChild( simplePageMaster );
-
- return simplePageMaster;
- }
-
- protected Element createBasicLinkExternal( String externalDestination )
- {
- final Element basicLink = document.createElementNS( NS_XSLFO,
- "fo:basic-link" );
- basicLink.setAttribute( "external-destination", externalDestination );
- return basicLink;
- }
-
- protected Element createBasicLinkInternal( String internalDestination )
- {
- final Element basicLink = document.createElementNS( NS_XSLFO,
- "fo:basic-link" );
- basicLink.setAttribute( "internal-destination", internalDestination );
- return basicLink;
- }
-
- protected Element createBlock()
- {
- return document.createElementNS( NS_XSLFO, "fo:block" );
- }
-
- protected Element createExternalGraphic( String source )
- {
- Element result = document.createElementNS( NS_XSLFO,
- "fo:external-graphic" );
- result.setAttribute( "src", "url('" + source + "')" );
- return result;
- }
-
- protected Element createInline()
- {
- return document.createElementNS( NS_XSLFO, "fo:inline" );
- }
-
- protected Element createLeader()
- {
- return document.createElementNS( NS_XSLFO, "fo:leader" );
- }
-
- protected Element createListBlock()
- {
- return document.createElementNS( NS_XSLFO, "fo:list-block" );
- }
-
- protected Element createListItem()
- {
- return document.createElementNS( NS_XSLFO, "fo:list-item" );
- }
-
- protected Element createListItemBody()
- {
- return document.createElementNS( NS_XSLFO, "fo:list-item-body" );
- }
-
- protected Element createListItemLabel( String text )
- {
- Element result = document.createElementNS( NS_XSLFO,
- "fo:list-item-label" );
- Element block = createBlock();
- block.appendChild( document.createTextNode( text ) );
- result.appendChild( block );
- return result;
- }
-
- protected Element createTable()
- {
- return document.createElementNS( NS_XSLFO, "fo:table" );
- }
-
- protected Element createTableBody()
- {
- return document.createElementNS( NS_XSLFO, "fo:table-body" );
- }
-
- protected Element createTableCell()
- {
- return document.createElementNS( NS_XSLFO, "fo:table-cell" );
- }
-
- protected Element createTableHeader()
- {
- return document.createElementNS( NS_XSLFO, "fo:table-header" );
- }
-
- protected Element createTableRow()
- {
- return document.createElementNS( NS_XSLFO, "fo:table-row" );
- }
-
- protected Text createText( String data )
- {
- return document.createTextNode( data );
- }
-
- public Document getDocument()
- {
- return document;
- }
-
-}
--- /dev/null
+/* ====================================================================
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+==================================================================== */
+package org.apache.poi.hwpf.extractor;
+
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.poi.hwpf.HWPFDocument;
+import org.apache.poi.hwpf.HWPFDocumentCore;
+import org.apache.poi.hwpf.model.ListFormatOverride;
+import org.apache.poi.hwpf.model.ListTables;
+import org.apache.poi.hwpf.usermodel.CharacterRun;
+import org.apache.poi.hwpf.usermodel.Paragraph;
+import org.apache.poi.hwpf.usermodel.Picture;
+import org.apache.poi.hwpf.usermodel.Range;
+import org.apache.poi.hwpf.usermodel.Section;
+import org.apache.poi.hwpf.usermodel.Table;
+import org.apache.poi.hwpf.usermodel.TableIterator;
+import org.apache.poi.util.POILogFactory;
+import org.apache.poi.util.POILogger;
+import org.w3c.dom.Document;
+import org.w3c.dom.Element;
+
+public abstract class AbstractWordExtractor
+{
+ private static final byte BEL_MARK = 7;
+
+ private static final byte FIELD_BEGIN_MARK = 19;
+
+ private static final byte FIELD_END_MARK = 21;
+
+ private static final byte FIELD_SEPARATOR_MARK = 20;
+
+ private static final POILogger logger = POILogFactory
+ .getLogger( AbstractWordExtractor.class );
+
+ /**
+ * Returns the DOM document associated with this extractor.
+ * NOTE(review): presumably the output document being built by the
+ * subclass -- confirm against implementations.
+ */
+ public abstract Document getDocument();
+
+ /**
+ * Emits the text of one ordinary character run into {@code block}.
+ * Called by {@code processCharacters} for every run that is not a picture,
+ * field mark, special character, object or OLE2 run; {@code text} has
+ * already had a trailing "\r" (or a trailing BEL inside a table) removed.
+ */
+ protected abstract void outputCharacters( Element block,
+ CharacterRun characterRun, String text );
+
+ protected boolean processCharacters( HWPFDocumentCore hwpfDocument,
+ int currentTableLevel, Paragraph paragraph, final Element block,
+ List<CharacterRun> characterRuns, final int start, final int end )
+ {
+ boolean haveAnyText = false;
+
+ for ( int c = start; c < end; c++ )
+ {
+ CharacterRun characterRun = characterRuns.get( c );
+
+ if ( characterRun == null )
+ throw new AssertionError();
+
+ if ( hwpfDocument instanceof HWPFDocument
+ && ( (HWPFDocument) hwpfDocument ).getPicturesTable()
+ .hasPicture( characterRun ) )
+ {
+ HWPFDocument newFormat = (HWPFDocument) hwpfDocument;
+ Picture picture = newFormat.getPicturesTable().extractPicture(
+ characterRun, true );
+
+ processImage( block, characterRun.text().charAt( 0 ) == 0x01,
+ picture );
+ continue;
+ }
+
+ String text = characterRun.text();
+ if ( text.getBytes().length == 0 )
+ continue;
+
+ if ( text.getBytes()[0] == FIELD_BEGIN_MARK )
+ {
+ int skipTo = tryField( hwpfDocument, paragraph,
+ currentTableLevel, characterRuns, c, block );
+
+ if ( skipTo != c )
+ {
+ c = skipTo;
+ continue;
+ }
+
+ continue;
+ }
+ if ( text.getBytes()[0] == FIELD_SEPARATOR_MARK )
+ {
+ // shall not appear without FIELD_BEGIN_MARK
+ continue;
+ }
+ if ( text.getBytes()[0] == FIELD_END_MARK )
+ {
+ // shall not appear without FIELD_BEGIN_MARK
+ continue;
+ }
+
+ if ( characterRun.isSpecialCharacter() || characterRun.isObj()
+ || characterRun.isOle2() )
+ {
+ continue;
+ }
+
+ if ( text.endsWith( "\r" )
+ || ( text.charAt( text.length() - 1 ) == BEL_MARK && currentTableLevel != 0 ) )
+ text = text.substring( 0, text.length() - 1 );
+
+ outputCharacters( block, characterRun, text );
+
+ haveAnyText |= text.trim().length() != 0;
+ }
+
+ return haveAnyText;
+ }
+
+ public void processDocument( HWPFDocumentCore wordDocument )
+ {
+ final Range range = wordDocument.getRange();
+ for ( int s = 0; s < range.numSections(); s++ )
+ {
+ processSection( wordDocument, range.getSection( s ), s );
+ }
+ }
+
+    /**
+     * Matches the instruction text of a HYPERLINK field; group 1 is the
+     * quoted target. NOTE(review): the group is greedy, so an instruction
+     * containing several quoted parts is captured up to the last quote.
+     */
+    private static final Pattern HYPERLINK_FIELD_PATTERN = Pattern
+            .compile( "[ \\t\\r\\n]*HYPERLINK \"(.*)\"[ \\t\\r\\n]*" );
+
+    /**
+     * Matches the instruction text of a PAGEREF field with the \h switch;
+     * group 1 is the referenced name.
+     */
+    private static final Pattern PAGEREF_FIELD_PATTERN = Pattern
+            .compile( "[ \\t\\r\\n]*PAGEREF ([^ ]*)[ \\t\\r\\n]*\\\\h[ \\t\\r\\n]*" );
+
+    /**
+     * Handles one field whose begin, separator and end marks have already
+     * been located by {@link #tryField}.
+     * <p>
+     * The first non-null run after the begin mark is taken as the field
+     * instruction. HYPERLINK and PAGEREF instructions are dispatched to
+     * {@link #processHyperlink} and {@link #processPageref}; any other field
+     * is logged as unsupported and its value (the runs between separator and
+     * end mark) is output as plain characters.
+     *
+     * @param wordDocument
+     *            document being converted
+     * @param currentBlock
+     *            output element receiving the content
+     * @param paragraph
+     *            paragraph that owns the field
+     * @param currentTableLevel
+     *            nesting level of the enclosing table
+     * @param characterRuns
+     *            character runs handed on to the handlers
+     * @param beginMark
+     *            index of the run holding the field begin mark
+     * @param separatorMark
+     *            index of the run holding the field separator mark
+     * @param endMark
+     *            index of the run holding the field end mark
+     */
+    protected void processField( HWPFDocumentCore wordDocument,
+            Element currentBlock, Paragraph paragraph, int currentTableLevel,
+            List<CharacterRun> characterRuns, int beginMark, int separatorMark,
+            int endMark )
+    {
+        // find the first non-null run between the begin and separator marks
+        CharacterRun firstAfterBegin = null;
+        for ( int index = beginMark + 1; index < separatorMark
+                && firstAfterBegin == null; index++ )
+        {
+            firstAfterBegin = paragraph.getCharacterRun( index );
+            if ( firstAfterBegin == null )
+            {
+                logger.log( POILogger.WARN,
+                        "Paragraph " + paragraph.getStartOffset() + "--"
+                                + paragraph.getEndOffset()
+                                + " contains null CharacterRun #" + index );
+            }
+        }
+
+        if ( firstAfterBegin != null )
+        {
+            final String instruction = firstAfterBegin.text();
+
+            final Matcher hyperlinkMatcher = HYPERLINK_FIELD_PATTERN
+                    .matcher( instruction );
+            if ( hyperlinkMatcher.matches() )
+            {
+                String hyperlink = hyperlinkMatcher.group( 1 );
+                processHyperlink( wordDocument, currentBlock, paragraph,
+                        characterRuns, currentTableLevel, hyperlink,
+                        separatorMark + 1, endMark );
+                return;
+            }
+
+            final Matcher pagerefMatcher = PAGEREF_FIELD_PATTERN
+                    .matcher( instruction );
+            if ( pagerefMatcher.matches() )
+            {
+                String pageref = pagerefMatcher.group( 1 );
+                processPageref( wordDocument, currentBlock, paragraph,
+                        characterRuns, currentTableLevel, pageref,
+                        separatorMark + 1, endMark );
+                return;
+            }
+        }
+
+        StringBuilder debug = new StringBuilder( "Unsupported field type: \n" );
+        for ( int i = beginMark; i <= endMark; i++ )
+        {
+            debug.append( "\t" );
+            debug.append( paragraph.getCharacterRun( i ) );
+            debug.append( "\n" );
+        }
+        logger.log( POILogger.WARN, debug );
+
+        // just output field value
+        if ( separatorMark + 1 < endMark )
+            processCharacters( wordDocument, currentTableLevel, paragraph,
+                    currentBlock, characterRuns, separatorMark + 1, endMark );
+    }
+
+ /**
+ * Outputs a HYPERLINK field. Invoked by {@code processField} with
+ * {@code hyperlink} set to the quoted target taken from the field
+ * instruction, {@code i} set to the index of the first run after the field
+ * separator mark, and {@code endMark} set to the index of the field end
+ * mark.
+ */
+ protected abstract void processHyperlink( HWPFDocumentCore wordDocument,
+ Element currentBlock, Paragraph paragraph,
+ List<CharacterRun> characterRuns, int currentTableLevel,
+ String hyperlink, int i, int endMark );
+
+ /**
+ * Outputs a picture into {@code currentBlock}. Invoked by
+ * {@code processCharacters} for runs that the pictures table reports as
+ * pictures; {@code inlined} is true when the run text starts with the
+ * character 0x01.
+ */
+ protected abstract void processImage( Element currentBlock,
+ boolean inlined, Picture picture );
+
+ /**
+ * Outputs a PAGEREF field. Invoked by {@code processField} with
+ * {@code pageref} set to the name captured from the field instruction and
+ * the run index range [{@code beginTextInclusive},
+ * {@code endTextExclusive}) covering the runs between the field separator
+ * mark and the field end mark.
+ */
+ protected abstract void processPageref( HWPFDocumentCore wordDocument,
+ Element currentBlock, Paragraph paragraph,
+ List<CharacterRun> characterRuns, int currentTableLevel,
+ String pageref, int beginTextInclusive, int endTextExclusive );
+
+ /**
+ * Outputs a single paragraph. Invoked by {@code processSectionParagraphes}
+ * for every paragraph that is not part of a nested table;
+ * {@code bulletText} is the list label computed for list paragraphs and
+ * the empty string otherwise.
+ */
+ protected abstract void processParagraph( HWPFDocumentCore wordDocument,
+ Element parentFopElement, int currentTableLevel,
+ Paragraph paragraph, String bulletText );
+
+ /**
+ * Outputs one section. Invoked by {@code processDocument} with {@code s}
+ * set to the zero-based section index, and by {@code processSingleSection}
+ * with {@code s} fixed to 0.
+ */
+ protected abstract void processSection( HWPFDocumentCore wordDocument,
+ Section section, int s );
+
+ protected void processSectionParagraphes( HWPFDocumentCore wordDocument,
+ Element flow, Range range, int currentTableLevel )
+ {
+ final Map<Integer, Table> allTables = new HashMap<Integer, Table>();
+ for ( TableIterator tableIterator = AbstractWordUtils.newTableIterator(
+ range, currentTableLevel + 1 ); tableIterator.hasNext(); )
+ {
+ Table next = tableIterator.next();
+ allTables.put( Integer.valueOf( next.getStartOffset() ), next );
+ }
+
+ final ListTables listTables = wordDocument.getListTables();
+ int currentListInfo = 0;
+
+ final int paragraphs = range.numParagraphs();
+ for ( int p = 0; p < paragraphs; p++ )
+ {
+ Paragraph paragraph = range.getParagraph( p );
+
+ if ( allTables.containsKey( Integer.valueOf( paragraph
+ .getStartOffset() ) ) )
+ {
+ Table table = allTables.get( Integer.valueOf( paragraph
+ .getStartOffset() ) );
+ processTable( wordDocument, flow, table, currentTableLevel + 1 );
+ continue;
+ }
+
+ if ( paragraph.isInTable()
+ && paragraph.getTableLevel() != currentTableLevel )
+ {
+ continue;
+ }
+
+ if ( paragraph.getIlfo() != currentListInfo )
+ {
+ currentListInfo = paragraph.getIlfo();
+ }
+
+ if ( currentListInfo != 0 )
+ {
+ if ( listTables != null )
+ {
+ final ListFormatOverride listFormatOverride = listTables
+ .getOverride( paragraph.getIlfo() );
+
+ String label = AbstractWordUtils.getBulletText( listTables,
+ paragraph, listFormatOverride.getLsid() );
+
+ processParagraph( wordDocument, flow, currentTableLevel,
+ paragraph, label );
+ }
+ else
+ {
+ logger.log( POILogger.WARN,
+ "Paragraph #" + paragraph.getStartOffset() + "-"
+ + paragraph.getEndOffset()
+ + " has reference to list structure #"
+ + currentListInfo
+ + ", but listTables not defined in file" );
+
+ processParagraph( wordDocument, flow, currentTableLevel,
+ paragraph, AbstractWordUtils.EMPTY );
+ }
+ }
+ else
+ {
+ processParagraph( wordDocument, flow, currentTableLevel,
+ paragraph, AbstractWordUtils.EMPTY );
+ }
+ }
+
+ }
+
+ protected void processSingleSection( HWPFDocumentCore wordDocument,
+ Section section )
+ {
+ processSection( wordDocument, section, 0 );
+ }
+
+ /**
+ * Outputs a table into {@code flow}. Invoked by
+ * {@code processSectionParagraphes} when a paragraph starts at the table's
+ * start offset; {@code newTableLevel} is the caller's table level plus one.
+ */
+ protected abstract void processTable( HWPFDocumentCore wordDocument,
+ Element flow, Table table, int newTableLevel );
+
+ protected int tryField( HWPFDocumentCore wordDocument, Paragraph paragraph,
+ int currentTableLevel, List<CharacterRun> characterRuns,
+ int beginMark, Element currentBlock )
+ {
+ int separatorMark = -1;
+ int endMark = -1;
+ for ( int c = beginMark + 1; c < paragraph.numCharacterRuns(); c++ )
+ {
+ CharacterRun characterRun = paragraph.getCharacterRun( c );
+
+ String text = characterRun.text();
+ if ( text.getBytes().length == 0 )
+ continue;
+
+ if ( text.getBytes()[0] == FIELD_SEPARATOR_MARK )
+ {
+ if ( separatorMark != -1 )
+ {
+ // double;
+ return beginMark;
+ }
+
+ separatorMark = c;
+ continue;
+ }
+
+ if ( text.getBytes()[0] == FIELD_END_MARK )
+ {
+ if ( endMark != -1 )
+ {
+ // double;
+ return beginMark;
+ }
+
+ endMark = c;
+ break;
+ }
+
+ }
+
+ if ( separatorMark == -1 || endMark == -1 )
+ return beginMark;
+
+ processField( wordDocument, currentBlock, paragraph, currentTableLevel,
+ characterRuns, beginMark, separatorMark, endMark );
+
+ return endMark;
+ }
+
+}
--- /dev/null
+/* ====================================================================
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+==================================================================== */
+package org.apache.poi.hwpf.extractor;
+
+import java.io.Closeable;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.lang.reflect.Constructor;
+import java.lang.reflect.Field;
+import java.lang.reflect.Method;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.poi.hwpf.HWPFDocument;
+import org.apache.poi.hwpf.HWPFDocumentCore;
+import org.apache.poi.hwpf.HWPFOldDocument;
+import org.apache.poi.hwpf.OldWordFileFormatException;
+import org.apache.poi.hwpf.model.CHPX;
+import org.apache.poi.hwpf.model.ListLevel;
+import org.apache.poi.hwpf.model.ListTables;
+import org.apache.poi.hwpf.usermodel.BorderCode;
+import org.apache.poi.hwpf.usermodel.CharacterRun;
+import org.apache.poi.hwpf.usermodel.Paragraph;
+import org.apache.poi.hwpf.usermodel.Range;
+import org.apache.poi.hwpf.usermodel.Section;
+import org.apache.poi.hwpf.usermodel.SectionProperties;
+import org.apache.poi.hwpf.usermodel.TableIterator;
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;
+import org.apache.poi.util.POILogFactory;
+import org.apache.poi.util.POILogger;
+
+public class AbstractWordUtils
+{
+ static final String EMPTY = "";
+
+ private static final POILogger logger = POILogFactory
+ .getLogger( AbstractWordUtils.class );
+
+ public static final float TWIPS_PER_INCH = 1440.0f;
+ public static final int TWIPS_PER_PT = 20;
+
+    /**
+     * Closes {@code closeable}, logging instead of propagating any failure.
+     * A {@code null} argument is ignored.
+     *
+     * @param closeable
+     *            resource to close, may be {@code null}
+     */
+    static void closeQuietly( final Closeable closeable )
+    {
+        // nothing to close; avoids logging a spurious NullPointerException
+        if ( closeable == null )
+            return;
+
+        try
+        {
+            closeable.close();
+        }
+        catch ( Exception exc )
+        {
+            logger.log( POILogger.ERROR, "Unable to close resource: " + exc,
+                    exc );
+        }
+    }
+
+    /**
+     * Null-safe string comparison: two {@code null} references are equal, a
+     * {@code null} and a non-null reference are not, otherwise
+     * {@link String#equals(Object)} decides.
+     */
+    static boolean equals( String str1, String str2 )
+    {
+        if ( str1 == null )
+        {
+            return str2 == null;
+        }
+        return str1.equals( str2 );
+    }
+
+ // XXX incorporate into Range
+    /**
+     * Collects a character run for every non-null CHPX of {@code range}
+     * whose [start, end] interval touches or overlaps the range's own
+     * [start offset, end offset] interval. CHPX entries for which no
+     * character run can be obtained are left out.
+     *
+     * @param range
+     *            range to inspect
+     * @return the matching character runs, in CHPX order
+     */
+    static List<CharacterRun> findCharacterRuns( Range range )
+    {
+        final int rangeStart = range.getStartOffset();
+        final int rangeEnd = range.getEndOffset();
+
+        final List<CharacterRun> runs = new ArrayList<CharacterRun>();
+        for ( CHPX chpx : getCharacters( range ) )
+        {
+            if ( chpx == null )
+                continue;
+
+            final int overlapStart = Math.max( rangeStart, chpx.getStart() );
+            final int overlapEnd = Math.min( rangeEnd, chpx.getEnd() );
+            if ( overlapStart > overlapEnd )
+                continue;
+
+            final CharacterRun run = getCharacterRun( range, chpx );
+            if ( run != null )
+                runs.add( run );
+        }
+
+        return runs;
+    }
+
+    /**
+     * Maps the border type code of {@code borderCode} to a border-style
+     * keyword.
+     * <p>
+     * Every returned value is one of the border-style keywords
+     * ({@code solid}, {@code double}, {@code dotted}, {@code dashed},
+     * {@code ridge}, {@code groove}); unknown codes map to {@code solid}.
+     *
+     * @param borderCode
+     *            border description, must not be {@code null}
+     * @return border-style keyword
+     * @throws IllegalArgumentException
+     *             if {@code borderCode} is {@code null}
+     */
+    public static String getBorderType( BorderCode borderCode )
+    {
+        if ( borderCode == null )
+            throw new IllegalArgumentException( "borderCode is null" );
+
+        switch ( borderCode.getBorderType() )
+        {
+        case 1:
+        case 2:
+            return "solid";
+        case 3:
+            return "double";
+        case 5:
+            return "solid";
+        case 6:
+            return "dotted";
+        case 7:
+        case 8:
+            return "dashed";
+        case 9:
+            return "dotted";
+        case 10:
+        case 11:
+        case 12:
+        case 13:
+        case 14:
+        case 15:
+        case 16:
+        case 17:
+        case 18:
+        case 19:
+            return "double";
+        case 20:
+            return "solid";
+        case 21:
+            return "double";
+        case 22:
+            return "dashed";
+        case 23:
+            return "dashed";
+        case 24:
+            return "ridge";
+        case 25:
+            // the border-style keyword is "groove"; "grooved" is not a
+            // valid value
+            return "groove";
+        default:
+            return "solid";
+        }
+    }
+
+    /**
+     * Formats the line width of {@code borderCode} as a length in points.
+     * The line width is split into whole multiples of 8 and a remainder;
+     * the remainder is written after the decimal point as a multiple of 125.
+     *
+     * @param borderCode
+     *            border description
+     * @return the width, e.g. {@code "1.500pt"} for a line width of 12
+     */
+    public static String getBorderWidth( BorderCode borderCode )
+    {
+        final int eighths = borderCode.getLineWidth();
+        final int wholePoints = eighths / 8;
+        final int remainder = eighths % 8;
+
+        // 1000 / 8 == 125 thousandths of a point per remaining unit
+        return wholePoints + "." + ( 125 * remainder ) + "pt";
+    }
+
+ /**
+ * Builds the label (bullet or number text) for a list paragraph.
+ * <p>
+ * The number text of the paragraph's list level is copied character by
+ * character; characters with a value below 9 are treated as level indexes
+ * and replaced by the formatted current number of that level. A trailing
+ * tab or space is appended depending on the level's "follow" setting.
+ * <p>
+ * Side effect: when a placeholder refers to the paragraph's own level,
+ * that level's start-at value is incremented, so consecutive calls for the
+ * same level produce consecutive numbers.
+ *
+ * @param listTables
+ * list tables of the document
+ * @param paragraph
+ * list paragraph; its ilvl selects the list level
+ * @param listId
+ * list identifier passed to {@code listTables.getLevel}
+ * @return the label text, or the empty string if the level has no number
+ * text
+ */
+ public static String getBulletText( ListTables listTables,
+ Paragraph paragraph, int listId )
+ {
+ final ListLevel listLevel = listTables.getLevel( listId,
+ paragraph.getIlvl() );
+
+ if ( listLevel.getNumberText() == null )
+ return EMPTY;
+
+ StringBuffer bulletBuffer = new StringBuffer();
+ char[] xst = listLevel.getNumberText().toCharArray();
+ for ( char element : xst )
+ {
+ // values below 9 are level placeholders, everything else is literal
+ if ( element < 9 )
+ {
+ ListLevel numLevel = listTables.getLevel( listId, element );
+
+ int num = numLevel.getStartAt();
+ // NOTE(review): the number is formatted with the paragraph level's
+ // number format, not numLevel's -- confirm this is intended
+ bulletBuffer.append( NumberFormatter.getNumber( num,
+ listLevel.getNumberFormat() ) );
+
+ // advance the counter only for the paragraph's own level
+ if ( numLevel == listLevel )
+ {
+ numLevel.setStartAt( numLevel.getStartAt() + 1 );
+ }
+
+ }
+ else
+ {
+ bulletBuffer.append( element );
+ }
+ }
+
+ // 0 appends a tab after the label, 1 a space, any other value nothing
+ byte follow = getIxchFollow( listLevel );
+ switch ( follow )
+ {
+ case 0:
+ bulletBuffer.append( "\t" );
+ break;
+ case 1:
+ bulletBuffer.append( " " );
+ break;
+ default:
+ break;
+ }
+
+ return bulletBuffer.toString();
+ }
+
+ /**
+ * Invokes the non-public {@code Range.getCharacterRun(CHPX)} method on
+ * {@code range} through reflection.
+ *
+ * @return the value returned by the reflective call
+ * @throws Error
+ * wrapping any exception raised by the lookup or the invocation
+ */
+ private static CharacterRun getCharacterRun( Range range, CHPX chpx )
+ {
+ try
+ {
+ Method method = Range.class.getDeclaredMethod( "getCharacterRun",
+ CHPX.class );
+ // bypass the access check on the declared method
+ method.setAccessible( true );
+ return (CharacterRun) method.invoke( range, chpx );
+ }
+ catch ( Exception exc )
+ {
+ throw new Error( exc );
+ }
+ }
+
+    /**
+     * Reads the non-public {@code _characters} field of {@code range}
+     * through reflection.
+     *
+     * @return the field value, cast to a list of CHPX
+     * @throws Error
+     *             wrapping any exception raised by the reflective access
+     */
+    private static List<CHPX> getCharacters( Range range )
+    {
+        try
+        {
+            Field field = Range.class.getDeclaredField( "_characters" );
+            // bypass the access check on the declared field
+            field.setAccessible( true );
+
+            // The element type cannot be verified at runtime; this code
+            // relies on Range declaring _characters as a list of CHPX.
+            @SuppressWarnings( "unchecked" )
+            List<CHPX> characters = (List<CHPX>) field.get( range );
+            return characters;
+        }
+        catch ( Exception exc )
+        {
+            throw new Error( exc );
+        }
+    }
+
+    /**
+     * Maps a colour index in the range 1..16 to a colour name; any other
+     * index yields {@code "black"}.
+     *
+     * @param ico
+     *            colour index
+     * @return colour name
+     */
+    public static String getColor( int ico )
+    {
+        // names for the indexes 1..16, in order
+        final String[] names = { "black", "blue", "cyan", "green", "magenta",
+                "red", "yellow", "white", "darkblue", "darkcyan",
+                "darkgreen", "darkmagenta", "darkred", "darkyellow",
+                "darkgray", "lightgray" };
+
+        if ( ico < 1 || ico > names.length )
+        {
+            return "black";
+        }
+        return names[ico - 1];
+    }
+
+ /**
+ * Reads the non-public {@code _ixchFollow} field of {@code listLevel}
+ * through reflection and returns it as a primitive byte.
+ *
+ * @throws Error
+ * wrapping any exception raised by the reflective access
+ */
+ public static byte getIxchFollow( ListLevel listLevel )
+ {
+ try
+ {
+ Field field = ListLevel.class.getDeclaredField( "_ixchFollow" );
+ // bypass the access check on the declared field
+ field.setAccessible( true );
+ return ( (Byte) field.get( listLevel ) ).byteValue();
+ }
+ catch ( Exception exc )
+ {
+ throw new Error( exc );
+ }
+ }
+
+    /**
+     * Maps a justification code in the range 0..9 to a text alignment
+     * keyword; any other code yields the empty string.
+     *
+     * @param js
+     *            justification code
+     * @return alignment keyword, or {@code ""} for an unknown code
+     */
+    public static String getJustification( int js )
+    {
+        // keywords for the codes 0..9, in order
+        final String[] alignments = { "start", "center", "end", "justify",
+                "justify", "center", "left", "start", "end", "justify" };
+
+        if ( js >= 0 && js < alignments.length )
+        {
+            return alignments[js];
+        }
+        return "";
+    }
+
+    /**
+     * Returns the label for a numbered list item. Only format 0 is
+     * implemented; for every other format a warning is logged and the plain
+     * decimal number is returned as well.
+     *
+     * @param number
+     *            item number
+     * @param format
+     *            number format code
+     * @return {@code number} as a decimal string
+     */
+    public static String getListItemNumberLabel( int number, int format )
+    {
+        // report through the class logger like the rest of this class,
+        // instead of writing to System.err
+        if ( format != 0 )
+            logger.log( POILogger.WARN, "NYI: getListItemNumberLabel(): "
+                    + format );
+
+        return String.valueOf( number );
+    }
+
+ /**
+ * Reads the non-public {@code _props} field of {@code section} through
+ * reflection.
+ *
+ * @return the field value, cast to SectionProperties
+ * @throws Error
+ * wrapping any exception raised by the reflective access
+ */
+ public static SectionProperties getSectionProperties( Section section )
+ {
+ try
+ {
+ Field field = Section.class.getDeclaredField( "_props" );
+ // bypass the access check on the declared field
+ field.setAccessible( true );
+ return (SectionProperties) field.get( section );
+ }
+ catch ( Exception exc )
+ {
+ throw new Error( exc );
+ }
+ }
+
+    /**
+     * Tells whether {@code str} is {@code null} or has zero length.
+     */
+    static boolean isEmpty( String str )
+    {
+        if ( str == null )
+        {
+            return true;
+        }
+        return str.length() == 0;
+    }
+
+    /**
+     * Tells whether {@code str} is non-null and contains at least one
+     * character.
+     */
+    static boolean isNotEmpty( String str )
+    {
+        return str != null && str.length() != 0;
+    }
+
+ /**
+ * Loads a Word document from {@code docFile} by opening a stream on the
+ * file and delegating to {@code loadDoc(InputStream)}. The stream is
+ * closed via {@code closeQuietly} whether or not loading succeeds.
+ *
+ * @param docFile
+ * file to read
+ * @return the loaded document
+ * @throws IOException
+ * if the file cannot be opened or read
+ */
+ public static HWPFDocumentCore loadDoc( File docFile ) throws IOException
+ {
+ final FileInputStream istream = new FileInputStream( docFile );
+ try
+ {
+ return loadDoc( istream );
+ }
+ finally
+ {
+ closeQuietly( istream );
+ }
+ }
+
+ /**
+ * Loads a Word document from {@code inputStream}. The POIFS file system is
+ * built with {@code HWPFDocumentCore.verifyAndBuildPOIFS}; it is first
+ * opened as an {@code HWPFDocument} and, if that fails with
+ * {@code OldWordFileFormatException}, as an {@code HWPFOldDocument}.
+ * This method does not itself close {@code inputStream}.
+ *
+ * @param inputStream
+ * stream to read the document from
+ * @return the loaded document
+ * @throws IOException
+ * if reading the stream fails
+ */
+ public static HWPFDocumentCore loadDoc( InputStream inputStream )
+ throws IOException
+ {
+ final POIFSFileSystem poifsFileSystem = HWPFDocumentCore
+ .verifyAndBuildPOIFS( inputStream );
+ try
+ {
+ return new HWPFDocument( poifsFileSystem );
+ }
+ catch ( OldWordFileFormatException exc )
+ {
+ // fall back to the reader for the old Word file format
+ return new HWPFOldDocument( poifsFileSystem );
+ }
+ }
+
+ /**
+ * Creates a {@code TableIterator} through its non-public
+ * {@code (Range, int)} constructor using reflection.
+ *
+ * @param range
+ * first constructor argument
+ * @param level
+ * second constructor argument; callers in this package pass a
+ * table nesting level
+ * @return the new iterator
+ * @throws Error
+ * wrapping any exception raised by the reflective call
+ */
+ public static TableIterator newTableIterator( Range range, int level )
+ {
+ try
+ {
+ Constructor<TableIterator> constructor = TableIterator.class
+ .getDeclaredConstructor( Range.class, int.class );
+ // bypass the access check on the declared constructor
+ constructor.setAccessible( true );
+ return constructor.newInstance( range, Integer.valueOf( level ) );
+ }
+ catch ( Exception exc )
+ {
+ throw new Error( exc );
+ }
+ }
+
+}
--- /dev/null
+/* ====================================================================
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+==================================================================== */
+package org.apache.poi.hwpf.extractor;
+
+import org.w3c.dom.Document;
+import org.w3c.dom.Element;
+import org.w3c.dom.Text;
+
+/**
+ * Thin helper around a DOM {@link Document} for building an XSL-FO tree.
+ * <p>
+ * The constructor creates the {@code fo:root} element and its
+ * {@code fo:layout-master-set} child. The {@code add*} methods create an
+ * element and attach it to a parent; the {@code create*} methods only create
+ * a detached node. All elements are created in the XSL-FO namespace with the
+ * {@code fo:} prefix. Every factory method is public so that the facade can
+ * be used by delegation.
+ */
+public class FoDocumentFacade
+{
+    private static final String NS_XSLFO = "http://www.w3.org/1999/XSL/Format";
+
+    protected final Document document;
+    protected final Element layoutMasterSet;
+    protected final Element root;
+
+    /**
+     * Creates the facade and appends {@code fo:root} (containing an empty
+     * {@code fo:layout-master-set}) to {@code document}.
+     *
+     * @param document
+     *            DOM document that receives the XSL-FO tree
+     */
+    public FoDocumentFacade( Document document )
+    {
+        this.document = document;
+
+        root = document.createElementNS( NS_XSLFO, "fo:root" );
+        document.appendChild( root );
+
+        layoutMasterSet = document.createElementNS( NS_XSLFO,
+                "fo:layout-master-set" );
+        root.appendChild( layoutMasterSet );
+    }
+
+    /**
+     * Appends a new {@code fo:flow} with the given {@code flow-name} to
+     * {@code pageSequence} and returns it.
+     */
+    public Element addFlowToPageSequence( final Element pageSequence,
+            String flowName )
+    {
+        final Element flow = document.createElementNS( NS_XSLFO, "fo:flow" );
+        flow.setAttribute( "flow-name", flowName );
+        pageSequence.appendChild( flow );
+
+        return flow;
+    }
+
+    /** Appends a new {@code fo:list-item} to {@code listBlock}. */
+    public Element addListItem( Element listBlock )
+    {
+        Element result = createListItem();
+        listBlock.appendChild( result );
+        return result;
+    }
+
+    /** Appends a new {@code fo:list-item-body} to {@code listItem}. */
+    public Element addListItemBody( Element listItem )
+    {
+        Element result = createListItemBody();
+        listItem.appendChild( result );
+        return result;
+    }
+
+    /**
+     * Appends a new {@code fo:list-item-label} showing {@code text} to
+     * {@code listItem}.
+     */
+    public Element addListItemLabel( Element listItem, String text )
+    {
+        Element result = createListItemLabel( text );
+        listItem.appendChild( result );
+        return result;
+    }
+
+    /**
+     * Appends a new {@code fo:page-sequence} referring to the page master
+     * {@code pageMaster} to the root element.
+     */
+    public Element addPageSequence( String pageMaster )
+    {
+        final Element pageSequence = document.createElementNS( NS_XSLFO,
+                "fo:page-sequence" );
+        pageSequence.setAttribute( "master-reference", pageMaster );
+        root.appendChild( pageSequence );
+        return pageSequence;
+    }
+
+    /** Appends a new {@code fo:region-body} to {@code pageMaster}. */
+    public Element addRegionBody( Element pageMaster )
+    {
+        final Element regionBody = document.createElementNS( NS_XSLFO,
+                "fo:region-body" );
+        pageMaster.appendChild( regionBody );
+
+        return regionBody;
+    }
+
+    /**
+     * Appends a new {@code fo:simple-page-master} named {@code masterName}
+     * to the layout master set.
+     */
+    public Element addSimplePageMaster( String masterName )
+    {
+        final Element simplePageMaster = document.createElementNS( NS_XSLFO,
+                "fo:simple-page-master" );
+        simplePageMaster.setAttribute( "master-name", masterName );
+        layoutMasterSet.appendChild( simplePageMaster );
+
+        return simplePageMaster;
+    }
+
+    /**
+     * Creates a detached {@code fo:basic-link} whose
+     * {@code external-destination} is {@code externalDestination}.
+     */
+    public Element createBasicLinkExternal( String externalDestination )
+    {
+        final Element basicLink = document.createElementNS( NS_XSLFO,
+                "fo:basic-link" );
+        basicLink.setAttribute( "external-destination", externalDestination );
+        return basicLink;
+    }
+
+    /**
+     * Creates a detached {@code fo:basic-link} whose
+     * {@code internal-destination} is {@code internalDestination}.
+     */
+    public Element createBasicLinkInternal( String internalDestination )
+    {
+        final Element basicLink = document.createElementNS( NS_XSLFO,
+                "fo:basic-link" );
+        basicLink.setAttribute( "internal-destination", internalDestination );
+        return basicLink;
+    }
+
+    /** Creates a detached {@code fo:block}. */
+    public Element createBlock()
+    {
+        return document.createElementNS( NS_XSLFO, "fo:block" );
+    }
+
+    /**
+     * Creates a detached {@code fo:external-graphic} whose {@code src} is
+     * {@code url('source')}. The source is inserted verbatim, without
+     * escaping.
+     */
+    public Element createExternalGraphic( String source )
+    {
+        Element result = document.createElementNS( NS_XSLFO,
+                "fo:external-graphic" );
+        result.setAttribute( "src", "url('" + source + "')" );
+        return result;
+    }
+
+    /** Creates a detached {@code fo:inline}. */
+    public Element createInline()
+    {
+        return document.createElementNS( NS_XSLFO, "fo:inline" );
+    }
+
+    /** Creates a detached {@code fo:leader}. */
+    public Element createLeader()
+    {
+        return document.createElementNS( NS_XSLFO, "fo:leader" );
+    }
+
+    /** Creates a detached {@code fo:list-block}. */
+    public Element createListBlock()
+    {
+        return document.createElementNS( NS_XSLFO, "fo:list-block" );
+    }
+
+    /** Creates a detached {@code fo:list-item}. */
+    public Element createListItem()
+    {
+        return document.createElementNS( NS_XSLFO, "fo:list-item" );
+    }
+
+    /** Creates a detached {@code fo:list-item-body}. */
+    public Element createListItemBody()
+    {
+        return document.createElementNS( NS_XSLFO, "fo:list-item-body" );
+    }
+
+    /**
+     * Creates a detached {@code fo:list-item-label} that contains one
+     * {@code fo:block} holding {@code text}.
+     */
+    public Element createListItemLabel( String text )
+    {
+        Element result = document.createElementNS( NS_XSLFO,
+                "fo:list-item-label" );
+        Element block = createBlock();
+        block.appendChild( document.createTextNode( text ) );
+        result.appendChild( block );
+        return result;
+    }
+
+    /** Creates a detached {@code fo:table}. */
+    public Element createTable()
+    {
+        return document.createElementNS( NS_XSLFO, "fo:table" );
+    }
+
+    /** Creates a detached {@code fo:table-body}. */
+    public Element createTableBody()
+    {
+        return document.createElementNS( NS_XSLFO, "fo:table-body" );
+    }
+
+    /** Creates a detached {@code fo:table-cell}. */
+    public Element createTableCell()
+    {
+        return document.createElementNS( NS_XSLFO, "fo:table-cell" );
+    }
+
+    /** Creates a detached {@code fo:table-header}. */
+    public Element createTableHeader()
+    {
+        return document.createElementNS( NS_XSLFO, "fo:table-header" );
+    }
+
+    /** Creates a detached {@code fo:table-row}. */
+    public Element createTableRow()
+    {
+        return document.createElementNS( NS_XSLFO, "fo:table-row" );
+    }
+
+    /** Creates a detached text node holding {@code data}. */
+    public Text createText( String data )
+    {
+        return document.createTextNode( data );
+    }
+
+    /** Returns the DOM document passed to the constructor. */
+    public Document getDocument()
+    {
+        return document;
+    }
+
+}
--- /dev/null
+/* ====================================================================
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+==================================================================== */
+package org.apache.poi.hwpf.extractor;
+
+import org.w3c.dom.Document;
+import org.w3c.dom.Element;
+import org.w3c.dom.Text;
+
+public class HtmlDocumentFacade
+{
+
+ protected final Element body;
+ protected final Document document;
+ protected final Element head;
+ protected final Element html;
+
+ public HtmlDocumentFacade( Document document )
+ {
+ this.document = document;
+
+ html = document.createElement( "html" );
+ document.appendChild( html );
+
+ body = document.createElement( "body" );
+ head = document.createElement( "head" );
+
+ html.appendChild( head );
+ html.appendChild( body );
+ }
+
+ public Element createHyperlink( String internalDestination )
+ {
+ final Element basicLink = document.createElement( "a" );
+ basicLink.setAttribute( "href", internalDestination );
+ return basicLink;
+ }
+
+ public Element createListItem()
+ {
+ return document.createElement( "li" );
+ }
+
+ public Element createParagraph()
+ {
+ return document.createElement( "p" );
+ }
+
+ public Element createTable()
+ {
+ return document.createElement( "table" );
+ }
+
+ public Element createTableBody()
+ {
+ return document.createElement( "tbody" );
+ }
+
+ public Element createTableCell()
+ {
+ return document.createElement( "td" );
+ }
+
+ public Element createTableHeader()
+ {
+ return document.createElement( "thead" );
+ }
+
+ public Element createTableHeaderCell()
+ {
+ return document.createElement( "th" );
+ }
+
+ public Element createTableRow()
+ {
+ return document.createElement( "tr" );
+ }
+
+ public Text createText( String data )
+ {
+ return document.createTextNode( data );
+ }
+
+ public Element createUnorderedList()
+ {
+ return document.createElement( "ul" );
+ }
+
+ public Document getDocument()
+ {
+ return document;
+ }
+
+}
-/*
- * ====================================================================
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- * ====================================================================
- */
+/* ====================================================================
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+==================================================================== */
package org.apache.poi.hwpf.extractor;
import java.io.File;
-import java.io.FileInputStream;
import java.io.FileWriter;
-import java.io.IOException;
import java.util.HashMap;
+import java.util.List;
import java.util.Map;
import java.util.Stack;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.stream.StreamResult;
import org.apache.poi.hwpf.HWPFDocument;
+import org.apache.poi.hwpf.HWPFDocumentCore;
import org.apache.poi.hwpf.model.ListFormatOverride;
import org.apache.poi.hwpf.model.ListTables;
+import org.apache.poi.hwpf.usermodel.BorderCode;
import org.apache.poi.hwpf.usermodel.CharacterRun;
import org.apache.poi.hwpf.usermodel.Paragraph;
import org.apache.poi.hwpf.usermodel.Picture;
import org.w3c.dom.Element;
import org.w3c.dom.Text;
-import static org.apache.poi.hwpf.extractor.WordToFoUtils.TWIPS_PER_INCH;
-
/**
* @author Sergey Vladimirov (vlsergey {at} gmail {dot} com)
*/
-public class WordToFoExtractor extends AbstractToFoExtractor
+public class WordToFoExtractor extends AbstractWordExtractor
{
/**
}
}
- private static final byte BEL_MARK = 7;
-
- private static final byte FIELD_BEGIN_MARK = 19;
-
- private static final byte FIELD_END_MARK = 21;
-
- private static final byte FIELD_SEPARATOR_MARK = 20;
-
private static final POILogger logger = POILogFactory
.getLogger( WordToFoExtractor.class );
- private static HWPFDocument loadDoc( File docFile ) throws IOException
+ /**
+ * Maps the numeric border type of a {@link BorderCode} to a border-style
+ * keyword. Border types without an explicit case fall back to
+ * {@code "solid"}.
+ *
+ * @param borderCode
+ * border description to translate; must not be {@code null}
+ * @return one of {@code "solid"}, {@code "double"}, {@code "dotted"},
+ * {@code "dashed"}, {@code "ridge"} or {@code "groove"}
+ * @throws IllegalArgumentException
+ * if {@code borderCode} is {@code null}
+ */
+ public static String getBorderType( BorderCode borderCode )
{
- final FileInputStream istream = new FileInputStream( docFile );
- try
+ if ( borderCode == null )
+ throw new IllegalArgumentException( "borderCode is null" );
+
+ switch ( borderCode.getBorderType() )
{
- return new HWPFDocument( istream );
- }
- finally
- {
- try
- {
- istream.close();
- }
- catch ( Exception exc )
- {
- logger.log( POILogger.ERROR,
- "Unable to close FileInputStream: " + exc, exc );
- }
+ case 1:
+ case 2:
+ return "solid";
+ case 3:
+ return "double";
+ case 5:
+ return "solid";
+ case 6:
+ return "dotted";
+ case 7:
+ case 8:
+ return "dashed";
+ case 9:
+ return "dotted";
+ case 10:
+ case 11:
+ case 12:
+ case 13:
+ case 14:
+ case 15:
+ case 16:
+ case 17:
+ case 18:
+ case 19:
+ return "double";
+ case 20:
+ return "solid";
+ case 21:
+ return "double";
+ case 22:
+ return "dashed";
+ case 23:
+ return "dashed";
+ case 24:
+ return "ridge";
+ case 25:
+ // the border-style keyword is "groove"; "grooved" is not a valid value
+ return "groove";
+ default:
+ return "solid";
}
}
static Document process( File docFile ) throws Exception
{
- final HWPFDocument hwpfDocument = loadDoc( docFile );
+ final HWPFDocumentCore hwpfDocument = WordToFoUtils.loadDoc( docFile );
WordToFoExtractor wordToFoExtractor = new WordToFoExtractor(
DocumentBuilderFactory.newInstance().newDocumentBuilder()
.newDocument() );
private final Stack<BlockProperies> blocksProperies = new Stack<BlockProperies>();
+ protected final FoDocumentFacade foDocumentFacade;
+
/**
* Creates new instance of {@link WordToFoExtractor}. Can be used for output
* several {@link HWPFDocument}s into single FO document.
*/
public WordToFoExtractor( Document document )
{
- super( document );
+ // The target document is no longer handed to a superclass constructor;
+ // it is wrapped in a facade that the rest of this class uses to build nodes.
+ this.foDocumentFacade = new FoDocumentFacade( document );
}
protected String createPageMaster( SectionProperties sep, String type,
int section )
{
- float height = sep.getYaPage() / TWIPS_PER_INCH;
- float width = sep.getXaPage() / TWIPS_PER_INCH;
- float leftMargin = sep.getDxaLeft() / TWIPS_PER_INCH;
- float rightMargin = sep.getDxaRight() / TWIPS_PER_INCH;
- float topMargin = sep.getDyaTop() / TWIPS_PER_INCH;
- float bottomMargin = sep.getDyaBottom() / TWIPS_PER_INCH;
+ float height = sep.getYaPage() / WordToFoUtils.TWIPS_PER_INCH;
+ float width = sep.getXaPage() / WordToFoUtils.TWIPS_PER_INCH;
+ float leftMargin = sep.getDxaLeft() / WordToFoUtils.TWIPS_PER_INCH;
+ float rightMargin = sep.getDxaRight() / WordToFoUtils.TWIPS_PER_INCH;
+ float topMargin = sep.getDyaTop() / WordToFoUtils.TWIPS_PER_INCH;
+ float bottomMargin = sep.getDyaBottom() / WordToFoUtils.TWIPS_PER_INCH;
// add these to the header
String pageMasterName = type + "-page" + section;
- Element pageMaster = addSimplePageMaster( pageMasterName );
+ Element pageMaster = foDocumentFacade
+ .addSimplePageMaster( pageMasterName );
pageMaster.setAttribute( "page-height", height + "in" );
pageMaster.setAttribute( "page-width", width + "in" );
- Element regionBody = addRegionBody( pageMaster );
+ Element regionBody = foDocumentFacade.addRegionBody( pageMaster );
regionBody.setAttribute( "margin", topMargin + "in " + rightMargin
+ "in " + bottomMargin + "in " + leftMargin + "in" );
if ( sep.getCcolM1() > 0 )
{
- regionBody
- .setAttribute( "column-count", "" + (sep.getCcolM1() + 1) );
+ regionBody.setAttribute( "column-count", ""
+ + ( sep.getCcolM1() + 1 ) );
if ( sep.getFEvenlySpaced() )
{
regionBody.setAttribute( "column-gap",
- (sep.getDxaColumns() / TWIPS_PER_INCH) + "in" );
+ ( sep.getDxaColumns() / WordToFoUtils.TWIPS_PER_INCH )
+ + "in" );
}
else
{
return pageMasterName;
}
- protected boolean processCharacters( HWPFDocument hwpfDocument,
- int currentTableLevel, Paragraph paragraph, final Element block,
- final int start, final int end )
+ /**
+ * Returns the document held by {@link #foDocumentFacade}.
+ *
+ * @return the result of {@code foDocumentFacade.getDocument()}
+ */
+ public Document getDocument()
{
- boolean haveAnyText = false;
-
- for ( int c = start; c < end; c++ )
- {
- CharacterRun characterRun = paragraph.getCharacterRun( c );
-
- if ( hwpfDocument.getPicturesTable().hasPicture( characterRun ) )
- {
- Picture picture = hwpfDocument.getPicturesTable()
- .extractPicture( characterRun, true );
-
- processImage( block, characterRun.text().charAt( 0 ) == 0x01,
- picture );
- continue;
- }
-
- String text = characterRun.text();
- if ( text.getBytes().length == 0 )
- continue;
-
- if ( text.getBytes()[0] == FIELD_BEGIN_MARK )
- {
- int skipTo = tryField( hwpfDocument, paragraph,
- currentTableLevel, c, block );
-
- if ( skipTo != c )
- {
- c = skipTo;
- continue;
- }
-
- continue;
- }
- if ( text.getBytes()[0] == FIELD_SEPARATOR_MARK )
- {
- // shall not appear without FIELD_BEGIN_MARK
- continue;
- }
- if ( text.getBytes()[0] == FIELD_END_MARK )
- {
- // shall not appear without FIELD_BEGIN_MARK
- continue;
- }
-
- if ( characterRun.isSpecialCharacter() || characterRun.isObj()
- || characterRun.isOle2() )
- {
- continue;
- }
-
- BlockProperies blockProperies = this.blocksProperies.peek();
- Element inline = createInline();
- if ( characterRun.isBold() != blockProperies.pBold )
- {
- WordToFoUtils.setBold( inline, characterRun.isBold() );
- }
- if ( characterRun.isItalic() != blockProperies.pItalic )
- {
- WordToFoUtils.setItalic( inline, characterRun.isItalic() );
- }
- if ( !WordToFoUtils.equals( characterRun.getFontName(),
- blockProperies.pFontName ) )
- {
- WordToFoUtils
- .setFontFamily( inline, characterRun.getFontName() );
- }
- if ( characterRun.getFontSize() / 2 != blockProperies.pFontSize )
- {
- WordToFoUtils.setFontSize( inline,
- characterRun.getFontSize() / 2 );
- }
- WordToFoUtils.setCharactersProperties( characterRun, inline );
- block.appendChild( inline );
-
- if ( text.endsWith( "\r" )
- || (text.charAt( text.length() - 1 ) == BEL_MARK && currentTableLevel != 0) )
- text = text.substring( 0, text.length() - 1 );
-
- Text textNode = createText( text );
- inline.appendChild( textNode );
-
- haveAnyText |= text.trim().length() != 0;
- }
-
- return haveAnyText;
+ return foDocumentFacade.getDocument();
}
- public void processDocument( HWPFDocument hwpfDocument )
+ /**
+ * Appends {@code text} to {@code block} wrapped in a new inline element
+ * that carries the formatting of {@code characterRun}.
+ *
+ * @param block
+ * element that receives the new inline element
+ * @param characterRun
+ * run whose bold, italic, font and other character properties
+ * are copied onto the inline element
+ * @param text
+ * text placed inside the inline element
+ */
+ @Override
+ protected void outputCharacters( Element block, CharacterRun characterRun,
+ String text )
{
- final Range range = hwpfDocument.getRange();
-
- for ( int s = 0; s < range.numSections(); s++ )
+ // Bold, italic, font name and font size are written only when they
+ // differ from the properties on top of the blocksProperies stack.
+ BlockProperies blockProperies = this.blocksProperies.peek();
+ Element inline = foDocumentFacade.createInline();
+ if ( characterRun.isBold() != blockProperies.pBold )
{
- processSection( hwpfDocument, range.getSection( s ), s );
+ WordToFoUtils.setBold( inline, characterRun.isBold() );
}
- }
-
- protected void processField( HWPFDocument hwpfDocument,
- Element currentBlock, Paragraph paragraph, int currentTableLevel,
- int beginMark, int separatorMark, int endMark )
- {
-
- Pattern hyperlinkPattern = Pattern
- .compile( "[ \\t\\r\\n]*HYPERLINK \"(.*)\"[ \\t\\r\\n]*" );
- Pattern pagerefPattern = Pattern
- .compile( "[ \\t\\r\\n]*PAGEREF ([^ ]*)[ \\t\\r\\n]*\\\\h[ \\t\\r\\n]*" );
-
- if ( separatorMark - beginMark > 1 )
+ if ( characterRun.isItalic() != blockProperies.pItalic )
{
- CharacterRun firstAfterBegin = paragraph
- .getCharacterRun( beginMark + 1 );
-
- final Matcher hyperlinkMatcher = hyperlinkPattern
- .matcher( firstAfterBegin.text() );
- if ( hyperlinkMatcher.matches() )
- {
- String hyperlink = hyperlinkMatcher.group( 1 );
- processHyperlink( hwpfDocument, currentBlock, paragraph,
- currentTableLevel, hyperlink, separatorMark + 1,
- endMark );
- return;
- }
-
- final Matcher pagerefMatcher = pagerefPattern
- .matcher( firstAfterBegin.text() );
- if ( pagerefMatcher.matches() )
- {
- String pageref = pagerefMatcher.group( 1 );
- processPageref( hwpfDocument, currentBlock, paragraph,
- currentTableLevel, pageref, separatorMark + 1, endMark );
- return;
- }
+ WordToFoUtils.setItalic( inline, characterRun.isItalic() );
}
-
- StringBuilder debug = new StringBuilder( "Unsupported field type: \n" );
- for ( int i = beginMark; i <= endMark; i++ )
+ // A run without a font name never overrides the font family.
+ if ( characterRun.getFontName() != null
+ && !AbstractWordUtils.equals( characterRun.getFontName(),
+ blockProperies.pFontName ) )
{
- debug.append( "\t" );
- debug.append( paragraph.getCharacterRun( i ) );
- debug.append( "\n" );
+ WordToFoUtils.setFontFamily( inline, characterRun.getFontName() );
}
- logger.log( POILogger.WARN, debug );
-
- // just output field value
- if ( separatorMark + 1 < endMark )
- processCharacters( hwpfDocument, currentTableLevel, paragraph,
- currentBlock, separatorMark + 1, endMark );
+ // NOTE(review): presumably getFontSize() is in half-points, hence the
+ // division by 2 -- confirm against CharacterRun.
+ if ( characterRun.getFontSize() / 2 != blockProperies.pFontSize )
+ {
+ WordToFoUtils.setFontSize( inline, characterRun.getFontSize() / 2 );
+ }
+ WordToFoUtils.setCharactersProperties( characterRun, inline );
+ block.appendChild( inline );
- return;
+ Text textNode = foDocumentFacade.createText( text );
+ inline.appendChild( textNode );
}
- protected void processHyperlink( HWPFDocument hwpfDocument,
- Element currentBlock, Paragraph paragraph, int currentTableLevel,
+ /**
+ * Appends an external basic-link for {@code hyperlink} to
+ * {@code currentBlock} and, when the given run range is not empty, passes
+ * that range to {@code processCharacters} with the link as the target
+ * element.
+ *
+ * @param beginTextInclusive
+ * first index in {@code characterRuns} to process
+ * @param endTextExclusive
+ * index in {@code characterRuns} at which processing stops
+ */
+ protected void processHyperlink( HWPFDocumentCore hwpfDocument,
+ Element currentBlock, Paragraph paragraph,
+ List<CharacterRun> characterRuns, int currentTableLevel,
String hyperlink, int beginTextInclusive, int endTextExclusive )
{
- Element basicLink = createBasicLinkExternal( hyperlink );
+ Element basicLink = foDocumentFacade
+ .createBasicLinkExternal( hyperlink );
currentBlock.appendChild( basicLink );
if ( beginTextInclusive < endTextExclusive )
processCharacters( hwpfDocument, currentTableLevel, paragraph,
- basicLink, beginTextInclusive, endTextExclusive );
+ basicLink, characterRuns, beginTextInclusive,
+ endTextExclusive );
}
/**
Picture picture )
{
// no default implementation -- skip
- currentBlock.appendChild( document.createComment( "Image link to '"
- + picture.suggestFullFileName() + "' can be here" ) );
+ currentBlock.appendChild( foDocumentFacade.getDocument().createComment(
+ "Image link to '" + picture.suggestFullFileName()
+ + "' can be here" ) );
}
- protected void processPageref( HWPFDocument hwpfDocument,
- Element currentBlock, Paragraph paragraph, int currentTableLevel,
+ /**
+ * Appends an internal basic-link for {@code pageref} to
+ * {@code currentBlock} and, when the given run range is not empty, passes
+ * that range to {@code processCharacters} with the link as the target
+ * element.
+ *
+ * @param beginTextInclusive
+ * first index in {@code characterRuns} to process
+ * @param endTextExclusive
+ * index in {@code characterRuns} at which processing stops
+ */
+ protected void processPageref( HWPFDocumentCore hwpfDocument,
+ Element currentBlock, Paragraph paragraph,
+ List<CharacterRun> characterRuns, int currentTableLevel,
String pageref, int beginTextInclusive, int endTextExclusive )
{
- Element basicLink = createBasicLinkInternal( pageref );
+ Element basicLink = foDocumentFacade.createBasicLinkInternal( pageref );
currentBlock.appendChild( basicLink );
if ( beginTextInclusive < endTextExclusive )
processCharacters( hwpfDocument, currentTableLevel, paragraph,
- basicLink, beginTextInclusive, endTextExclusive );
+ basicLink, characterRuns, beginTextInclusive,
+ endTextExclusive );
}
- protected void processParagraph( HWPFDocument hwpfDocument,
+ protected void processParagraph( HWPFDocumentCore hwpfDocument,
Element parentFopElement, int currentTableLevel,
Paragraph paragraph, String bulletText )
{
- final Element block = createBlock();
+ final Element block = foDocumentFacade.createBlock();
parentFopElement.appendChild( block );
WordToFoUtils.setParagraphProperties( paragraph, block );
if ( WordToFoUtils.isNotEmpty( bulletText ) )
{
- Element inline = createInline();
+ Element inline = foDocumentFacade.createInline();
block.appendChild( inline );
- Text textNode = createText( bulletText );
+ Text textNode = foDocumentFacade.createText( bulletText );
inline.appendChild( textNode );
haveAnyText |= bulletText.trim().length() != 0;
}
+ List<CharacterRun> characterRuns = WordToFoUtils
+ .findCharacterRuns( paragraph );
haveAnyText = processCharacters( hwpfDocument, currentTableLevel,
- paragraph, block, 0, charRuns );
+ paragraph, block, characterRuns, 0, characterRuns.size() );
if ( !haveAnyText )
{
- Element leader = createLeader();
+ Element leader = foDocumentFacade.createLeader();
block.appendChild( leader );
}
}
return;
}
- protected void processSection( HWPFDocument hwpfDocument, Section section,
- int sectionCounter )
+ /**
+ * Creates a page master for the section, adds a page sequence that uses
+ * it with an {@code xsl-region-body} flow, and processes the section's
+ * paragraphs into that flow at table level 0.
+ *
+ * @param sectionCounter
+ * section number passed on to {@code createPageMaster}
+ */
+ protected void processSection( HWPFDocumentCore wordDocument,
+ Section section, int sectionCounter )
{
String regularPage = createPageMaster(
WordToFoUtils.getSectionProperties( section ), "page",
sectionCounter );
- Element pageSequence = addPageSequence( regularPage );
- Element flow = addFlowToPageSequence( pageSequence, "xsl-region-body" );
+ Element pageSequence = foDocumentFacade.addPageSequence( regularPage );
+ Element flow = foDocumentFacade.addFlowToPageSequence( pageSequence,
+ "xsl-region-body" );
- processSectionParagraphes( hwpfDocument, flow, section, 0 );
+ processSectionParagraphes( wordDocument, flow, section, 0 );
}
- protected void processSectionParagraphes( HWPFDocument hwpfDocument,
+ protected void processSectionParagraphes( HWPFDocument wordDocument,
Element flow, Range range, int currentTableLevel )
{
final Map<Integer, Table> allTables = new HashMap<Integer, Table>();
allTables.put( Integer.valueOf( next.getStartOffset() ), next );
}
- final ListTables listTables = hwpfDocument.getListTables();
+ final ListTables listTables = wordDocument.getListTables();
int currentListInfo = 0;
final int paragraphs = range.numParagraphs();
{
Table table = allTables.get( Integer.valueOf( paragraph
.getStartOffset() ) );
- processTable( hwpfDocument, flow, table, currentTableLevel + 1 );
+ processTable( wordDocument, flow, table, currentTableLevel + 1 );
continue;
}
String label = WordToFoUtils.getBulletText( listTables,
paragraph, listFormatOverride.getLsid() );
- processParagraph( hwpfDocument, flow, currentTableLevel,
+ processParagraph( wordDocument, flow, currentTableLevel,
paragraph, label );
}
else
+ currentListInfo
+ ", but listTables not defined in file" );
- processParagraph( hwpfDocument, flow, currentTableLevel,
+ processParagraph( wordDocument, flow, currentTableLevel,
paragraph, WordToFoUtils.EMPTY );
}
}
else
{
- processParagraph( hwpfDocument, flow, currentTableLevel,
+ processParagraph( wordDocument, flow, currentTableLevel,
paragraph, WordToFoUtils.EMPTY );
}
}
}
- protected void processTable( HWPFDocument hwpfDocument, Element flow,
+ protected void processTable( HWPFDocumentCore wordDocument, Element flow,
Table table, int thisTableLevel )
{
- Element tableHeader = createTableHeader();
- Element tableBody = createTableBody();
+ Element tableHeader = foDocumentFacade.createTableHeader();
+ Element tableBody = foDocumentFacade.createTableBody();
final int tableRows = table.numRows();
{
TableRow tableRow = table.getRow( r );
- Element tableRowElement = createTableRow();
+ Element tableRowElement = foDocumentFacade.createTableRow();
WordToFoUtils.setTableRowProperties( tableRow, tableRowElement );
final int rowCells = tableRow.numCells();
&& !tableCell.isFirstVerticallyMerged() )
continue;
- Element tableCellElement = createTableCell();
+ Element tableCellElement = foDocumentFacade.createTableCell();
WordToFoUtils.setTableCellProperties( tableRow, tableCell,
tableCellElement, r == 0, r == tableRows - 1, c == 0,
c == rowCells - 1 );
{
if ( c == rowCells - 1 && c != maxColumns - 1 )
{
- tableCellElement
- .setAttribute( "number-columns-spanned", ""
- + (maxColumns - c) );
+ tableCellElement.setAttribute(
+ "number-columns-spanned", ""
+ + ( maxColumns - c ) );
}
}
+ count );
}
- processSectionParagraphes( hwpfDocument, tableCellElement,
+ processSectionParagraphes( wordDocument, tableCellElement,
tableCell, thisTableLevel );
if ( !tableCellElement.hasChildNodes() )
{
- tableCellElement.appendChild( createBlock() );
+ tableCellElement.appendChild( foDocumentFacade
+ .createBlock() );
}
tableRowElement.appendChild( tableCellElement );
}
}
- final Element tableElement = createTable();
+ final Element tableElement = foDocumentFacade.createTable();
if ( tableHeader.hasChildNodes() )
{
tableElement.appendChild( tableHeader );
}
}
- protected int tryField( HWPFDocument hwpfDocument, Paragraph paragraph,
- int currentTableLevel, int beginMark, Element currentBlock )
- {
- int separatorMark = -1;
- int endMark = -1;
- for ( int c = beginMark + 1; c < paragraph.numCharacterRuns(); c++ )
- {
- CharacterRun characterRun = paragraph.getCharacterRun( c );
-
- String text = characterRun.text();
- if ( text.getBytes().length == 0 )
- continue;
-
- if ( text.getBytes()[0] == FIELD_SEPARATOR_MARK )
- {
- if ( separatorMark != -1 )
- {
- // double;
- return beginMark;
- }
-
- separatorMark = c;
- continue;
- }
-
- if ( text.getBytes()[0] == FIELD_END_MARK )
- {
- if ( endMark != -1 )
- {
- // double;
- return beginMark;
- }
-
- endMark = c;
- break;
- }
-
- }
-
- if ( separatorMark == -1 || endMark == -1 )
- return beginMark;
-
- processField( hwpfDocument, currentBlock, paragraph, currentTableLevel,
- beginMark, separatorMark, endMark );
-
- return endMark;
- }
}
+/* ====================================================================
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+==================================================================== */
package org.apache.poi.hwpf.extractor;
-import java.lang.reflect.Constructor;
-import java.lang.reflect.Field;
-
-import org.apache.poi.hwpf.model.ListLevel;
-import org.apache.poi.hwpf.model.ListTables;
import org.apache.poi.hwpf.usermodel.BorderCode;
import org.apache.poi.hwpf.usermodel.CharacterProperties;
import org.apache.poi.hwpf.usermodel.CharacterRun;
import org.apache.poi.hwpf.usermodel.Paragraph;
import org.apache.poi.hwpf.usermodel.Picture;
-import org.apache.poi.hwpf.usermodel.Range;
-import org.apache.poi.hwpf.usermodel.Section;
-import org.apache.poi.hwpf.usermodel.SectionProperties;
import org.apache.poi.hwpf.usermodel.TableCell;
-import org.apache.poi.hwpf.usermodel.TableIterator;
import org.apache.poi.hwpf.usermodel.TableRow;
import org.w3c.dom.Element;
-public class WordToFoUtils {
- static final String EMPTY = "";
-
- public static final float TWIPS_PER_INCH = 1440.0f;
-
- public static final int TWIPS_PER_PT = 20;
-
- static boolean equals(String str1, String str2) {
- return str1 == null ? str2 == null : str1.equals(str2);
- }
-
- public static String getBorderType(BorderCode borderCode) {
- if (borderCode == null)
- throw new IllegalArgumentException("borderCode is null");
-
- switch (borderCode.getBorderType()) {
- case 1:
- case 2:
- return "solid";
- case 3:
- return "double";
- case 5:
- return "solid";
- case 6:
- return "dotted";
- case 7:
- case 8:
- return "dashed";
- case 9:
- return "dotted";
- case 10:
- case 11:
- case 12:
- case 13:
- case 14:
- case 15:
- case 16:
- case 17:
- case 18:
- case 19:
- return "double";
- case 20:
- return "solid";
- case 21:
- return "double";
- case 22:
- return "dashed";
- case 23:
- return "dashed";
- case 24:
- return "ridge";
- case 25:
- return "grooved";
- default:
- return "solid";
- }
- }
-
- public static String getBorderWidth(BorderCode borderCode) {
- int lineWidth = borderCode.getLineWidth();
- int pt = lineWidth / 8;
- int pte = lineWidth - pt * 8;
-
- StringBuilder stringBuilder = new StringBuilder();
- stringBuilder.append(pt);
- stringBuilder.append(".");
- stringBuilder.append(1000 / 8 * pte);
- stringBuilder.append("pt");
- return stringBuilder.toString();
- }
-
- public static String getBulletText(ListTables listTables,
- Paragraph paragraph, int listId) {
- final ListLevel listLevel = listTables.getLevel(listId,
- paragraph.getIlvl());
-
- if (listLevel.getNumberText() == null)
- return EMPTY;
-
- StringBuffer bulletBuffer = new StringBuffer();
- char[] xst = listLevel.getNumberText().toCharArray();
- for (char element : xst) {
- if (element < 9) {
- ListLevel numLevel = listTables.getLevel(listId, element);
-
- int num = numLevel.getStartAt();
- bulletBuffer.append(NumberFormatter.getNumber(num,
- listLevel.getNumberFormat()));
-
- if (numLevel == listLevel) {
- numLevel.setStartAt(numLevel.getStartAt() + 1);
- }
-
- } else {
- bulletBuffer.append(element);
- }
- }
-
- byte follow = getIxchFollow(listLevel);
- switch (follow) {
- case 0:
- bulletBuffer.append("\t");
- break;
- case 1:
- bulletBuffer.append(" ");
- break;
- default:
- break;
- }
-
- return bulletBuffer.toString();
- }
-
- public static String getColor(int ico) {
- switch (ico) {
- case 1:
- return "black";
- case 2:
- return "blue";
- case 3:
- return "cyan";
- case 4:
- return "green";
- case 5:
- return "magenta";
- case 6:
- return "red";
- case 7:
- return "yellow";
- case 8:
- return "white";
- case 9:
- return "darkblue";
- case 10:
- return "darkcyan";
- case 11:
- return "darkgreen";
- case 12:
- return "darkmagenta";
- case 13:
- return "darkred";
- case 14:
- return "darkyellow";
- case 15:
- return "darkgray";
- case 16:
- return "lightgray";
- default:
- return "black";
- }
+public class WordToFoUtils extends AbstractWordUtils
+{
+ /**
+ * Sets the {@code font-weight} attribute of {@code element} to
+ * {@code "bold"} or {@code "normal"}.
+ *
+ * @param bold
+ * {@code true} to write {@code "bold"}, {@code false} to write
+ * {@code "normal"}
+ */
+ public static void setBold( final Element element, final boolean bold )
+ {
+ element.setAttribute( "font-weight", bold ? "bold" : "normal" );
}
- public static byte getIxchFollow(ListLevel listLevel) {
- try {
- Field field = ListLevel.class.getDeclaredField("_ixchFollow");
- field.setAccessible(true);
- return ((Byte) field.get(listLevel)).byteValue();
- } catch (Exception exc) {
- throw new Error(exc);
- }
- }
-
- public static String getJustification(int js) {
- switch (js) {
- case 0:
- return "start";
- case 1:
- return "center";
- case 2:
- return "end";
- case 3:
- case 4:
- return "justify";
- case 5:
- return "center";
- case 6:
- return "left";
- case 7:
- return "start";
- case 8:
- return "end";
- case 9:
- return "justify";
+ /**
+ * Writes the style, color and width of {@code borderCode} onto
+ * {@code element} as border attributes. Nothing is written when
+ * {@code borderCode} is {@code null} or its border type is 0.
+ *
+ * @param where
+ * side name inserted into the attribute names
+ * ({@code border-<where>-style} and so on); when
+ * {@code isEmpty( where )} holds, the {@code border-style},
+ * {@code border-color} and {@code border-width} attributes are
+ * set instead
+ * @throws IllegalArgumentException
+ * if {@code element} is {@code null}
+ */
+ public static void setBorder( Element element, BorderCode borderCode,
+ String where )
+ {
+ if ( element == null )
+ throw new IllegalArgumentException( "element is null" );
+
+ if ( borderCode == null || borderCode.getBorderType() == 0 )
+ return;
+
+ if ( isEmpty( where ) )
+ {
+ element.setAttribute( "border-style", getBorderType( borderCode ) );
+ element.setAttribute( "border-color",
+ getColor( borderCode.getColor() ) );
+ element.setAttribute( "border-width", getBorderWidth( borderCode ) );
+ }
+ else
+ {
+ element.setAttribute( "border-" + where + "-style",
+ getBorderType( borderCode ) );
+ element.setAttribute( "border-" + where + "-color",
+ getColor( borderCode.getColor() ) );
+ element.setAttribute( "border-" + where + "-width",
+ getBorderWidth( borderCode ) );
+ }
- return "";
- }
-
- public static String getListItemNumberLabel(int number, int format) {
-
- if (format != 0)
- System.err.println("NYI: toListItemNumberLabel(): " + format);
-
- return String.valueOf(number);
- }
-
- public static SectionProperties getSectionProperties(Section section) {
- try {
- Field field = Section.class.getDeclaredField("_props");
- field.setAccessible(true);
- return (SectionProperties) field.get(section);
- } catch (Exception exc) {
- throw new Error(exc);
- }
- }
-
- static boolean isEmpty(String str) {
- return str == null || str.length() == 0;
- }
-
- static boolean isNotEmpty(String str) {
- return !isEmpty(str);
- }
-
- public static TableIterator newTableIterator(Range range, int level) {
- try {
- Constructor<TableIterator> constructor = TableIterator.class
- .getDeclaredConstructor(Range.class, int.class);
- constructor.setAccessible(true);
- return constructor.newInstance(range, Integer.valueOf(level));
- } catch (Exception exc) {
- throw new Error(exc);
- }
- }
-
- public static void setBold(final Element element, final boolean bold) {
- element.setAttribute("font-weight", bold ? "bold" : "normal");
- }
-
- public static void setBorder(Element element, BorderCode borderCode,
- String where) {
- if (element == null)
- throw new IllegalArgumentException("element is null");
-
- if (borderCode == null)
- return;
-
- if (isEmpty(where)) {
- element.setAttribute("border-style", getBorderType(borderCode));
- element.setAttribute("border-color",
- getColor(borderCode.getColor()));
- element.setAttribute("border-width", getBorderWidth(borderCode));
- } else {
- element.setAttribute("border-" + where + "-style",
- getBorderType(borderCode));
- element.setAttribute("border-" + where + "-color",
- getColor(borderCode.getColor()));
- element.setAttribute("border-" + where + "-width",
- getBorderWidth(borderCode));
- }
}
- public static void setCharactersProperties(final CharacterRun characterRun,
- final Element inline) {
+ /**
+ * Copies character-level formatting of {@code characterRun} onto
+ * {@code inline} as attributes: border, {@code text-transform},
+ * {@code background-color}, {@code text-shadow}, {@code font-variant},
+ * {@code baseline-shift} with {@code font-size}, {@code visibility} and
+ * {@code text-decoration}. Strike-through and underline are collected
+ * first so that both end up in a single space-separated
+ * {@code text-decoration} value.
+ *
+ * @param characterRun
+ * run whose properties are read
+ * @param inline
+ * element that receives the attributes
+ */
+ public static void setCharactersProperties(
+ final CharacterRun characterRun, final Element inline )
+ {
final CharacterProperties clonedProperties = characterRun
.cloneProperties();
StringBuilder textDecorations = new StringBuilder();
- setBorder(inline, clonedProperties.getBrc(), EMPTY);
+ setBorder( inline, clonedProperties.getBrc(), EMPTY );
- if (characterRun.isCapitalized()) {
- inline.setAttribute("text-transform", "uppercase");
+ if ( characterRun.isCapitalized() )
+ {
+ inline.setAttribute( "text-transform", "uppercase" );
}
- if (characterRun.isHighlighted()) {
- inline.setAttribute("background-color",
- getColor(clonedProperties.getIcoHighlight()));
+ if ( characterRun.isHighlighted() )
+ {
+ inline.setAttribute( "background-color",
+ getColor( clonedProperties.getIcoHighlight() ) );
}
- if (characterRun.isStrikeThrough()) {
- if (textDecorations.length() > 0)
- textDecorations.append(" ");
- textDecorations.append("line-through");
+ if ( characterRun.isStrikeThrough() )
+ {
+ if ( textDecorations.length() > 0 )
+ textDecorations.append( " " );
+ textDecorations.append( "line-through" );
}
- if (characterRun.isShadowed()) {
- inline.setAttribute("text-shadow", characterRun.getFontSize() / 24
- + "pt");
+ if ( characterRun.isShadowed() )
+ {
+ inline.setAttribute( "text-shadow", characterRun.getFontSize() / 24
+ + "pt" );
}
- if (characterRun.isSmallCaps()) {
- inline.setAttribute("font-variant", "small-caps");
+ if ( characterRun.isSmallCaps() )
+ {
+ inline.setAttribute( "font-variant", "small-caps" );
}
- if (characterRun.getSubSuperScriptIndex() == 1) {
- inline.setAttribute("baseline-shift", "super");
- inline.setAttribute("font-size", "smaller");
+ if ( characterRun.getSubSuperScriptIndex() == 1 )
+ {
+ inline.setAttribute( "baseline-shift", "super" );
+ inline.setAttribute( "font-size", "smaller" );
}
- if (characterRun.getSubSuperScriptIndex() == 2) {
- inline.setAttribute("baseline-shift", "sub");
- inline.setAttribute("font-size", "smaller");
+ if ( characterRun.getSubSuperScriptIndex() == 2 )
+ {
+ inline.setAttribute( "baseline-shift", "sub" );
+ inline.setAttribute( "font-size", "smaller" );
}
- if (characterRun.getUnderlineCode() > 0) {
- if (textDecorations.length() > 0)
- textDecorations.append(" ");
- textDecorations.append("underline");
+ if ( characterRun.getUnderlineCode() > 0 )
+ {
+ if ( textDecorations.length() > 0 )
+ textDecorations.append( " " );
+ textDecorations.append( "underline" );
}
- if (characterRun.isVanished()) {
- inline.setAttribute("visibility", "hidden");
+ if ( characterRun.isVanished() )
+ {
+ inline.setAttribute( "visibility", "hidden" );
}
- if (textDecorations.length() > 0) {
- inline.setAttribute("text-decoration", textDecorations.toString());
+ if ( textDecorations.length() > 0 )
+ {
+ inline.setAttribute( "text-decoration", textDecorations.toString() );
}
}
- public static void setFontFamily(final Element element,
- final String fontFamily) {
- element.setAttribute("font-family", fontFamily);
+ /**
+ * Sets the {@code font-family} attribute of {@code element}. The element
+ * is left untouched when {@code isEmpty( fontFamily )} holds.
+ *
+ * @param fontFamily
+ * value to write into {@code font-family}
+ */
+ public static void setFontFamily( final Element element,
+ final String fontFamily )
+ {
+ if ( isEmpty( fontFamily ) )
+ return;
+
+ element.setAttribute( "font-family", fontFamily );
}
- public static void setFontSize(final Element element, final int fontSize) {
- element.setAttribute("font-size", String.valueOf(fontSize));
+ /**
+ * Sets the {@code font-size} attribute of {@code element} to the decimal
+ * string form of {@code fontSize}.
+ */
+ public static void setFontSize( final Element element, final int fontSize )
+ {
+ // NOTE(review): the value is written without a unit suffix -- confirm
+ // that the FO consumer accepts a unitless font-size.
+ element.setAttribute( "font-size", String.valueOf( fontSize ) );
}
- public static void setIndent(Paragraph paragraph, Element block) {
- if (paragraph.getFirstLineIndent() != 0) {
- block.setAttribute(
- "text-indent",
- String.valueOf(paragraph.getFirstLineIndent()
- / TWIPS_PER_PT)
- + "pt");
- }
- if (paragraph.getIndentFromLeft() != 0) {
- block.setAttribute(
- "start-indent",
- String.valueOf(paragraph.getIndentFromLeft() / TWIPS_PER_PT)
- + "pt");
- }
- if (paragraph.getIndentFromRight() != 0) {
- block.setAttribute(
- "end-indent",
- String.valueOf(paragraph.getIndentFromRight()
- / TWIPS_PER_PT)
- + "pt");
- }
- if (paragraph.getSpacingBefore() != 0) {
- block.setAttribute("space-before",
- String.valueOf(paragraph.getSpacingBefore() / TWIPS_PER_PT)
- + "pt");
- }
- if (paragraph.getSpacingAfter() != 0) {
- block.setAttribute("space-after",
- String.valueOf(paragraph.getSpacingAfter() / TWIPS_PER_PT)
- + "pt");
- }
+ /**
+ * Writes the paragraph's indents and spacing onto {@code block} as
+ * {@code text-indent}, {@code start-indent}, {@code end-indent},
+ * {@code space-before} and {@code space-after}. Each attribute is set
+ * only when the corresponding paragraph value is non-zero; the value is
+ * divided by {@code TWIPS_PER_PT} and suffixed with {@code "pt"}.
+ *
+ * NOTE(review): if TWIPS_PER_PT is an int, the divisions truncate
+ * fractional points -- confirm whether that is intended.
+ */
+ public static void setIndent( Paragraph paragraph, Element block )
+ {
+ if ( paragraph.getFirstLineIndent() != 0 )
+ {
+ block.setAttribute(
+ "text-indent",
+ String.valueOf( paragraph.getFirstLineIndent()
+ / TWIPS_PER_PT )
+ + "pt" );
+ }
+ if ( paragraph.getIndentFromLeft() != 0 )
+ {
+ block.setAttribute(
+ "start-indent",
+ String.valueOf( paragraph.getIndentFromLeft()
+ / TWIPS_PER_PT )
+ + "pt" );
+ }
+ if ( paragraph.getIndentFromRight() != 0 )
+ {
+ block.setAttribute(
+ "end-indent",
+ String.valueOf( paragraph.getIndentFromRight()
+ / TWIPS_PER_PT )
+ + "pt" );
+ }
+ if ( paragraph.getSpacingBefore() != 0 )
+ {
+ block.setAttribute(
+ "space-before",
+ String.valueOf( paragraph.getSpacingBefore() / TWIPS_PER_PT )
+ + "pt" );
+ }
+ if ( paragraph.getSpacingAfter() != 0 )
+ {
+ block.setAttribute( "space-after",
+ String.valueOf( paragraph.getSpacingAfter() / TWIPS_PER_PT )
+ + "pt" );
+ }
}
- public static void setItalic(final Element element, final boolean italic) {
- element.setAttribute("font-style", italic ? "italic" : "normal");
+ /**
+ * Sets the {@code font-style} attribute of {@code element} to
+ * {@code "italic"} or {@code "normal"}.
+ *
+ * @param italic
+ * {@code true} to write {@code "italic"}, {@code false} to
+ * write {@code "normal"}
+ */
+ public static void setItalic( final Element element, final boolean italic )
+ {
+ element.setAttribute( "font-style", italic ? "italic" : "normal" );
}
- public static void setJustification(Paragraph paragraph,
- final Element element) {
- String justification = getJustification(paragraph.getJustification());
- if (isNotEmpty(justification))
- element.setAttribute("text-align", justification);
+ /**
+ * Writes the <tt>text-align</tt> attribute of an element from the
+ * justification of a paragraph. Nothing is written when
+ * <tt>getJustification</tt> yields an empty value.
+ *
+ * @param paragraph
+ * paragraph whose justification code is read
+ * @param element
+ * element that receives the attribute
+ */
+ public static void setJustification( Paragraph paragraph,
+ final Element element )
+ {
+ final String textAlign = getJustification( paragraph.getJustification() );
+ if ( !isNotEmpty( textAlign ) )
+ return;
+
+ element.setAttribute( "text-align", textAlign );
}
- public static void setParagraphProperties(Paragraph paragraph, Element block) {
- setIndent(paragraph, block);
- setJustification(paragraph, block);
+ /**
+ * Applies the paragraph-level formatting of a paragraph to a block
+ * element: indents and justification, the four borders, an optional page
+ * break before the block, the hyphenation flag, and the keep-together /
+ * keep-with-next constraints. The <tt>linefeed-treatment</tt> and
+ * <tt>white-space-collapse</tt> attributes are always written.
+ *
+ * @param paragraph
+ * paragraph whose properties are read
+ * @param block
+ * element that receives the attributes
+ */
+ public static void setParagraphProperties( Paragraph paragraph,
+ Element block )
+ {
+ setIndent( paragraph, block );
+ setJustification( paragraph, block );
- setBorder(block, paragraph.getBottomBorder(), "bottom");
- setBorder(block, paragraph.getLeftBorder(), "left");
- setBorder(block, paragraph.getRightBorder(), "right");
- setBorder(block, paragraph.getTopBorder(), "top");
+ setBorder( block, paragraph.getBottomBorder(), "bottom" );
+ setBorder( block, paragraph.getLeftBorder(), "left" );
+ setBorder( block, paragraph.getRightBorder(), "right" );
+ setBorder( block, paragraph.getTopBorder(), "top" );
- if (paragraph.pageBreakBefore()) {
- block.setAttribute("break-before", "page");
- }
+ if ( paragraph.pageBreakBefore() )
+ {
+ block.setAttribute( "break-before", "page" );
+ }
- block.setAttribute("hyphenate",
- String.valueOf(paragraph.isAutoHyphenated()));
+ block.setAttribute( "hyphenate",
+ String.valueOf( paragraph.isAutoHyphenated() ) );
- if (paragraph.keepOnPage()) {
- block.setAttribute("keep-together.within-page", "always");
- }
+ if ( paragraph.keepOnPage() )
+ {
+ block.setAttribute( "keep-together.within-page", "always" );
+ }
- if (paragraph.keepWithNext()) {
- block.setAttribute("keep-with-next.within-page", "always");
- }
+ if ( paragraph.keepWithNext() )
+ {
+ block.setAttribute( "keep-with-next.within-page", "always" );
+ }
- block.setAttribute("linefeed-treatment", "preserve");
- block.setAttribute("white-space-collapse", "false");
+ block.setAttribute( "linefeed-treatment", "preserve" );
+ block.setAttribute( "white-space-collapse", "false" );
}
- public static void setPictureProperties(Picture picture,
- Element graphicElement) {
+ /**
+ * Writes the size, scaling, alignment and cropping attributes of a
+ * picture onto a graphic element.
+ * <p>
+ * Width and height are taken from the picture's goal size, scaled by the
+ * aspect ratio percentage when that is positive, and divided by
+ * <tt>TWIPS_PER_PT</tt>. When any crop value is non-zero a <tt>clip</tt>
+ * rectangle is written together with <tt>overflow="hidden"</tt>.
+ *
+ * @param picture
+ * picture whose properties are read
+ * @param graphicElement
+ * element that receives the attributes
+ */
+ public static void setPictureProperties( Picture picture,
+ Element graphicElement )
+ {
final int aspectRatioX = picture.getAspectRatioX();
final int aspectRatioY = picture.getAspectRatioY();
- if (aspectRatioX > 0) {
- graphicElement.setAttribute("content-width", ((picture.getDxaGoal()
- * aspectRatioX / 100) / WordToFoUtils.TWIPS_PER_PT)
- + "pt");
- } else
- graphicElement.setAttribute("content-width",
- (picture.getDxaGoal() / WordToFoUtils.TWIPS_PER_PT) + "pt");
+ if ( aspectRatioX > 0 )
+ {
+ graphicElement
+ .setAttribute( "content-width", ( ( picture.getDxaGoal()
+ * aspectRatioX / 100 ) / TWIPS_PER_PT )
+ + "pt" );
+ }
+ else
+ graphicElement.setAttribute( "content-width",
+ ( picture.getDxaGoal() / TWIPS_PER_PT ) + "pt" );
- if (aspectRatioY > 0)
+ if ( aspectRatioY > 0 )
graphicElement
- .setAttribute("content-height", ((picture.getDyaGoal()
- * aspectRatioY / 100) / WordToFoUtils.TWIPS_PER_PT)
- + "pt");
+ .setAttribute( "content-height", ( ( picture.getDyaGoal()
+ * aspectRatioY / 100 ) / TWIPS_PER_PT )
+ + "pt" );
else
- graphicElement.setAttribute("content-height",
- (picture.getDyaGoal() / WordToFoUtils.TWIPS_PER_PT) + "pt");
+ graphicElement.setAttribute( "content-height",
+ ( picture.getDyaGoal() / TWIPS_PER_PT ) + "pt" );
- if (aspectRatioX <= 0 || aspectRatioY <= 0) {
- graphicElement.setAttribute("scaling", "uniform");
- } else {
- graphicElement.setAttribute("scaling", "non-uniform");
+ if ( aspectRatioX <= 0 || aspectRatioY <= 0 )
+ {
+ graphicElement.setAttribute( "scaling", "uniform" );
+ }
+ else
+ {
+ graphicElement.setAttribute( "scaling", "non-uniform" );
}
- graphicElement.setAttribute("vertical-align", "text-bottom");
+ graphicElement.setAttribute( "vertical-align", "text-bottom" );
- if (picture.getDyaCropTop() != 0 || picture.getDxaCropRight() != 0
+ if ( picture.getDyaCropTop() != 0 || picture.getDxaCropRight() != 0
|| picture.getDyaCropBottom() != 0
- || picture.getDxaCropLeft() != 0) {
- int rectTop = picture.getDyaCropTop() / WordToFoUtils.TWIPS_PER_PT;
- int rectRight = picture.getDxaCropRight()
- / WordToFoUtils.TWIPS_PER_PT;
- int rectBottom = picture.getDyaCropBottom()
- / WordToFoUtils.TWIPS_PER_PT;
- int rectLeft = picture.getDxaCropLeft()
- / WordToFoUtils.TWIPS_PER_PT;
- graphicElement.setAttribute("clip", "rect(" + rectTop + "pt, "
+ || picture.getDxaCropLeft() != 0 )
+ {
+ int rectTop = picture.getDyaCropTop() / TWIPS_PER_PT;
+ int rectRight = picture.getDxaCropRight() / TWIPS_PER_PT;
+ int rectBottom = picture.getDyaCropBottom() / TWIPS_PER_PT;
+ int rectLeft = picture.getDxaCropLeft() / TWIPS_PER_PT;
+ graphicElement.setAttribute( "clip", "rect(" + rectTop + "pt, "
+ rectRight + "pt, " + rectBottom + "pt, " + rectLeft
- + "pt)");
- graphicElement.setAttribute("oveerflow", "hidden");
+ + "pt)" );
+ // the attribute name was misspelled "oveerflow", so the clip
+ // rectangle had no overflow setting to go with it
+ graphicElement.setAttribute( "overflow", "hidden" );
}
}
- public static void setTableCellProperties(TableRow tableRow,
- TableCell tableCell, Element element, boolean toppest,
- boolean bottomest, boolean leftest, boolean rightest) {
- element.setAttribute("width", (tableCell.getWidth() / TWIPS_PER_INCH)
- + "in");
- element.setAttribute("padding-start",
- (tableRow.getGapHalf() / TWIPS_PER_INCH) + "in");
- element.setAttribute("padding-end",
- (tableRow.getGapHalf() / TWIPS_PER_INCH) + "in");
-
- BorderCode top = tableCell.getBrcTop() != null ? tableCell.getBrcTop()
- : toppest ? tableRow.getTopBorder() : tableRow
- .getHorizontalBorder();
- BorderCode bottom = tableCell.getBrcBottom() != null ? tableCell
- .getBrcBottom() : bottomest ? tableRow.getBottomBorder()
- : tableRow.getHorizontalBorder();
-
- BorderCode left = tableCell.getBrcLeft() != null ? tableCell
- .getBrcLeft() : leftest ? tableRow.getLeftBorder() : tableRow
- .getVerticalBorder();
- BorderCode right = tableCell.getBrcRight() != null ? tableCell
- .getBrcRight() : rightest ? tableRow.getRightBorder()
- : tableRow.getVerticalBorder();
-
- setBorder(element, bottom, "bottom");
- setBorder(element, left, "left");
- setBorder(element, right, "right");
- setBorder(element, top, "top");
+ /**
+ * Writes the width, horizontal padding and the four borders of a table
+ * cell onto an element.
+ * <p>
+ * For each side the cell's own border is used when it is present and its
+ * border type is not 0; otherwise the row supplies the border: its outer
+ * border when the cell lies on that edge of the table, its inner
+ * (horizontal or vertical) border otherwise.
+ *
+ * @param tableRow
+ * row that contains the cell
+ * @param tableCell
+ * cell whose properties are read
+ * @param element
+ * element that receives the attributes
+ * @param toppest
+ * whether the cell is in the first row
+ * @param bottomest
+ * whether the cell is in the last row
+ * @param leftest
+ * whether the cell is the first one of its row
+ * @param rightest
+ * whether the cell is the last one of its row
+ */
+ public static void setTableCellProperties( TableRow tableRow,
+ TableCell tableCell, Element element, boolean toppest,
+ boolean bottomest, boolean leftest, boolean rightest )
+ {
+ final String cellWidth = ( tableCell.getWidth() / TWIPS_PER_INCH ) + "in";
+ element.setAttribute( "width", cellWidth );
+ element.setAttribute( "padding-start",
+ ( tableRow.getGapHalf() / TWIPS_PER_INCH ) + "in" );
+ element.setAttribute( "padding-end",
+ ( tableRow.getGapHalf() / TWIPS_PER_INCH ) + "in" );
+
+ BorderCode top = tableCell.getBrcTop();
+ if ( top == null || top.getBorderType() == 0 )
+ {
+ if ( toppest )
+ top = tableRow.getTopBorder();
+ else
+ top = tableRow.getHorizontalBorder();
+ }
+
+ BorderCode bottom = tableCell.getBrcBottom();
+ if ( bottom == null || bottom.getBorderType() == 0 )
+ {
+ if ( bottomest )
+ bottom = tableRow.getBottomBorder();
+ else
+ bottom = tableRow.getHorizontalBorder();
+ }
+
+ BorderCode left = tableCell.getBrcLeft();
+ if ( left == null || left.getBorderType() == 0 )
+ {
+ if ( leftest )
+ left = tableRow.getLeftBorder();
+ else
+ left = tableRow.getVerticalBorder();
+ }
+
+ BorderCode right = tableCell.getBrcRight();
+ if ( right == null || right.getBorderType() == 0 )
+ {
+ if ( rightest )
+ right = tableRow.getRightBorder();
+ else
+ right = tableRow.getVerticalBorder();
+ }
+
+ setBorder( element, bottom, "bottom" );
+ setBorder( element, left, "left" );
+ setBorder( element, right, "right" );
+ setBorder( element, top, "top" );
}
- public static void setTableRowProperties(TableRow tableRow,
- Element tableRowElement) {
- if (tableRow.getRowHeight() > 0) {
- tableRowElement.setAttribute("height",
- (tableRow.getRowHeight() / TWIPS_PER_INCH) + "in");
- }
- if (!tableRow.cantSplit()) {
- tableRowElement.setAttribute("keep-together", "always");
- }
+ /**
+ * Writes the row-level attributes of a table row onto an element: the
+ * <tt>height</tt> when the row height is positive, and
+ * <tt>keep-together="always"</tt> when <tt>cantSplit()</tt> is false.
+ *
+ * @param tableRow
+ * row whose properties are read
+ * @param tableRowElement
+ * element that receives the attributes
+ */
+ public static void setTableRowProperties( TableRow tableRow,
+ Element tableRowElement )
+ {
+ final boolean hasHeight = tableRow.getRowHeight() > 0;
+ if ( hasHeight )
+ {
+ final String height = ( tableRow.getRowHeight() / TWIPS_PER_INCH )
+ + "in";
+ tableRowElement.setAttribute( "height", height );
+ }
+
+ final boolean cantSplit = tableRow.cantSplit();
+ if ( !cantSplit )
+ {
+ tableRowElement.setAttribute( "keep-together", "always" );
+ }
}
}
--- /dev/null
+/* ====================================================================
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+==================================================================== */
+package org.apache.poi.hwpf.extractor;
+
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.FileWriter;
+import java.util.List;
+import java.util.Stack;
+
+import javax.xml.parsers.DocumentBuilderFactory;
+import javax.xml.transform.OutputKeys;
+import javax.xml.transform.Transformer;
+import javax.xml.transform.TransformerFactory;
+import javax.xml.transform.dom.DOMSource;
+import javax.xml.transform.stream.StreamResult;
+
+import org.apache.poi.hwpf.HWPFDocument;
+import org.apache.poi.hwpf.HWPFDocumentCore;
+import org.apache.poi.hwpf.usermodel.CharacterRun;
+import org.apache.poi.hwpf.usermodel.Paragraph;
+import org.apache.poi.hwpf.usermodel.Picture;
+import org.apache.poi.hwpf.usermodel.Section;
+import org.apache.poi.hwpf.usermodel.SectionProperties;
+import org.apache.poi.hwpf.usermodel.Table;
+import org.apache.poi.hwpf.usermodel.TableCell;
+import org.apache.poi.hwpf.usermodel.TableRow;
+import org.apache.poi.util.POILogFactory;
+import org.apache.poi.util.POILogger;
+import org.w3c.dom.Document;
+import org.w3c.dom.Element;
+import org.w3c.dom.Text;
+
+import static org.apache.poi.hwpf.extractor.AbstractWordUtils.TWIPS_PER_INCH;
+
+/**
+ * @author Sergey Vladimirov (vlsergey {at} gmail {dot} com)
+ */
+public class WordToHtmlExtractor extends AbstractWordExtractor
+{
+
+    /**
+     * Font settings already written on the current <tt>p</tt> element. Child
+     * <tt>span</tt> elements are compared against them so that the same
+     * values are not emitted a second time.
+     */
+    private static class BlockProperies
+    {
+        /** Font name recorded for the current paragraph element. */
+        final String pFontName;
+
+        /** Font size recorded for the current paragraph element. */
+        final int pFontSize;
+
+        public BlockProperies( String fontName, int fontSize )
+        {
+            pFontName = fontName;
+            pFontSize = fontSize;
+        }
+    }
+
+ private static final POILogger logger = POILogFactory
+ .getLogger( WordToHtmlExtractor.class );
+
+    /**
+     * Builds the inline CSS for a section: the four margins in inches and,
+     * when <tt>getCcolM1()</tt> is positive, a <tt>column-count</tt> together
+     * with a <tt>column-gap</tt>. The gap comes from the section properties
+     * when the columns are evenly spaced and is <tt>0.25in</tt> otherwise.
+     *
+     * @param section
+     *            section whose properties are read
+     * @return the style declarations, each terminated by <tt>"; "</tt>
+     */
+    private static String getSectionStyle( Section section )
+    {
+        final SectionProperties properties = WordToHtmlUtils
+                .getSectionProperties( section );
+
+        final float left = properties.getDxaLeft() / TWIPS_PER_INCH;
+        final float right = properties.getDxaRight() / TWIPS_PER_INCH;
+        final float top = properties.getDyaTop() / TWIPS_PER_INCH;
+        final float bottom = properties.getDyaBottom() / TWIPS_PER_INCH;
+
+        final StringBuilder css = new StringBuilder();
+        css.append( "margin: " ).append( top ).append( "in " );
+        css.append( right ).append( "in " );
+        css.append( bottom ).append( "in " );
+        css.append( left ).append( "in; " );
+
+        if ( properties.getCcolM1() <= 0 )
+            return css.toString();
+
+        css.append( "column-count: " + ( properties.getCcolM1() + 1 ) + "; " );
+        if ( properties.getFEvenlySpaced() )
+        {
+            css.append( "column-gap: "
+                    + ( properties.getDxaColumns() / TWIPS_PER_INCH ) + "in; " );
+        }
+        else
+        {
+            css.append( "column-gap: 0.25in; " );
+        }
+        return css.toString();
+    }
+
+    /**
+     * Java main() interface to interact with WordToHtmlExtractor
+     *
+     * <p>
+     * Usage: WordToHtmlExtractor infile outfile
+     * </p>
+     * Where infile is an input .doc file (Word 95-2007) which will be rendered
+     * as HTML into outfile. Any failure is reported by printing the stack
+     * trace.
+     *
+     * @param args
+     *            input file name followed by output file name
+     */
+    public static void main( String[] args )
+    {
+        if ( args.length < 2 )
+        {
+            System.err
+                    .println( "Usage: WordToHtmlExtractor <inputFile.doc> <saveTo.html>" );
+            return;
+        }
+
+        System.out.println( "Converting " + args[0] );
+        System.out.println( "Saving output to " + args[1] );
+        try
+        {
+            Document doc = WordToHtmlExtractor.process( new File( args[0] ) );
+
+            TransformerFactory tf = TransformerFactory.newInstance();
+            Transformer serializer = tf.newTransformer();
+            // TODO set encoding from a command argument
+            serializer.setOutputProperty( OutputKeys.ENCODING, "UTF-8" );
+            serializer.setOutputProperty( OutputKeys.INDENT, "yes" );
+            serializer.setOutputProperty( OutputKeys.METHOD, "html" );
+
+            // Give the serializer a byte stream so that it applies the
+            // declared encoding itself; a FileWriter would encode the
+            // characters with the platform default charset instead.
+            FileOutputStream out = new FileOutputStream( args[1] );
+            try
+            {
+                serializer.transform( new DOMSource( doc ), new StreamResult(
+                        out ) );
+            }
+            finally
+            {
+                // release the file even when the transformation fails
+                out.close();
+            }
+        }
+        catch ( Exception e )
+        {
+            e.printStackTrace();
+        }
+    }
+
+    /**
+     * Loads a Word file and converts it into a newly created DOM document.
+     *
+     * @param docFile
+     *            Word file to load
+     * @return the DOM document produced by the extractor
+     * @throws Exception
+     *             if loading the file or creating the DOM document fails
+     */
+    static Document process( File docFile ) throws Exception
+    {
+        final HWPFDocumentCore source = WordToHtmlUtils.loadDoc( docFile );
+
+        final Document target = DocumentBuilderFactory.newInstance()
+                .newDocumentBuilder().newDocument();
+        final WordToHtmlExtractor extractor = new WordToHtmlExtractor( target );
+
+        extractor.processDocument( source );
+        return extractor.getDocument();
+    }
+
+ private final Stack<BlockProperies> blocksProperies = new Stack<BlockProperies>();
+
+ private final HtmlDocumentFacade htmlDocumentFacade;
+
+    /**
+     * Creates a new {@link WordToHtmlExtractor} that writes into the given DOM
+     * document. One instance can be used to output several
+     * {@link HWPFDocument}s into a single HTML document.
+     *
+     * @param document
+     *            XML DOM Document used as HTML document
+     */
+    public WordToHtmlExtractor( Document document )
+    {
+        htmlDocumentFacade = new HtmlDocumentFacade( document );
+    }
+
+    /**
+     * Returns the DOM document held by the HTML document facade.
+     *
+     * @return the document the extractor writes into
+     */
+    public Document getDocument()
+    {
+        final Document result = htmlDocumentFacade.getDocument();
+        return result;
+    }
+
+    /**
+     * Appends a <tt>span</tt> holding the given text to a paragraph element.
+     * Font family and font size are written only when they differ from the
+     * values recorded for the enclosing paragraph; the remaining character
+     * properties are added by
+     * {@link WordToHtmlUtils#addCharactersProperties(CharacterRun, StringBuilder)}.
+     *
+     * @param pElement
+     *            element that receives the new <tt>span</tt>
+     * @param characterRun
+     *            run whose formatting is applied
+     * @param text
+     *            text placed inside the <tt>span</tt>
+     */
+    @Override
+    protected void outputCharacters( Element pElement,
+            CharacterRun characterRun, String text )
+    {
+        Element span = htmlDocumentFacade.document.createElement( "span" );
+        pElement.appendChild( span );
+
+        StringBuilder style = new StringBuilder();
+        BlockProperies blockProperies = this.blocksProperies.peek();
+
+        final String fontName = characterRun.getFontName();
+        if ( fontName != null
+                && !WordToHtmlUtils.equals( fontName, blockProperies.pFontName ) )
+        {
+            style.append( "font-family: " + fontName + "; " );
+        }
+
+        // the run's size is halved, matching how the paragraph-level size is
+        // computed -- presumably the stored value is in half-points
+        final int fontSize = characterRun.getFontSize() / 2;
+        if ( fontSize != blockProperies.pFontSize )
+        {
+            // CSS lengths need a unit; a bare number is ignored by browsers
+            style.append( "font-size: " + fontSize + "pt; " );
+        }
+
+        WordToHtmlUtils.addCharactersProperties( characterRun, style );
+        if ( style.length() != 0 )
+            span.setAttribute( "style", style.toString() );
+
+        Text textNode = htmlDocumentFacade.createText( text );
+        span.appendChild( textNode );
+    }
+
+    /**
+     * Appends a hyperlink element to the current block and, when the given
+     * range of character runs is not empty, renders those runs inside it.
+     *
+     * @param wordDocument
+     *            document being converted
+     * @param currentBlock
+     *            element that receives the hyperlink
+     * @param paragraph
+     *            paragraph that contains the runs
+     * @param characterRuns
+     *            runs of the paragraph
+     * @param currentTableLevel
+     *            table nesting level passed on to character processing
+     * @param hyperlink
+     *            link target
+     * @param beginTextInclusive
+     *            index of the first run of the link text
+     * @param endTextExclusive
+     *            index after the last run of the link text
+     */
+    protected void processHyperlink( HWPFDocumentCore wordDocument,
+            Element currentBlock, Paragraph paragraph,
+            List<CharacterRun> characterRuns, int currentTableLevel,
+            String hyperlink, int beginTextInclusive, int endTextExclusive )
+    {
+        final Element anchor = htmlDocumentFacade.createHyperlink( hyperlink );
+        currentBlock.appendChild( anchor );
+
+        if ( beginTextInclusive >= endTextExclusive )
+            return;
+
+        processCharacters( wordDocument, currentTableLevel, paragraph, anchor,
+                characterRuns, beginTextInclusive, endTextExclusive );
+    }
+
+    /**
+     * Hook for storing image bytes in an external file, converting them if
+     * necessary. Images shall be stored using PNG format; other formats may
+     * not be supported by the user's browser.
+     * <p>
+     * This default implementation stores nothing: it only appends an HTML
+     * comment naming the suggested file of the picture.
+     * <p>
+     * Please note the
+     * {@link WordToHtmlUtils#setPictureProperties(Picture, Element)} method.
+     *
+     * @param currentBlock
+     *            currently processed HTML element, like <tt>p</tt>. Shall be
+     *            used as parent of newly created <tt>img</tt>
+     * @param inlined
+     *            if image is inlined
+     * @param picture
+     *            HWPF object, contained picture data and properties
+     */
+    protected void processImage( Element currentBlock, boolean inlined,
+            Picture picture )
+    {
+        // no default implementation -- leave a placeholder comment instead
+        final String placeholder = "Image link to '"
+                + picture.suggestFullFileName() + "' can be here";
+        currentBlock.appendChild( htmlDocumentFacade.document
+                .createComment( placeholder ) );
+    }
+
+    /**
+     * Appends a hyperlink whose target is <tt>"#" + pageref</tt> to the
+     * current block and, when the given range of character runs is not empty,
+     * renders those runs inside it.
+     *
+     * @param hwpfDocument
+     *            document being converted
+     * @param currentBlock
+     *            element that receives the hyperlink
+     * @param paragraph
+     *            paragraph that contains the runs
+     * @param characterRuns
+     *            runs of the paragraph
+     * @param currentTableLevel
+     *            table nesting level passed on to character processing
+     * @param pageref
+     *            name that follows <tt>#</tt> in the link target
+     * @param beginTextInclusive
+     *            index of the first run of the link text
+     * @param endTextExclusive
+     *            index after the last run of the link text
+     */
+    protected void processPageref( HWPFDocumentCore hwpfDocument,
+            Element currentBlock, Paragraph paragraph,
+            List<CharacterRun> characterRuns, int currentTableLevel,
+            String pageref, int beginTextInclusive, int endTextExclusive )
+    {
+        final Element anchor = htmlDocumentFacade.createHyperlink( "#"
+                + pageref );
+        currentBlock.appendChild( anchor );
+
+        if ( beginTextInclusive >= endTextExclusive )
+            return;
+
+        processCharacters( hwpfDocument, currentTableLevel, paragraph, anchor,
+                characterRuns, beginTextInclusive, endTextExclusive );
+    }
+
+    /**
+     * Appends a paragraph element to the parent and fills it with the bullet
+     * text (if any) and the character runs of the paragraph.
+     * <p>
+     * The font of the first character run is written on the paragraph element
+     * itself and pushed on the block-properties stack while the runs are
+     * processed.
+     *
+     * @param hwpfDocument
+     *            document being converted
+     * @param parentFopElement
+     *            element that receives the new paragraph element
+     * @param currentTableLevel
+     *            table nesting level passed on to character processing
+     * @param paragraph
+     *            paragraph to convert
+     * @param bulletText
+     *            text placed before the runs; skipped when empty
+     */
+    protected void processParagraph( HWPFDocumentCore hwpfDocument,
+            Element parentFopElement, int currentTableLevel,
+            Paragraph paragraph, String bulletText )
+    {
+        final Element pElement = htmlDocumentFacade.createParagraph();
+        parentFopElement.appendChild( pElement );
+
+        final StringBuilder css = new StringBuilder();
+        WordToHtmlUtils.addParagraphProperties( paragraph, css );
+
+        if ( paragraph.numCharacterRuns() == 0 )
+        {
+            // the element stays empty and the collected style is not applied
+            return;
+        }
+
+        final CharacterRun firstRun = paragraph.getCharacterRun( 0 );
+        final BlockProperies paragraphFont;
+        if ( firstRun == null )
+        {
+            paragraphFont = new BlockProperies( WordToHtmlUtils.EMPTY, -1 );
+        }
+        else
+        {
+            final int size = firstRun.getFontSize() / 2;
+            final String name = firstRun.getFontName();
+            WordToHtmlUtils.addFontFamily( name, css );
+            WordToHtmlUtils.addFontSize( size, css );
+            paragraphFont = new BlockProperies( name, size );
+        }
+
+        blocksProperies.push( paragraphFont );
+        try
+        {
+            if ( WordToHtmlUtils.isNotEmpty( bulletText ) )
+            {
+                pElement.appendChild( htmlDocumentFacade
+                        .createText( bulletText ) );
+            }
+
+            final List<CharacterRun> runs = WordToHtmlUtils
+                    .findCharacterRuns( paragraph );
+            processCharacters( hwpfDocument, currentTableLevel, paragraph,
+                    pElement, runs, 0, runs.size() );
+        }
+        finally
+        {
+            // always undo the push, even when a run fails to convert
+            blocksProperies.pop();
+        }
+
+        if ( css.length() > 0 )
+            pElement.setAttribute( "style", css.toString() );
+    }
+
+    /**
+     * Renders a section into its own <tt>div</tt>, styled with the section's
+     * margins and columns, and appends it to the HTML body.
+     *
+     * @param wordDocument
+     *            document being converted
+     * @param section
+     *            section to render
+     * @param sectionCounter
+     *            not used by this implementation
+     */
+    protected void processSection( HWPFDocumentCore wordDocument,
+            Section section, int sectionCounter )
+    {
+        final Element sectionDiv = htmlDocumentFacade.document
+                .createElement( "div" );
+        final String sectionStyle = getSectionStyle( section );
+        sectionDiv.setAttribute( "style", sectionStyle );
+        htmlDocumentFacade.body.appendChild( sectionDiv );
+
+        processSectionParagraphes( wordDocument, sectionDiv, section, 0 );
+    }
+
+    /**
+     * Renders the only section of a document directly into the HTML body,
+     * which also receives the section style.
+     *
+     * @param wordDocument
+     *            document being converted
+     * @param section
+     *            section to render
+     */
+    @Override
+    protected void processSingleSection( HWPFDocumentCore wordDocument,
+            Section section )
+    {
+        final Element body = htmlDocumentFacade.body;
+        body.setAttribute( "style", getSectionStyle( section ) );
+
+        processSectionParagraphes( wordDocument, body, section, 0 );
+    }
+
+    /**
+     * Converts a Word table into an HTML table appended to <tt>flow</tt>.
+     * <p>
+     * Rows flagged as table header go into the table header element, all
+     * other rows into the table body. Cells that continue a horizontal or
+     * vertical merge are skipped; the first cell of a merge gets the matching
+     * <tt>colspan</tt> / <tt>rowspan</tt>. The last cell of a row that is
+     * shorter than the widest row is stretched with a <tt>colspan</tt>. A
+     * table without body rows is not appended; a warning is logged instead.
+     *
+     * @param hwpfDocument
+     *            document being converted
+     * @param flow
+     *            element that receives the table
+     * @param table
+     *            table to convert
+     * @param thisTableLevel
+     *            nesting level passed on when the cell content is processed
+     */
+    protected void processTable( HWPFDocumentCore hwpfDocument, Element flow,
+            Table table, int thisTableLevel )
+    {
+        Element tableHeader = htmlDocumentFacade.createTableHeader();
+        Element tableBody = htmlDocumentFacade.createTableBody();
+
+        final int tableRows = table.numRows();
+
+        int maxColumns = Integer.MIN_VALUE;
+        for ( int r = 0; r < tableRows; r++ )
+        {
+            maxColumns = Math.max( maxColumns, table.getRow( r ).numCells() );
+        }
+
+        for ( int r = 0; r < tableRows; r++ )
+        {
+            TableRow tableRow = table.getRow( r );
+
+            Element tableRowElement = htmlDocumentFacade.createTableRow();
+            StringBuilder tableRowStyle = new StringBuilder();
+            WordToHtmlUtils.addTableRowProperties( tableRow, tableRowStyle );
+
+            final int rowCells = tableRow.numCells();
+            for ( int c = 0; c < rowCells; c++ )
+            {
+                TableCell tableCell = tableRow.getCell( c );
+
+                // continuation cells are covered by the span of the first one
+                if ( tableCell.isMerged() && !tableCell.isFirstMerged() )
+                    continue;
+
+                if ( tableCell.isVerticallyMerged()
+                        && !tableCell.isFirstVerticallyMerged() )
+                    continue;
+
+                Element tableCellElement;
+                if ( tableRow.isTableHeader() )
+                {
+                    tableCellElement = htmlDocumentFacade
+                            .createTableHeaderCell();
+                }
+                else
+                {
+                    tableCellElement = htmlDocumentFacade.createTableCell();
+                }
+                StringBuilder tableCellStyle = new StringBuilder();
+                WordToHtmlUtils.addTableCellProperties( tableRow, tableCell,
+                        r == 0, r == tableRows - 1, c == 0, c == rowCells - 1,
+                        tableCellStyle );
+
+                if ( tableCell.isFirstMerged() )
+                {
+                    tableCellElement.setAttribute( "colspan", ""
+                            + countMergedColumns( tableRow, c ) );
+                }
+                else
+                {
+                    // stretch the last cell of a row that is shorter than
+                    // the widest row of the table
+                    if ( c == rowCells - 1 && c != maxColumns - 1 )
+                    {
+                        tableCellElement.setAttribute( "colspan", ""
+                                + ( maxColumns - c ) );
+                    }
+                }
+
+                if ( tableCell.isFirstVerticallyMerged() )
+                {
+                    tableCellElement.setAttribute( "rowspan", ""
+                            + countMergedRows( table, r, c ) );
+                }
+
+                processSectionParagraphes( hwpfDocument, tableCellElement,
+                        tableCell, thisTableLevel );
+
+                if ( !tableCellElement.hasChildNodes() )
+                {
+                    tableCellElement.appendChild( htmlDocumentFacade
+                            .createParagraph() );
+                }
+                if ( tableCellStyle.length() > 0 )
+                    tableCellElement.setAttribute( "style",
+                            tableCellStyle.toString() );
+
+                tableRowElement.appendChild( tableCellElement );
+            }
+
+            if ( tableRowStyle.length() > 0 )
+                tableRowElement
+                        .setAttribute( "style", tableRowStyle.toString() );
+
+            if ( tableRow.isTableHeader() )
+            {
+                tableHeader.appendChild( tableRowElement );
+            }
+            else
+            {
+                tableBody.appendChild( tableRowElement );
+            }
+
+        }
+
+        final Element tableElement = htmlDocumentFacade.createTable();
+        if ( tableHeader.hasChildNodes() )
+        {
+            tableElement.appendChild( tableHeader );
+        }
+        if ( tableBody.hasChildNodes() )
+        {
+            tableElement.appendChild( tableBody );
+            flow.appendChild( tableElement );
+        }
+        else
+        {
+            logger.log(
+                    POILogger.WARN,
+                    "Table without body starting on offset "
+                            + table.getStartOffset() + " -- "
+                            + table.getEndOffset() );
+        }
+    }
+
+    /**
+     * Counts the cells of the horizontal merge that starts at
+     * <tt>firstColumn</tt>.
+     */
+    private static int countMergedColumns( TableRow tableRow, int firstColumn )
+    {
+        final int rowCells = tableRow.numCells();
+        int count = 0;
+        for ( int c = firstColumn; c < rowCells; c++ )
+        {
+            TableCell cell = tableRow.getCell( c );
+            // an unmerged cell, or the first cell of the following merge,
+            // ends the current one
+            if ( !cell.isMerged() || ( c > firstColumn && cell.isFirstMerged() ) )
+                break;
+            count++;
+        }
+        return count;
+    }
+
+    /**
+     * Counts the rows of the vertical merge that starts at <tt>firstRow</tt>
+     * in the given column.
+     */
+    private static int countMergedRows( Table table, int firstRow, int column )
+    {
+        final int tableRows = table.numRows();
+        int count = 0;
+        for ( int r = firstRow; r < tableRows; r++ )
+        {
+            TableRow row = table.getRow( r );
+            // a row without a cell at this index ends the merge; the check
+            // must include numCells() == column, otherwise getCell( column )
+            // is asked for a cell the row does not have
+            if ( row.numCells() <= column )
+                break;
+            TableCell cell = row.getCell( column );
+            if ( !cell.isVerticallyMerged()
+                    || ( r > firstRow && cell.isFirstVerticallyMerged() ) )
+                break;
+            count++;
+        }
+        return count;
+    }
+
+}
--- /dev/null
+/* ====================================================================
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+==================================================================== */
+package org.apache.poi.hwpf.extractor;
+
+import org.apache.poi.hwpf.usermodel.BorderCode;
+import org.apache.poi.hwpf.usermodel.CharacterProperties;
+import org.apache.poi.hwpf.usermodel.CharacterRun;
+import org.apache.poi.hwpf.usermodel.Paragraph;
+import org.apache.poi.hwpf.usermodel.Picture;
+import org.apache.poi.hwpf.usermodel.TableCell;
+import org.apache.poi.hwpf.usermodel.TableRow;
+import org.w3c.dom.Element;
+
+public class WordToHtmlUtils extends AbstractWordUtils
+{
+    /**
+     * Appends a <tt>font-weight</tt> declaration to a style string.
+     *
+     * @param bold
+     *            <tt>true</tt> appends <tt>bold</tt>, <tt>false</tt> appends
+     *            <tt>normal</tt>
+     * @param style
+     *            builder that receives the declaration
+     */
+    public static void addBold( final boolean bold, StringBuilder style )
+    {
+        final String weight;
+        if ( bold )
+            weight = "bold";
+        else
+            weight = "normal";
+
+        style.append( "font-weight: " ).append( weight ).append( ";" );
+    }
+
+    /**
+     * Appends the style, color and width declarations of a border to a style
+     * string. Nothing is appended when the border is <tt>null</tt> or its
+     * border type is 0.
+     *
+     * @param borderCode
+     *            border to describe
+     * @param where
+     *            side of the box (for example <tt>"top"</tt>); when empty the
+     *            declarations apply to the whole border
+     * @param style
+     *            builder that receives the declarations
+     */
+    public static void addBorder( BorderCode borderCode, String where,
+            StringBuilder style )
+    {
+        if ( borderCode == null || borderCode.getBorderType() == 0 )
+            return;
+
+        // "border-" for the whole box, "border-<side>-" for a single side
+        final String property;
+        if ( isEmpty( where ) )
+            property = "border-";
+        else
+            property = "border-" + where + "-";
+
+        style.append( property + "style: " + getBorderType( borderCode )
+                + "; " );
+        style.append( property + "color: " + getColor( borderCode.getColor() )
+                + "; " );
+        style.append( property + "width: " + getBorderWidth( borderCode )
+                + "; " );
+    }
+
+    /**
+     * Appends the CSS declarations for the character-level formatting of a
+     * run: weight, slant, border, capitalization, highlight, line
+     * decorations, shadow, small caps, sub/superscript and visibility.
+     * <p>
+     * Strike-through and underline are written as one
+     * <tt>text-decoration</tt> declaration, because a second declaration of
+     * the same property would replace the first. Sub/superscript uses
+     * <tt>vertical-align</tt>, the CSS property for raising and lowering
+     * inline text.
+     *
+     * @param characterRun
+     *            run whose formatting is read
+     * @param style
+     *            builder that receives the declarations
+     */
+    public static void addCharactersProperties(
+            final CharacterRun characterRun, StringBuilder style )
+    {
+        final CharacterProperties clonedProperties = characterRun
+                .cloneProperties();
+
+        if ( characterRun.isBold() )
+        {
+            style.append( "font-weight: bold; " );
+        }
+        if ( characterRun.isItalic() )
+        {
+            style.append( "font-style: italic; " );
+        }
+
+        addBorder( clonedProperties.getBrc(), EMPTY, style );
+
+        if ( characterRun.isCapitalized() )
+        {
+            style.append( "text-transform: uppercase; " );
+        }
+        if ( characterRun.isHighlighted() )
+        {
+            style.append( "background-color: "
+                    + getColor( clonedProperties.getIcoHighlight() ) + "; " );
+        }
+
+        final boolean strikeThrough = characterRun.isStrikeThrough();
+        final boolean underlined = characterRun.getUnderlineCode() > 0;
+        if ( strikeThrough || underlined )
+        {
+            style.append( "text-decoration:" );
+            if ( strikeThrough )
+                style.append( " line-through" );
+            if ( underlined )
+                style.append( " underline" );
+            style.append( "; " );
+        }
+
+        if ( characterRun.isShadowed() )
+        {
+            // NOTE(review): CSS text-shadow normally takes at least two
+            // offsets; confirm that a single length is the intended value
+            style.append( "text-shadow: " + characterRun.getFontSize() / 24
+                    + "pt; " );
+        }
+        if ( characterRun.isSmallCaps() )
+        {
+            style.append( "font-variant: small-caps; " );
+        }
+        if ( characterRun.getSubSuperScriptIndex() == 1 )
+        {
+            style.append( "vertical-align: super; " );
+            style.append( "font-size: smaller; " );
+        }
+        if ( characterRun.getSubSuperScriptIndex() == 2 )
+        {
+            style.append( "vertical-align: sub; " );
+            style.append( "font-size: smaller; " );
+        }
+        if ( characterRun.isVanished() )
+        {
+            style.append( "visibility: hidden; " );
+        }
+    }
+
+    /**
+     * Appends a <tt>font-family</tt> declaration to a style string. Nothing is
+     * appended when the font family is empty.
+     *
+     * @param fontFamily
+     *            font family name
+     * @param style
+     *            builder that receives the declaration
+     */
+    public static void addFontFamily( final String fontFamily,
+            StringBuilder style )
+    {
+        if ( isEmpty( fontFamily ) )
+            return;
+
+        // terminate the declaration; without "; " whatever is appended next
+        // becomes part of the font name
+        style.append( "font-family: " + fontFamily + "; " );
+    }
+
+    /**
+     * Appends a <tt>font-size</tt> declaration, in points, to a style string.
+     *
+     * @param fontSize
+     *            font size in points
+     * @param style
+     *            builder that receives the declaration
+     */
+    public static void addFontSize( final int fontSize, StringBuilder style )
+    {
+        // a CSS length needs a unit, and the declaration needs a terminator
+        // so that later declarations stay separate
+        style.append( "font-size: " + fontSize + "pt; " );
+    }
+
+    /**
+     * Appends the indents and spacings of a paragraph to a style string.
+     * <p>
+     * The left/right indents and the spacing before/after are written as CSS
+     * margins; the XSL-FO names (<tt>start-indent</tt>, <tt>end-indent</tt>,
+     * <tt>space-before</tt>, <tt>space-after</tt>) are not CSS properties and
+     * have no effect in an HTML <tt>style</tt> attribute.
+     *
+     * @param paragraph
+     *            paragraph whose indents and spacings are read
+     * @param style
+     *            builder that receives the declarations
+     */
+    public static void addIndent( Paragraph paragraph, StringBuilder style )
+    {
+        addIndent( style, "text-indent", paragraph.getFirstLineIndent() );
+        addIndent( style, "margin-left", paragraph.getIndentFromLeft() );
+        addIndent( style, "margin-right", paragraph.getIndentFromRight() );
+        addIndent( style, "margin-top", paragraph.getSpacingBefore() );
+        addIndent( style, "margin-bottom", paragraph.getSpacingAfter() );
+    }
+
+    /**
+     * Appends one declaration whose value is the given amount divided by
+     * <tt>TWIPS_PER_PT</tt>, with a <tt>pt</tt> suffix. A value of zero
+     * appends nothing.
+     *
+     * @param style
+     *            builder that receives the declaration
+     * @param cssName
+     *            property name
+     * @param twipsValue
+     *            amount to convert
+     */
+    private static void addIndent( StringBuilder style, final String cssName,
+            final int twipsValue )
+    {
+        if ( twipsValue != 0 )
+        {
+            final String points = ( twipsValue / TWIPS_PER_PT ) + "pt; ";
+            style.append( cssName + ": " + points );
+        }
+    }
+
+    /**
+     * Appends a <tt>text-align</tt> declaration derived from the
+     * justification of a paragraph. Nothing is appended when
+     * <tt>getJustification</tt> yields an empty value.
+     *
+     * @param paragraph
+     *            paragraph whose justification code is read
+     * @param style
+     *            builder that receives the declaration
+     */
+    public static void addJustification( Paragraph paragraph,
+            final StringBuilder style )
+    {
+        final String textAlign = getJustification( paragraph
+                .getJustification() );
+        if ( !isNotEmpty( textAlign ) )
+            return;
+
+        style.append( "text-align: " + textAlign + "; " );
+    }
+
+    /**
+     * Appends the paragraph-level formatting of a paragraph to a style
+     * string: indents and justification, the four borders, an optional page
+     * break, the hyphenation flag and the keep-together / keep-with-next
+     * constraints. The <tt>linefeed-treatment</tt> and
+     * <tt>white-space-collapse</tt> declarations are always appended.
+     *
+     * @param paragraph
+     *            paragraph whose properties are read
+     * @param style
+     *            builder that receives the declarations
+     */
+    public static void addParagraphProperties( Paragraph paragraph,
+            StringBuilder style )
+    {
+        addIndent( paragraph, style );
+        addJustification( paragraph, style );
+
+        addBorder( paragraph.getBottomBorder(), "bottom", style );
+        addBorder( paragraph.getLeftBorder(), "left", style );
+        addBorder( paragraph.getRightBorder(), "right", style );
+        addBorder( paragraph.getTopBorder(), "top", style );
+
+        if ( paragraph.pageBreakBefore() )
+            style.append( "break-before: page; " );
+
+        final boolean hyphenated = paragraph.isAutoHyphenated();
+        style.append( "hyphenate: " ).append( hyphenated ).append( "; " );
+
+        if ( paragraph.keepOnPage() )
+            style.append( "keep-together.within-page: always; " );
+
+        if ( paragraph.keepWithNext() )
+            style.append( "keep-with-next.within-page: always; " );
+
+        style.append( "linefeed-treatment: preserve; " );
+        style.append( "white-space-collapse: false; " );
+    }
+
+    /**
+     * Appends the width, horizontal padding and the four borders of a table
+     * cell to a style string.
+     * <p>
+     * For each side the cell's own border is used when it is present and its
+     * border type is not 0; otherwise the row supplies the border: its outer
+     * border when the cell lies on that edge of the table, its inner
+     * (horizontal or vertical) border otherwise.
+     *
+     * @param tableRow
+     *            row that contains the cell
+     * @param tableCell
+     *            cell whose properties are read
+     * @param toppest
+     *            whether the cell is in the first row
+     * @param bottomest
+     *            whether the cell is in the last row
+     * @param leftest
+     *            whether the cell is the first one of its row
+     * @param rightest
+     *            whether the cell is the last one of its row
+     * @param style
+     *            builder that receives the declarations
+     */
+    public static void addTableCellProperties( TableRow tableRow,
+            TableCell tableCell, boolean toppest, boolean bottomest,
+            boolean leftest, boolean rightest, StringBuilder style )
+    {
+        final String cellWidth = ( tableCell.getWidth() / TWIPS_PER_INCH )
+                + "in; ";
+        style.append( "width: " + cellWidth );
+        style.append( "padding-start: "
+                + ( tableRow.getGapHalf() / TWIPS_PER_INCH ) + "in; " );
+        style.append( "padding-end: "
+                + ( tableRow.getGapHalf() / TWIPS_PER_INCH ) + "in; " );
+
+        BorderCode top = tableCell.getBrcTop();
+        if ( top == null || top.getBorderType() == 0 )
+        {
+            if ( toppest )
+                top = tableRow.getTopBorder();
+            else
+                top = tableRow.getHorizontalBorder();
+        }
+
+        BorderCode bottom = tableCell.getBrcBottom();
+        if ( bottom == null || bottom.getBorderType() == 0 )
+        {
+            if ( bottomest )
+                bottom = tableRow.getBottomBorder();
+            else
+                bottom = tableRow.getHorizontalBorder();
+        }
+
+        BorderCode left = tableCell.getBrcLeft();
+        if ( left == null || left.getBorderType() == 0 )
+        {
+            if ( leftest )
+                left = tableRow.getLeftBorder();
+            else
+                left = tableRow.getVerticalBorder();
+        }
+
+        BorderCode right = tableCell.getBrcRight();
+        if ( right == null || right.getBorderType() == 0 )
+        {
+            if ( rightest )
+                right = tableRow.getRightBorder();
+            else
+                right = tableRow.getVerticalBorder();
+        }
+
+        addBorder( bottom, "bottom", style );
+        addBorder( left, "left", style );
+        addBorder( right, "right", style );
+        addBorder( top, "top", style );
+    }
+
+    /**
+     * Appends the row-level declarations of a table row to a style string:
+     * the <tt>height</tt> when the row height is positive, and
+     * <tt>keep-together: always</tt> when <tt>cantSplit()</tt> is false.
+     *
+     * @param tableRow
+     *            row whose properties are read
+     * @param style
+     *            builder that receives the declarations
+     */
+    public static void addTableRowProperties( TableRow tableRow,
+            StringBuilder style )
+    {
+        final boolean hasHeight = tableRow.getRowHeight() > 0;
+        if ( hasHeight )
+        {
+            final String height = ( tableRow.getRowHeight() / TWIPS_PER_INCH )
+                    + "in; ";
+            style.append( "height: " + height );
+        }
+
+        final boolean cantSplit = tableRow.cantSplit();
+        if ( !cantSplit )
+        {
+            style.append( "keep-together: always; " );
+        }
+    }
+
+    /**
+     * Writes the size, scaling, alignment and cropping attributes of a
+     * picture onto a graphic element.
+     * <p>
+     * Width and height are taken from the picture's goal size, scaled by the
+     * aspect ratio percentage when that is positive, and divided by
+     * <tt>TWIPS_PER_PT</tt>. When any crop value is non-zero a <tt>clip</tt>
+     * rectangle is written together with <tt>overflow="hidden"</tt>.
+     *
+     * @param picture
+     *            picture whose properties are read
+     * @param graphicElement
+     *            element that receives the attributes
+     */
+    public static void setPictureProperties( Picture picture,
+            Element graphicElement )
+    {
+        final int aspectRatioX = picture.getAspectRatioX();
+        final int aspectRatioY = picture.getAspectRatioY();
+
+        if ( aspectRatioX > 0 )
+        {
+            graphicElement
+                    .setAttribute( "content-width", ( ( picture.getDxaGoal()
+                            * aspectRatioX / 100 ) / TWIPS_PER_PT )
+                            + "pt" );
+        }
+        else
+        {
+            graphicElement.setAttribute( "content-width",
+                    ( picture.getDxaGoal() / TWIPS_PER_PT ) + "pt" );
+        }
+
+        if ( aspectRatioY > 0 )
+        {
+            graphicElement
+                    .setAttribute( "content-height", ( ( picture.getDyaGoal()
+                            * aspectRatioY / 100 ) / TWIPS_PER_PT )
+                            + "pt" );
+        }
+        else
+        {
+            graphicElement.setAttribute( "content-height",
+                    ( picture.getDyaGoal() / TWIPS_PER_PT ) + "pt" );
+        }
+
+        if ( aspectRatioX <= 0 || aspectRatioY <= 0 )
+        {
+            graphicElement.setAttribute( "scaling", "uniform" );
+        }
+        else
+        {
+            graphicElement.setAttribute( "scaling", "non-uniform" );
+        }
+
+        graphicElement.setAttribute( "vertical-align", "text-bottom" );
+
+        if ( picture.getDyaCropTop() != 0 || picture.getDxaCropRight() != 0
+                || picture.getDyaCropBottom() != 0
+                || picture.getDxaCropLeft() != 0 )
+        {
+            int rectTop = picture.getDyaCropTop() / TWIPS_PER_PT;
+            int rectRight = picture.getDxaCropRight() / TWIPS_PER_PT;
+            int rectBottom = picture.getDyaCropBottom() / TWIPS_PER_PT;
+            int rectLeft = picture.getDxaCropLeft() / TWIPS_PER_PT;
+            graphicElement.setAttribute( "clip", "rect(" + rectTop + "pt, "
+                    + rectRight + "pt, " + rectBottom + "pt, " + rectLeft
+                    + "pt)" );
+            // the attribute name was misspelled "oveerflow", so the clip
+            // rectangle had no overflow setting to go with it
+            graphicElement.setAttribute( "overflow", "hidden" );
+        }
+    }
+
+}
--- /dev/null
+/* ====================================================================
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+==================================================================== */
+package org.apache.poi.hwpf.extractor;
+
+import java.io.File;
+import java.io.FilenameFilter;
+import java.io.StringWriter;
+import java.util.Arrays;
+import java.util.List;
+
+import javax.xml.parsers.DocumentBuilderFactory;
+import javax.xml.transform.OutputKeys;
+import javax.xml.transform.Transformer;
+import javax.xml.transform.TransformerFactory;
+import javax.xml.transform.dom.DOMSource;
+import javax.xml.transform.stream.StreamResult;
+
+import junit.framework.Test;
+import junit.framework.TestCase;
+import junit.framework.TestSuite;
+import org.apache.poi.POIDataSamples;
+import org.apache.poi.hwpf.HWPFDocumentCore;
+
+/**
+ * Builds a JUnit suite with two tests per sample <tt>.doc</tt> file: one that
+ * serializes the converted document with the transformer's default output
+ * method ("[FO]") and one that serializes it with the <tt>html</tt> output
+ * method ("[HTML]"). A test succeeds when no exception is thrown.
+ */
+public class TestWordToExtractorSuite
+{
+    /**
+     * YK: a quick hack to exclude failing documents from the suite.
+     */
+    private static List<String> failingFiles = Arrays.asList();
+
+    /**
+     * Creates the suite from every <tt>*.doc</tt> file found in the
+     * <tt>../document</tt> sample directory, skipping the names listed in
+     * {@link #failingFiles}.
+     *
+     * @return suite containing an "[FO]" and an "[HTML]" test per file
+     */
+    public static Test suite()
+    {
+        TestSuite suite = new TestSuite();
+
+        File directory = POIDataSamples.getDocumentInstance().getFile(
+                "../document" );
+        for ( final File child : directory.listFiles( new FilenameFilter()
+        {
+            public boolean accept( File dir, String name )
+            {
+                return name.endsWith( ".doc" ) && !failingFiles.contains( name );
+            }
+        } ) )
+        {
+            final String name = child.getName();
+
+            suite.addTest( new TestCase( name + " [FO]" )
+            {
+                public void runTest() throws Exception
+                {
+                    test( child, false );
+                }
+            } );
+            suite.addTest( new TestCase( name + " [HTML]" )
+            {
+                public void runTest() throws Exception
+                {
+                    test( child, true );
+                }
+            } );
+
+        }
+
+        return suite;
+    }
+
+    /**
+     * Converts a single sample file and serializes the result into an
+     * in-memory writer. Files that cannot be loaded are silently skipped.
+     *
+     * @param child
+     *            sample document to convert
+     * @param html
+     *            <tt>true</tt> to serialize with the <tt>html</tt> output
+     *            method, <tt>false</tt> to keep the transformer default
+     * @throws Exception
+     *             if conversion or serialization fails
+     */
+    protected static void test( File child, boolean html ) throws Exception
+    {
+        HWPFDocumentCore hwpfDocument;
+        try
+        {
+            hwpfDocument = AbstractWordUtils.loadDoc( child );
+        }
+        catch ( Exception exc )
+        {
+            // unable to parse file -- not WordToFoExtractor fault
+            return;
+        }
+
+        // NOTE(review): the "[HTML]" variant still converts with
+        // WordToFoExtractor; presumably WordToHtmlExtractor was intended
+        // here -- confirm before switching.
+        WordToFoExtractor wordToFoExtractor = new WordToFoExtractor(
+                DocumentBuilderFactory.newInstance().newDocumentBuilder()
+                        .newDocument() );
+        wordToFoExtractor.processDocument( hwpfDocument );
+
+        StringWriter stringWriter = new StringWriter();
+
+        Transformer transformer = TransformerFactory.newInstance()
+                .newTransformer();
+        transformer.setOutputProperty( OutputKeys.ENCODING, "utf-8" );
+        transformer.setOutputProperty( OutputKeys.INDENT, "yes" );
+        // The output method must be chosen before transform() runs; it used
+        // to be set afterwards, which left the serialized output unchanged.
+        if ( html )
+        {
+            transformer.setOutputProperty( OutputKeys.METHOD, "html" );
+        }
+        transformer.transform(
+                new DOMSource( wordToFoExtractor.getDocument() ),
+                new StreamResult( stringWriter ) );
+
+        // no exceptions
+    }
+}
+++ /dev/null
-package org.apache.poi.hwpf.extractor;
-
-import java.io.File;
-import java.io.FileInputStream;
-import java.io.FilenameFilter;
-import java.io.StringWriter;
-import java.util.Arrays;
-import java.util.Collections;
-import java.util.List;
-import java.util.Set;
-
-import javax.xml.parsers.DocumentBuilderFactory;
-import javax.xml.transform.OutputKeys;
-import javax.xml.transform.Transformer;
-import javax.xml.transform.TransformerFactory;
-import javax.xml.transform.dom.DOMSource;
-import javax.xml.transform.stream.StreamResult;
-
-import org.apache.poi.EncryptedDocumentException;
-
-import org.apache.poi.hwpf.OldWordFileFormatException;
-
-import junit.framework.Test;
-import junit.framework.TestCase;
-import junit.framework.TestSuite;
-import org.apache.poi.POIDataSamples;
-import org.apache.poi.hwpf.HWPFDocument;
-
-/**
- * Builds a JUnit suite with one test per sample .doc file; each test converts
- * the file with WordToFoExtractor and serializes the result, passing when no
- * exception is thrown.
- */
-public class TestWordToFoExtractorSuite
-{
- /**
- * YK: a quick hack to exclude failing documents from the suite.
- *
- * WordToFoExtractor stumbles on Bug33519.doc with a NPE
- */
- private static List<String> failingFiles = Arrays.asList("Bug33519.doc");
-
- /**
- * Creates the suite from every *.doc file in the "../document" sample
- * directory, skipping the names listed in failingFiles.
- */
- public static Test suite() {
- TestSuite suite = new TestSuite();
-
- File directory = POIDataSamples.getDocumentInstance().getFile(
- "../document");
- for (final File child : directory.listFiles(new FilenameFilter() {
- public boolean accept(File dir, String name) {
- return name.endsWith(".doc") && !failingFiles.contains(name);
- }
- })) {
- final String name = child.getName();
- suite.addTest(new TestCase(name) {
- public void runTest() throws Exception {
- test(child);
- }
- });
- }
-
- return suite;
- }
-
- /**
- * Converts one sample file and serializes the resulting DOM into an
- * in-memory writer. Files that HWPFDocument cannot parse are skipped.
- */
- protected static void test( File child ) throws Exception
- {
- HWPFDocument hwpfDocument;
- FileInputStream fileInputStream = new FileInputStream( child );
- try
- {
- hwpfDocument = new HWPFDocument( fileInputStream );
- }
- catch ( Exception exc )
- {
- // unable to parse file -- not WordToFoExtractor fault
- return;
- }
- finally
- {
- // the stream is closed whether or not parsing succeeded
- fileInputStream.close();
- }
-
- WordToFoExtractor wordToFoExtractor = new WordToFoExtractor(
- DocumentBuilderFactory.newInstance().newDocumentBuilder()
- .newDocument() );
- wordToFoExtractor.processDocument( hwpfDocument );
-
- StringWriter stringWriter = new StringWriter();
-
- Transformer transformer = TransformerFactory.newInstance()
- .newTransformer();
- transformer.setOutputProperty( OutputKeys.INDENT, "yes" );
- transformer.transform(
- new DOMSource( wordToFoExtractor.getDocument() ),
- new StreamResult( stringWriter ) );
- // no exceptions
- }
-}
--- /dev/null
+/* ====================================================================
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+==================================================================== */
+package org.apache.poi.hwpf.extractor;
+
+import java.io.StringWriter;
+
+import javax.xml.parsers.DocumentBuilderFactory;
+import javax.xml.transform.OutputKeys;
+import javax.xml.transform.Transformer;
+import javax.xml.transform.TransformerFactory;
+import javax.xml.transform.dom.DOMSource;
+import javax.xml.transform.stream.StreamResult;
+
+import junit.framework.TestCase;
+import org.apache.poi.POIDataSamples;
+import org.apache.poi.hwpf.HWPFDocument;
+
+/**
+ * Test cases for {@link WordToHtmlExtractor}
+ *
+ * @author Sergey Vladimirov (vlsergey {at} gmail {dot} com)
+ */
+public class TestWordToHtmlExtractor extends TestCase
+{
+    /**
+     * Converts the named sample document with {@link WordToHtmlExtractor}
+     * and serializes the resulting DOM using the <tt>html</tt> output method.
+     *
+     * @param sampleFileName
+     *            name of the sample document to open
+     * @return serialized HTML text
+     * @throws Exception
+     *             if the document cannot be read, converted or serialized
+     */
+    private static String getHtmlText( final String sampleFileName )
+            throws Exception
+    {
+        final java.io.InputStream sampleStream = POIDataSamples
+                .getDocumentInstance().openResourceAsStream( sampleFileName );
+        final HWPFDocument hwpfDocument;
+        try
+        {
+            hwpfDocument = new HWPFDocument( sampleStream );
+        }
+        finally
+        {
+            // Release the sample stream whether or not parsing succeeded.
+            sampleStream.close();
+        }
+
+        WordToHtmlExtractor wordToHtmlExtractor = new WordToHtmlExtractor(
+                DocumentBuilderFactory.newInstance().newDocumentBuilder()
+                        .newDocument() );
+        wordToHtmlExtractor.processDocument( hwpfDocument );
+
+        StringWriter stringWriter = new StringWriter();
+
+        Transformer transformer = TransformerFactory.newInstance()
+                .newTransformer();
+        transformer.setOutputProperty( OutputKeys.INDENT, "yes" );
+        transformer.setOutputProperty( OutputKeys.ENCODING, "utf-8" );
+        transformer.setOutputProperty( OutputKeys.METHOD, "html" );
+        transformer.transform(
+                new DOMSource( wordToHtmlExtractor.getDocument() ),
+                new StreamResult( stringWriter ) );
+
+        return stringWriter.toString();
+    }
+
+    /** The long digit run of Bug46610_2.doc must appear in the output. */
+    public void testBug46610_2() throws Exception
+    {
+        String result = getHtmlText( "Bug46610_2.doc" );
+        assertTrue( result
+                .contains( "012345678911234567892123456789312345678941234567890123456789112345678921234567893123456789412345678" ) );
+    }
+
+    /** The embedded picture of equation.doc is rendered as a comment. */
+    public void testEquation() throws Exception
+    {
+        String result = getHtmlText( "equation.doc" );
+
+        assertTrue( result
+                .contains( "<!--Image link to '0.emf' can be here-->" ) );
+    }
+
+    /** A hyperlink field becomes an anchor with the link text preserved. */
+    public void testHyperlink() throws Exception
+    {
+        String result = getHtmlText( "hyperlink.doc" );
+
+        assertTrue( result.contains( "<a href=\"http://testuri.org/\">" ) );
+        assertTrue( result.contains( "Hyperlink text" ) );
+    }
+
+    /** A page reference becomes an in-document anchor link. */
+    public void testPageref() throws Exception
+    {
+        String result = getHtmlText( "pageref.doc" );
+
+        assertTrue( result.contains( "<a href=\"#userref\">" ) );
+        assertTrue( result.contains( "1" ) );
+    }
+}