From: Sergey Vladimirov Date: Mon, 4 Jul 2011 19:14:44 +0000 (+0000) Subject: rename extractor -> converter and move to converter package X-Git-Tag: REL_3_8_BETA4~338 X-Git-Url: https://source.dussan.org/?a=commitdiff_plain;h=03035ed09af320b45ac65796fd34a9202792c862;p=poi.git rename extractor -> converter and move to converter package git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1142767 13f79535-47bb-0310-9956-ffa450edef68 --- diff --git a/src/scratchpad/src/org/apache/poi/hwpf/converter/AbstractWordConverter.java b/src/scratchpad/src/org/apache/poi/hwpf/converter/AbstractWordConverter.java new file mode 100644 index 0000000000..e047d3f564 --- /dev/null +++ b/src/scratchpad/src/org/apache/poi/hwpf/converter/AbstractWordConverter.java @@ -0,0 +1,365 @@ +/* ==================================================================== + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+==================================================================== */ +package org.apache.poi.hwpf.converter; + +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.apache.poi.hwpf.HWPFDocument; +import org.apache.poi.hwpf.HWPFDocumentCore; +import org.apache.poi.hwpf.model.ListFormatOverride; +import org.apache.poi.hwpf.model.ListTables; +import org.apache.poi.hwpf.usermodel.CharacterRun; +import org.apache.poi.hwpf.usermodel.Paragraph; +import org.apache.poi.hwpf.usermodel.Picture; +import org.apache.poi.hwpf.usermodel.Range; +import org.apache.poi.hwpf.usermodel.Section; +import org.apache.poi.hwpf.usermodel.Table; +import org.apache.poi.hwpf.usermodel.TableIterator; +import org.apache.poi.util.POILogFactory; +import org.apache.poi.util.POILogger; +import org.w3c.dom.Document; +import org.w3c.dom.Element; + +public abstract class AbstractWordConverter +{ + private static final byte BEL_MARK = 7; + + private static final byte FIELD_BEGIN_MARK = 19; + + private static final byte FIELD_END_MARK = 21; + + private static final byte FIELD_SEPARATOR_MARK = 20; + + private static final POILogger logger = POILogFactory + .getLogger( AbstractWordConverter.class ); + + public abstract Document getDocument(); + + protected abstract void outputCharacters( Element block, + CharacterRun characterRun, String text ); + + protected boolean processCharacters( HWPFDocumentCore hwpfDocument, + int currentTableLevel, Paragraph paragraph, final Element block, + List characterRuns, final int start, final int end ) + { + boolean haveAnyText = false; + + for ( int c = start; c < end; c++ ) + { + CharacterRun characterRun = characterRuns.get( c ); + + if ( characterRun == null ) + throw new AssertionError(); + + if ( hwpfDocument instanceof HWPFDocument + && ( (HWPFDocument) hwpfDocument ).getPicturesTable() + .hasPicture( characterRun ) ) + { + HWPFDocument newFormat = (HWPFDocument) 
hwpfDocument; + Picture picture = newFormat.getPicturesTable().extractPicture( + characterRun, true ); + + processImage( block, characterRun.text().charAt( 0 ) == 0x01, + picture ); + continue; + } + + String text = characterRun.text(); + if ( text.getBytes().length == 0 ) + continue; + + if ( text.getBytes()[0] == FIELD_BEGIN_MARK ) + { + int skipTo = tryField( hwpfDocument, paragraph, + currentTableLevel, characterRuns, c, block ); + + if ( skipTo != c ) + { + c = skipTo; + continue; + } + + continue; + } + if ( text.getBytes()[0] == FIELD_SEPARATOR_MARK ) + { + // shall not appear without FIELD_BEGIN_MARK + continue; + } + if ( text.getBytes()[0] == FIELD_END_MARK ) + { + // shall not appear without FIELD_BEGIN_MARK + continue; + } + + if ( characterRun.isSpecialCharacter() || characterRun.isObj() + || characterRun.isOle2() ) + { + continue; + } + + if ( text.endsWith( "\r" ) + || ( text.charAt( text.length() - 1 ) == BEL_MARK && currentTableLevel != 0 ) ) + text = text.substring( 0, text.length() - 1 ); + + outputCharacters( block, characterRun, text ); + + haveAnyText |= text.trim().length() != 0; + } + + return haveAnyText; + } + + public void processDocument( HWPFDocumentCore wordDocument ) + { + final Range range = wordDocument.getRange(); + for ( int s = 0; s < range.numSections(); s++ ) + { + processSection( wordDocument, range.getSection( s ), s ); + } + } + + protected void processField( HWPFDocumentCore wordDocument, + Element currentBlock, Paragraph paragraph, int currentTableLevel, + List characterRuns, int beginMark, int separatorMark, + int endMark ) + { + + Pattern hyperlinkPattern = Pattern + .compile( "[ \\t\\r\\n]*HYPERLINK \"(.*)\"[ \\t\\r\\n]*" ); + Pattern pagerefPattern = Pattern + .compile( "[ \\t\\r\\n]*PAGEREF ([^ ]*)[ \\t\\r\\n]*\\\\h[ \\t\\r\\n]*" ); + + if ( separatorMark - beginMark > 1 ) + { + int index = beginMark + 1; + CharacterRun firstAfterBegin = null; + while ( index < separatorMark ) + { + firstAfterBegin = 
paragraph.getCharacterRun( index ); + if ( firstAfterBegin == null ) + { + logger.log( POILogger.WARN, + "Paragraph " + paragraph.getStartOffset() + "--" + + paragraph.getEndOffset() + + " contains null CharacterRun #" + index ); + index++; + continue; + } + break; + } + + if ( firstAfterBegin != null ) + { + final Matcher hyperlinkMatcher = hyperlinkPattern + .matcher( firstAfterBegin.text() ); + if ( hyperlinkMatcher.matches() ) + { + String hyperlink = hyperlinkMatcher.group( 1 ); + processHyperlink( wordDocument, currentBlock, paragraph, + characterRuns, currentTableLevel, hyperlink, + separatorMark + 1, endMark ); + return; + } + + final Matcher pagerefMatcher = pagerefPattern + .matcher( firstAfterBegin.text() ); + if ( pagerefMatcher.matches() ) + { + String pageref = pagerefMatcher.group( 1 ); + processPageref( wordDocument, currentBlock, paragraph, + characterRuns, currentTableLevel, pageref, + separatorMark + 1, endMark ); + return; + } + } + } + + StringBuilder debug = new StringBuilder( "Unsupported field type: \n" ); + for ( int i = beginMark; i <= endMark; i++ ) + { + debug.append( "\t" ); + debug.append( paragraph.getCharacterRun( i ) ); + debug.append( "\n" ); + } + logger.log( POILogger.WARN, debug ); + + // just output field value + if ( separatorMark + 1 < endMark ) + processCharacters( wordDocument, currentTableLevel, paragraph, + currentBlock, characterRuns, separatorMark + 1, endMark ); + + return; + } + + protected abstract void processHyperlink( HWPFDocumentCore wordDocument, + Element currentBlock, Paragraph paragraph, + List characterRuns, int currentTableLevel, + String hyperlink, int i, int endMark ); + + protected abstract void processImage( Element currentBlock, + boolean inlined, Picture picture ); + + protected abstract void processPageref( HWPFDocumentCore wordDocument, + Element currentBlock, Paragraph paragraph, + List characterRuns, int currentTableLevel, + String pageref, int beginTextInclusive, int endTextExclusive ); + + 
protected abstract void processParagraph( HWPFDocumentCore wordDocument, + Element parentFopElement, int currentTableLevel, + Paragraph paragraph, String bulletText ); + + protected abstract void processSection( HWPFDocumentCore wordDocument, + Section section, int s ); + + protected void processSectionParagraphes( HWPFDocumentCore wordDocument, + Element flow, Range range, int currentTableLevel ) + { + final Map allTables = new HashMap(); + for ( TableIterator tableIterator = AbstractWordUtils.newTableIterator( + range, currentTableLevel + 1 ); tableIterator.hasNext(); ) + { + Table next = tableIterator.next(); + allTables.put( Integer.valueOf( next.getStartOffset() ), next ); + } + + final ListTables listTables = wordDocument.getListTables(); + int currentListInfo = 0; + + final int paragraphs = range.numParagraphs(); + for ( int p = 0; p < paragraphs; p++ ) + { + Paragraph paragraph = range.getParagraph( p ); + + if ( allTables.containsKey( Integer.valueOf( paragraph + .getStartOffset() ) ) ) + { + Table table = allTables.get( Integer.valueOf( paragraph + .getStartOffset() ) ); + processTable( wordDocument, flow, table, currentTableLevel + 1 ); + continue; + } + + if ( paragraph.isInTable() + && paragraph.getTableLevel() != currentTableLevel ) + { + continue; + } + + if ( paragraph.getIlfo() != currentListInfo ) + { + currentListInfo = paragraph.getIlfo(); + } + + if ( currentListInfo != 0 ) + { + if ( listTables != null ) + { + final ListFormatOverride listFormatOverride = listTables + .getOverride( paragraph.getIlfo() ); + + String label = AbstractWordUtils.getBulletText( listTables, + paragraph, listFormatOverride.getLsid() ); + + processParagraph( wordDocument, flow, currentTableLevel, + paragraph, label ); + } + else + { + logger.log( POILogger.WARN, + "Paragraph #" + paragraph.getStartOffset() + "-" + + paragraph.getEndOffset() + + " has reference to list structure #" + + currentListInfo + + ", but listTables not defined in file" ); + + processParagraph( 
wordDocument, flow, currentTableLevel, + paragraph, AbstractWordUtils.EMPTY ); + } + } + else + { + processParagraph( wordDocument, flow, currentTableLevel, + paragraph, AbstractWordUtils.EMPTY ); + } + } + + } + + protected void processSingleSection( HWPFDocumentCore wordDocument, + Section section ) + { + processSection( wordDocument, section, 0 ); + } + + protected abstract void processTable( HWPFDocumentCore wordDocument, + Element flow, Table table, int newTableLevel ); + + protected int tryField( HWPFDocumentCore wordDocument, Paragraph paragraph, + int currentTableLevel, List characterRuns, + int beginMark, Element currentBlock ) + { + int separatorMark = -1; + int endMark = -1; + for ( int c = beginMark + 1; c < paragraph.numCharacterRuns(); c++ ) + { + CharacterRun characterRun = paragraph.getCharacterRun( c ); + + String text = characterRun.text(); + if ( text.getBytes().length == 0 ) + continue; + + if ( text.getBytes()[0] == FIELD_SEPARATOR_MARK ) + { + if ( separatorMark != -1 ) + { + // double; + return beginMark; + } + + separatorMark = c; + continue; + } + + if ( text.getBytes()[0] == FIELD_END_MARK ) + { + if ( endMark != -1 ) + { + // double; + return beginMark; + } + + endMark = c; + break; + } + + } + + if ( separatorMark == -1 || endMark == -1 ) + return beginMark; + + processField( wordDocument, currentBlock, paragraph, currentTableLevel, + characterRuns, beginMark, separatorMark, endMark ); + + return endMark; + } + +} diff --git a/src/scratchpad/src/org/apache/poi/hwpf/converter/AbstractWordUtils.java b/src/scratchpad/src/org/apache/poi/hwpf/converter/AbstractWordUtils.java new file mode 100644 index 0000000000..9bbbf73e3a --- /dev/null +++ b/src/scratchpad/src/org/apache/poi/hwpf/converter/AbstractWordUtils.java @@ -0,0 +1,404 @@ +/* ==================================================================== + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. 
See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +==================================================================== */ +package org.apache.poi.hwpf.converter; + +import java.io.Closeable; +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.lang.reflect.Constructor; +import java.lang.reflect.Field; +import java.lang.reflect.Method; +import java.util.ArrayList; +import java.util.List; + +import org.apache.poi.hwpf.HWPFDocument; +import org.apache.poi.hwpf.HWPFDocumentCore; +import org.apache.poi.hwpf.HWPFOldDocument; +import org.apache.poi.hwpf.OldWordFileFormatException; +import org.apache.poi.hwpf.model.CHPX; +import org.apache.poi.hwpf.model.ListLevel; +import org.apache.poi.hwpf.model.ListTables; +import org.apache.poi.hwpf.usermodel.BorderCode; +import org.apache.poi.hwpf.usermodel.CharacterRun; +import org.apache.poi.hwpf.usermodel.Paragraph; +import org.apache.poi.hwpf.usermodel.Range; +import org.apache.poi.hwpf.usermodel.Section; +import org.apache.poi.hwpf.usermodel.SectionProperties; +import org.apache.poi.hwpf.usermodel.TableIterator; +import org.apache.poi.poifs.filesystem.POIFSFileSystem; +import org.apache.poi.util.POILogFactory; +import org.apache.poi.util.POILogger; + +public class AbstractWordUtils +{ + static final String EMPTY = ""; + + private static final 
POILogger logger = POILogFactory + .getLogger( AbstractWordUtils.class ); + + public static final float TWIPS_PER_INCH = 1440.0f; + public static final int TWIPS_PER_PT = 20; + + static void closeQuietly( final Closeable closeable ) + { + try + { + closeable.close(); + } + catch ( Exception exc ) + { + logger.log( POILogger.ERROR, "Unable to close resource: " + exc, + exc ); + } + } + + static boolean equals( String str1, String str2 ) + { + return str1 == null ? str2 == null : str1.equals( str2 ); + } + + // XXX incorporate into Range + static List findCharacterRuns( Range range ) + { + final int min = range.getStartOffset(); + final int max = range.getEndOffset(); + + List result = new ArrayList(); + List chpxs = getCharacters( range ); + for ( int i = 0; i < chpxs.size(); i++ ) + { + CHPX chpx = chpxs.get( i ); + if ( chpx == null ) + continue; + + if ( Math.max( min, chpx.getStart() ) <= Math.min( max, + chpx.getEnd() ) ) + { + final CharacterRun characterRun = getCharacterRun( range, chpx ); + + if ( characterRun == null ) + continue; + + result.add( characterRun ); + } + } + + return result; + } + + public static String getBorderType( BorderCode borderCode ) + { + if ( borderCode == null ) + throw new IllegalArgumentException( "borderCode is null" ); + + switch ( borderCode.getBorderType() ) + { + case 1: + case 2: + return "solid"; + case 3: + return "double"; + case 5: + return "solid"; + case 6: + return "dotted"; + case 7: + case 8: + return "dashed"; + case 9: + return "dotted"; + case 10: + case 11: + case 12: + case 13: + case 14: + case 15: + case 16: + case 17: + case 18: + case 19: + return "double"; + case 20: + return "solid"; + case 21: + return "double"; + case 22: + return "dashed"; + case 23: + return "dashed"; + case 24: + return "ridge"; + case 25: + return "grooved"; + default: + return "solid"; + } + } + + public static String getBorderWidth( BorderCode borderCode ) + { + int lineWidth = borderCode.getLineWidth(); + int pt = lineWidth / 8; 
+ int pte = lineWidth - pt * 8; + + StringBuilder stringBuilder = new StringBuilder(); + stringBuilder.append( pt ); + stringBuilder.append( "." ); + stringBuilder.append( 1000 / 8 * pte ); + stringBuilder.append( "pt" ); + return stringBuilder.toString(); + } + + public static String getBulletText( ListTables listTables, + Paragraph paragraph, int listId ) + { + final ListLevel listLevel = listTables.getLevel( listId, + paragraph.getIlvl() ); + + if ( listLevel.getNumberText() == null ) + return EMPTY; + + StringBuffer bulletBuffer = new StringBuffer(); + char[] xst = listLevel.getNumberText().toCharArray(); + for ( char element : xst ) + { + if ( element < 9 ) + { + ListLevel numLevel = listTables.getLevel( listId, element ); + + int num = numLevel.getStartAt(); + bulletBuffer.append( NumberFormatter.getNumber( num, + listLevel.getNumberFormat() ) ); + + if ( numLevel == listLevel ) + { + numLevel.setStartAt( numLevel.getStartAt() + 1 ); + } + + } + else + { + bulletBuffer.append( element ); + } + } + + byte follow = getIxchFollow( listLevel ); + switch ( follow ) + { + case 0: + bulletBuffer.append( "\t" ); + break; + case 1: + bulletBuffer.append( " " ); + break; + default: + break; + } + + return bulletBuffer.toString(); + } + + private static CharacterRun getCharacterRun( Range range, CHPX chpx ) + { + try + { + Method method = Range.class.getDeclaredMethod( "getCharacterRun", + CHPX.class ); + method.setAccessible( true ); + return (CharacterRun) method.invoke( range, chpx ); + } + catch ( Exception exc ) + { + throw new Error( exc ); + } + } + + private static List getCharacters( Range range ) + { + try + { + Field field = Range.class.getDeclaredField( "_characters" ); + field.setAccessible( true ); + return (List) field.get( range ); + } + catch ( Exception exc ) + { + throw new Error( exc ); + } + } + + public static String getColor( int ico ) + { + switch ( ico ) + { + case 1: + return "black"; + case 2: + return "blue"; + case 3: + return "cyan"; + case 
4: + return "green"; + case 5: + return "magenta"; + case 6: + return "red"; + case 7: + return "yellow"; + case 8: + return "white"; + case 9: + return "darkblue"; + case 10: + return "darkcyan"; + case 11: + return "darkgreen"; + case 12: + return "darkmagenta"; + case 13: + return "darkred"; + case 14: + return "darkyellow"; + case 15: + return "darkgray"; + case 16: + return "lightgray"; + default: + return "black"; + } + } + + public static byte getIxchFollow( ListLevel listLevel ) + { + try + { + Field field = ListLevel.class.getDeclaredField( "_ixchFollow" ); + field.setAccessible( true ); + return ( (Byte) field.get( listLevel ) ).byteValue(); + } + catch ( Exception exc ) + { + throw new Error( exc ); + } + } + + public static String getJustification( int js ) + { + switch ( js ) + { + case 0: + return "start"; + case 1: + return "center"; + case 2: + return "end"; + case 3: + case 4: + return "justify"; + case 5: + return "center"; + case 6: + return "left"; + case 7: + return "start"; + case 8: + return "end"; + case 9: + return "justify"; + } + return ""; + } + + public static String getListItemNumberLabel( int number, int format ) + { + + if ( format != 0 ) + System.err.println( "NYI: toListItemNumberLabel(): " + format ); + + return String.valueOf( number ); + } + + public static SectionProperties getSectionProperties( Section section ) + { + try + { + Field field = Section.class.getDeclaredField( "_props" ); + field.setAccessible( true ); + return (SectionProperties) field.get( section ); + } + catch ( Exception exc ) + { + throw new Error( exc ); + } + } + + static boolean isEmpty( String str ) + { + return str == null || str.length() == 0; + } + + static boolean isNotEmpty( String str ) + { + return !isEmpty( str ); + } + + public static HWPFDocumentCore loadDoc( File docFile ) throws IOException + { + final FileInputStream istream = new FileInputStream( docFile ); + try + { + return loadDoc( istream ); + } + finally + { + closeQuietly( istream ); 
+ } + } + + public static HWPFDocumentCore loadDoc( InputStream inputStream ) + throws IOException + { + final POIFSFileSystem poifsFileSystem = HWPFDocumentCore + .verifyAndBuildPOIFS( inputStream ); + try + { + return new HWPFDocument( poifsFileSystem ); + } + catch ( OldWordFileFormatException exc ) + { + return new HWPFOldDocument( poifsFileSystem ); + } + } + + public static TableIterator newTableIterator( Range range, int level ) + { + try + { + Constructor constructor = TableIterator.class + .getDeclaredConstructor( Range.class, int.class ); + constructor.setAccessible( true ); + return constructor.newInstance( range, Integer.valueOf( level ) ); + } + catch ( Exception exc ) + { + throw new Error( exc ); + } + } + +} diff --git a/src/scratchpad/src/org/apache/poi/hwpf/converter/FoDocumentFacade.java b/src/scratchpad/src/org/apache/poi/hwpf/converter/FoDocumentFacade.java new file mode 100644 index 0000000000..8a944a9ee9 --- /dev/null +++ b/src/scratchpad/src/org/apache/poi/hwpf/converter/FoDocumentFacade.java @@ -0,0 +1,201 @@ +/* ==================================================================== + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
/* ==================================================================== */
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Text;

/**
 * Convenience wrapper around a DOM {@link Document} for building XSL-FO
 * trees. The constructor appends <tt>fo:root</tt> with an empty
 * <tt>fo:layout-master-set</tt> to the document; <tt>add*</tt> methods create
 * an element and attach it to a parent, <tt>create*</tt> methods return a
 * detached node.
 */
public class FoDocumentFacade
{
    private static final String NS_XSLFO = "http://www.w3.org/1999/XSL/Format";

    protected final Document document;
    protected final Element layoutMasterSet;
    protected final Element root;

    public FoDocumentFacade( Document document )
    {
        this.document = document;

        this.root = fo( "fo:root" );
        document.appendChild( this.root );

        this.layoutMasterSet = fo( "fo:layout-master-set" );
        this.root.appendChild( this.layoutMasterSet );
    }

    /** Creates a detached element in the XSL-FO namespace. */
    private Element fo( String qualifiedName )
    {
        return document.createElementNS( NS_XSLFO, qualifiedName );
    }

    /** Creates an element, sets one attribute on it and returns it. */
    private Element fo( String qualifiedName, String attribute, String value )
    {
        Element element = fo( qualifiedName );
        element.setAttribute( attribute, value );
        return element;
    }

    /** Appends <tt>child</tt> to <tt>parent</tt> and returns the child. */
    private static Element attach( Element parent, Element child )
    {
        parent.appendChild( child );
        return child;
    }

    /** Appends a <tt>fo:flow</tt> with the given flow name. */
    public Element addFlowToPageSequence( final Element pageSequence,
            String flowName )
    {
        return attach( pageSequence, fo( "fo:flow", "flow-name", flowName ) );
    }

    /** Appends a new <tt>fo:list-item</tt> to the list block. */
    public Element addListItem( Element listBlock )
    {
        return attach( listBlock, createListItem() );
    }

    /** Appends a new <tt>fo:list-item-body</tt> to the list item. */
    public Element addListItemBody( Element listItem )
    {
        return attach( listItem, createListItemBody() );
    }

    /** Appends a new <tt>fo:list-item-label</tt> holding <tt>text</tt>. */
    public Element addListItemLabel( Element listItem, String text )
    {
        return attach( listItem, createListItemLabel( text ) );
    }

    /** Appends a <tt>fo:page-sequence</tt> to the root element. */
    public Element addPageSequence( String pageMaster )
    {
        return attach( root,
                fo( "fo:page-sequence", "master-reference", pageMaster ) );
    }

    /** Appends a <tt>fo:region-body</tt> to the page master. */
    public Element addRegionBody( Element pageMaster )
    {
        return attach( pageMaster, fo( "fo:region-body" ) );
    }

    /** Appends a <tt>fo:simple-page-master</tt> to the layout master set. */
    public Element addSimplePageMaster( String masterName )
    {
        return attach( layoutMasterSet,
                fo( "fo:simple-page-master", "master-name", masterName ) );
    }

    protected Element createBasicLinkExternal( String externalDestination )
    {
        return fo( "fo:basic-link", "external-destination",
                externalDestination );
    }

    public Element createBasicLinkInternal( String internalDestination )
    {
        return fo( "fo:basic-link", "internal-destination",
                internalDestination );
    }

    public Element createBlock()
    {
        return fo( "fo:block" );
    }

    /** Creates a <tt>fo:external-graphic</tt> with <tt>src="url('…')"</tt>. */
    public Element createExternalGraphic( String source )
    {
        return fo( "fo:external-graphic", "src", "url('" + source + "')" );
    }

    public Element createInline()
    {
        return fo( "fo:inline" );
    }

    public Element createLeader()
    {
        return fo( "fo:leader" );
    }

    public Element createListBlock()
    {
        return fo( "fo:list-block" );
    }

    public Element createListItem()
    {
        return fo( "fo:list-item" );
    }

    public Element createListItemBody()
    {
        return fo( "fo:list-item-body" );
    }

    /**
     * Creates a <tt>fo:list-item-label</tt> whose only child is a
     * <tt>fo:block</tt> containing <tt>text</tt>.
     */
    public Element createListItemLabel( String text )
    {
        Element labelBlock = createBlock();
        labelBlock.appendChild( document.createTextNode( text ) );

        return attach( fo( "fo:list-item-label" ), labelBlock )
                .getParentNode() instanceof Element ? (Element) labelBlock
                .getParentNode() : null;
    }

    protected Element createTable()
    {
        return fo( "fo:table" );
    }

    protected Element createTableBody()
    {
        return fo( "fo:table-body" );
    }

    protected Element createTableCell()
    {
        return fo( "fo:table-cell" );
    }

    protected Element createTableHeader()
    {
        return fo( "fo:table-header" );
    }

    protected Element createTableRow()
    {
        return fo( "fo:table-row" );
    }

    protected Text createText( String data )
    {
        return document.createTextNode( data );
    }

    public Document getDocument()
    {
        return document;
    }

}
/* ==================================================================== */
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Text;

/**
 * Convenience wrapper around a DOM {@link Document} for building HTML trees.
 * The constructor appends <tt>html</tt> with <tt>head</tt> and <tt>body</tt>
 * children (in that order) to the document; every <tt>create*</tt> method
 * returns a node that is not yet attached to the tree.
 */
public class HtmlDocumentFacade
{

    protected final Element body;
    protected final Document document;
    protected final Element head;
    protected final Element html;

    public HtmlDocumentFacade( Document document )
    {
        this.document = document;

        this.html = element( "html" );
        this.head = element( "head" );
        this.body = element( "body" );

        document.appendChild( this.html );
        this.html.appendChild( this.head );
        this.html.appendChild( this.body );
    }

    /** Creates a detached element with the given tag name. */
    private Element element( String tagName )
    {
        return document.createElement( tagName );
    }

    /** Creates an <tt>a</tt> element whose <tt>href</tt> is the argument. */
    public Element createHyperlink( String internalDestination )
    {
        final Element anchor = element( "a" );
        anchor.setAttribute( "href", internalDestination );
        return anchor;
    }

    public Element createListItem()
    {
        return element( "li" );
    }

    public Element createParagraph()
    {
        return element( "p" );
    }

    public Element createTable()
    {
        return element( "table" );
    }

    public Element createTableBody()
    {
        return element( "tbody" );
    }

    public Element createTableCell()
    {
        return element( "td" );
    }

    public Element createTableHeader()
    {
        return element( "thead" );
    }

    public Element createTableHeaderCell()
    {
        return element( "th" );
    }

    public Element createTableRow()
    {
        return element( "tr" );
    }

    public Text createText( String data )
    {
        return document.createTextNode( data );
    }

    public Element createUnorderedList()
    {
        return element( "ul" );
    }

    public Document getDocument()
    {
        return document;
    }

}
file mode 100644 index 0000000000..9897cfb0c1 --- /dev/null +++ b/src/scratchpad/src/org/apache/poi/hwpf/converter/NumberFormatter.java @@ -0,0 +1,67 @@ +/* + * ==================================================================== + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
/* ==================================================================== */
/**
 * Formats list item numbers in the numbering styles used by list levels:
 * arabic digits, lower/upper case roman numerals and lower/upper case latin
 * letters.
 *
 * @author Ryan Ackley
 */
public final class NumberFormatter
{

    /** Number of letters available for the letter styles (a..z). */
    private static final int LETTER_COUNT = 26;

    /** Largest number representable with the roman symbols below. */
    private static final int MAX_ROMAN = 3999;

    /** Roman symbols in descending order of value, subtractive pairs included. */
    private static final String[] ROMAN_SYMBOLS = { "m", "cm", "d", "cd", "c",
            "xc", "l", "xl", "x", "ix", "v", "iv", "i" };

    /** Values matching {@link #ROMAN_SYMBOLS} position by position. */
    private static final int[] ROMAN_VALUES = { 1000, 900, 500, 400, 100, 90,
            50, 40, 10, 9, 5, 4, 1 };

    private final static int T_ARABIC = 0;
    private final static int T_LOWER_LETTER = 4;
    private final static int T_LOWER_ROMAN = 2;
    private final static int T_ORDINAL = 5;
    private final static int T_UPPER_LETTER = 3;
    private final static int T_UPPER_ROMAN = 1;

    /**
     * Formats <tt>num</tt> in the requested numbering style.
     *
     * @param num
     *            number to format, counted from 1
     * @param style
     *            0 = arabic, 1 = upper roman, 2 = lower roman, 3 = upper
     *            letter, 4 = lower letter, 5 = ordinal; ordinal and unknown
     *            styles are formatted as arabic digits
     * @return the formatted number; a number that the requested style cannot
     *         express (less than 1, above 26 for letters, above 3999 for
     *         roman numerals) is returned as arabic digits instead of failing
     */
    public static String getNumber( int num, int style )
    {
        switch ( style )
        {
        case T_UPPER_ROMAN:
            // Locale.ROOT: the default locale may map "i" to a dotted capital
            return toRoman( num ).toUpperCase( java.util.Locale.ROOT );
        case T_LOWER_ROMAN:
            return toRoman( num );
        case T_UPPER_LETTER:
            return toLetter( num ).toUpperCase( java.util.Locale.ROOT );
        case T_LOWER_LETTER:
            return toLetter( num );
        case T_ARABIC:
        case T_ORDINAL:
        default:
            return String.valueOf( num );
        }
    }

    /** Lower case letter number <tt>num</tt> (1 = "a"), or digits if out of range. */
    private static String toLetter( int num )
    {
        if ( num < 1 || num > LETTER_COUNT )
            return String.valueOf( num );

        return String.valueOf( (char) ( 'a' + num - 1 ) );
    }

    /** Lower case roman numeral for <tt>num</tt>, or digits if out of range. */
    private static String toRoman( int num )
    {
        if ( num < 1 || num > MAX_ROMAN )
            return String.valueOf( num );

        StringBuilder roman = new StringBuilder();
        int rest = num;
        for ( int i = 0; i < ROMAN_VALUES.length; i++ )
        {
            while ( rest >= ROMAN_VALUES[i] )
            {
                roman.append( ROMAN_SYMBOLS[i] );
                rest -= ROMAN_VALUES[i];
            }
        }
        return roman.toString();
    }
}
contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +==================================================================== */ +package org.apache.poi.hwpf.converter; + +import java.io.File; +import java.io.FileWriter; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Stack; + +import javax.xml.parsers.DocumentBuilderFactory; +import javax.xml.transform.OutputKeys; +import javax.xml.transform.Transformer; +import javax.xml.transform.TransformerFactory; +import javax.xml.transform.dom.DOMSource; +import javax.xml.transform.stream.StreamResult; + +import org.apache.poi.hwpf.HWPFDocument; +import org.apache.poi.hwpf.HWPFDocumentCore; +import org.apache.poi.hwpf.model.ListFormatOverride; +import org.apache.poi.hwpf.model.ListTables; +import org.apache.poi.hwpf.usermodel.BorderCode; +import org.apache.poi.hwpf.usermodel.CharacterRun; +import org.apache.poi.hwpf.usermodel.Paragraph; +import org.apache.poi.hwpf.usermodel.Picture; +import org.apache.poi.hwpf.usermodel.Range; +import org.apache.poi.hwpf.usermodel.Section; +import org.apache.poi.hwpf.usermodel.SectionProperties; +import org.apache.poi.hwpf.usermodel.Table; +import org.apache.poi.hwpf.usermodel.TableCell; +import org.apache.poi.hwpf.usermodel.TableIterator; +import org.apache.poi.hwpf.usermodel.TableRow; +import 
org.apache.poi.util.POILogFactory; +import org.apache.poi.util.POILogger; +import org.w3c.dom.Document; +import org.w3c.dom.Element; +import org.w3c.dom.Text; + +/** + * @author Sergey Vladimirov (vlsergey {at} gmail {dot} com) + */ +public class WordToFoConverter extends AbstractWordConverter +{ + + /** + * Holds properties values, applied to current fo:block element. + * Those properties shall not be doubled in children fo:inline + * elements. + */ + private static class BlockProperies + { + final boolean pBold; + final String pFontName; + final int pFontSize; + final boolean pItalic; + + public BlockProperies( String pFontName, int pFontSize, boolean pBold, + boolean pItalic ) + { + this.pFontName = pFontName; + this.pFontSize = pFontSize; + this.pBold = pBold; + this.pItalic = pItalic; + } + } + + private static final POILogger logger = POILogFactory + .getLogger( WordToFoConverter.class ); + + public static String getBorderType( BorderCode borderCode ) + { + if ( borderCode == null ) + throw new IllegalArgumentException( "borderCode is null" ); + + switch ( borderCode.getBorderType() ) + { + case 1: + case 2: + return "solid"; + case 3: + return "double"; + case 5: + return "solid"; + case 6: + return "dotted"; + case 7: + case 8: + return "dashed"; + case 9: + return "dotted"; + case 10: + case 11: + case 12: + case 13: + case 14: + case 15: + case 16: + case 17: + case 18: + case 19: + return "double"; + case 20: + return "solid"; + case 21: + return "double"; + case 22: + return "dashed"; + case 23: + return "dashed"; + case 24: + return "ridge"; + case 25: + return "grooved"; + default: + return "solid"; + } + } + + /** + * Java main() interface to interact with {@link WordToFoConverter} + * + *

+ * Usage: WordToFoConverter infile outfile + *

+ * Where infile is an input .doc file ( Word 97-2007) which will be rendered + * as XSL-FO into outfile + */ + public static void main( String[] args ) + { + if ( args.length < 2 ) + { + System.err + .println( "Usage: WordToFoConverter " ); + return; + } + + System.out.println( "Converting " + args[0] ); + System.out.println( "Saving output to " + args[1] ); + try + { + Document doc = WordToFoConverter.process( new File( args[0] ) ); + + FileWriter out = new FileWriter( args[1] ); + DOMSource domSource = new DOMSource( doc ); + StreamResult streamResult = new StreamResult( out ); + TransformerFactory tf = TransformerFactory.newInstance(); + Transformer serializer = tf.newTransformer(); + // TODO set encoding from a command argument + serializer.setOutputProperty( OutputKeys.ENCODING, "UTF-8" ); + serializer.setOutputProperty( OutputKeys.INDENT, "yes" ); + serializer.transform( domSource, streamResult ); + out.close(); + } + catch ( Exception e ) + { + e.printStackTrace(); + } + } + + static Document process( File docFile ) throws Exception + { + final HWPFDocumentCore hwpfDocument = WordToFoUtils.loadDoc( docFile ); + WordToFoConverter wordToFoConverter = new WordToFoConverter( + DocumentBuilderFactory.newInstance().newDocumentBuilder() + .newDocument() ); + wordToFoConverter.processDocument( hwpfDocument ); + return wordToFoConverter.getDocument(); + } + + private final Stack blocksProperies = new Stack(); + + protected final FoDocumentFacade foDocumentFacade; + + /** + * Creates new instance of {@link WordToFoConverter}. Can be used for output + * several {@link HWPFDocument}s into single FO document. + * + * @param document + * XML DOM Document used as XSL FO document. 
Shall support + * namespaces + */ + public WordToFoConverter( Document document ) + { + this.foDocumentFacade = new FoDocumentFacade( document ); + } + + protected String createPageMaster( SectionProperties sep, String type, + int section ) + { + float height = sep.getYaPage() / WordToFoUtils.TWIPS_PER_INCH; + float width = sep.getXaPage() / WordToFoUtils.TWIPS_PER_INCH; + float leftMargin = sep.getDxaLeft() / WordToFoUtils.TWIPS_PER_INCH; + float rightMargin = sep.getDxaRight() / WordToFoUtils.TWIPS_PER_INCH; + float topMargin = sep.getDyaTop() / WordToFoUtils.TWIPS_PER_INCH; + float bottomMargin = sep.getDyaBottom() / WordToFoUtils.TWIPS_PER_INCH; + + // add these to the header + String pageMasterName = type + "-page" + section; + + Element pageMaster = foDocumentFacade + .addSimplePageMaster( pageMasterName ); + pageMaster.setAttribute( "page-height", height + "in" ); + pageMaster.setAttribute( "page-width", width + "in" ); + + Element regionBody = foDocumentFacade.addRegionBody( pageMaster ); + regionBody.setAttribute( "margin", topMargin + "in " + rightMargin + + "in " + bottomMargin + "in " + leftMargin + "in" ); + + /* + * 6.4.14 fo:region-body + * + * The values of the padding and border-width traits must be "0". 
+ */ + // WordToFoUtils.setBorder(regionBody, sep.getBrcTop(), "top"); + // WordToFoUtils.setBorder(regionBody, sep.getBrcBottom(), "bottom"); + // WordToFoUtils.setBorder(regionBody, sep.getBrcLeft(), "left"); + // WordToFoUtils.setBorder(regionBody, sep.getBrcRight(), "right"); + + if ( sep.getCcolM1() > 0 ) + { + regionBody.setAttribute( "column-count", "" + + ( sep.getCcolM1() + 1 ) ); + if ( sep.getFEvenlySpaced() ) + { + regionBody.setAttribute( "column-gap", + ( sep.getDxaColumns() / WordToFoUtils.TWIPS_PER_INCH ) + + "in" ); + } + else + { + regionBody.setAttribute( "column-gap", "0.25in" ); + } + } + + return pageMasterName; + } + + public Document getDocument() + { + return foDocumentFacade.getDocument(); + } + + @Override + protected void outputCharacters( Element block, CharacterRun characterRun, + String text ) + { + BlockProperies blockProperies = this.blocksProperies.peek(); + Element inline = foDocumentFacade.createInline(); + if ( characterRun.isBold() != blockProperies.pBold ) + { + WordToFoUtils.setBold( inline, characterRun.isBold() ); + } + if ( characterRun.isItalic() != blockProperies.pItalic ) + { + WordToFoUtils.setItalic( inline, characterRun.isItalic() ); + } + if ( characterRun.getFontName() != null + && !AbstractWordUtils.equals( characterRun.getFontName(), + blockProperies.pFontName ) ) + { + WordToFoUtils.setFontFamily( inline, characterRun.getFontName() ); + } + if ( characterRun.getFontSize() / 2 != blockProperies.pFontSize ) + { + WordToFoUtils.setFontSize( inline, characterRun.getFontSize() / 2 ); + } + WordToFoUtils.setCharactersProperties( characterRun, inline ); + block.appendChild( inline ); + + Text textNode = foDocumentFacade.createText( text ); + inline.appendChild( textNode ); + } + + protected void processHyperlink( HWPFDocumentCore hwpfDocument, + Element currentBlock, Paragraph paragraph, + List characterRuns, int currentTableLevel, + String hyperlink, int beginTextInclusive, int endTextExclusive ) + { + Element 
basicLink = foDocumentFacade + .createBasicLinkExternal( hyperlink ); + currentBlock.appendChild( basicLink ); + + if ( beginTextInclusive < endTextExclusive ) + processCharacters( hwpfDocument, currentTableLevel, paragraph, + basicLink, characterRuns, beginTextInclusive, + endTextExclusive ); + } + + /** + * This method shall store image bytes in external file and convert it if + * necessary. Images shall be stored using PNG format (for bitmap) or SVG + * (for vector). Other formats may be not supported by your XSL FO + * processor. + *

+ * Please note the + * {@link WordToFoUtils#setPictureProperties(Picture, Element)} method. + * + * @param currentBlock + * currently processed FO element, like fo:block. Shall + * be used as parent of newly created + * fo:external-graphic or + * fo:instream-foreign-object + * @param inlined + * if image is inlined + * @param picture + * HWPF object, contained picture data and properties + */ + protected void processImage( Element currentBlock, boolean inlined, + Picture picture ) + { + // no default implementation -- skip + currentBlock.appendChild( foDocumentFacade.getDocument().createComment( + "Image link to '" + picture.suggestFullFileName() + + "' can be here" ) ); + } + + protected void processPageref( HWPFDocumentCore hwpfDocument, + Element currentBlock, Paragraph paragraph, + List characterRuns, int currentTableLevel, + String pageref, int beginTextInclusive, int endTextExclusive ) + { + Element basicLink = foDocumentFacade.createBasicLinkInternal( pageref ); + currentBlock.appendChild( basicLink ); + + if ( beginTextInclusive < endTextExclusive ) + processCharacters( hwpfDocument, currentTableLevel, paragraph, + basicLink, characterRuns, beginTextInclusive, + endTextExclusive ); + } + + protected void processParagraph( HWPFDocumentCore hwpfDocument, + Element parentFopElement, int currentTableLevel, + Paragraph paragraph, String bulletText ) + { + final Element block = foDocumentFacade.createBlock(); + parentFopElement.appendChild( block ); + + WordToFoUtils.setParagraphProperties( paragraph, block ); + + final int charRuns = paragraph.numCharacterRuns(); + + if ( charRuns == 0 ) + { + return; + } + + { + final String pFontName; + final int pFontSize; + final boolean pBold; + final boolean pItalic; + { + CharacterRun characterRun = paragraph.getCharacterRun( 0 ); + pFontSize = characterRun.getFontSize() / 2; + pFontName = characterRun.getFontName(); + pBold = characterRun.isBold(); + pItalic = characterRun.isItalic(); + } + WordToFoUtils.setFontFamily( 
block, pFontName ); + WordToFoUtils.setFontSize( block, pFontSize ); + WordToFoUtils.setBold( block, pBold ); + WordToFoUtils.setItalic( block, pItalic ); + + blocksProperies.push( new BlockProperies( pFontName, pFontSize, + pBold, pItalic ) ); + } + try + { + boolean haveAnyText = false; + + if ( WordToFoUtils.isNotEmpty( bulletText ) ) + { + Element inline = foDocumentFacade.createInline(); + block.appendChild( inline ); + + Text textNode = foDocumentFacade.createText( bulletText ); + inline.appendChild( textNode ); + + haveAnyText |= bulletText.trim().length() != 0; + } + + List characterRuns = WordToFoUtils + .findCharacterRuns( paragraph ); + haveAnyText = processCharacters( hwpfDocument, currentTableLevel, + paragraph, block, characterRuns, 0, characterRuns.size() ); + + if ( !haveAnyText ) + { + Element leader = foDocumentFacade.createLeader(); + block.appendChild( leader ); + } + } + finally + { + blocksProperies.pop(); + } + + return; + } + + protected void processSection( HWPFDocumentCore wordDocument, + Section section, int sectionCounter ) + { + String regularPage = createPageMaster( + WordToFoUtils.getSectionProperties( section ), "page", + sectionCounter ); + + Element pageSequence = foDocumentFacade.addPageSequence( regularPage ); + Element flow = foDocumentFacade.addFlowToPageSequence( pageSequence, + "xsl-region-body" ); + + processSectionParagraphes( wordDocument, flow, section, 0 ); + } + + protected void processSectionParagraphes( HWPFDocument wordDocument, + Element flow, Range range, int currentTableLevel ) + { + final Map allTables = new HashMap(); + for ( TableIterator tableIterator = WordToFoUtils.newTableIterator( + range, currentTableLevel + 1 ); tableIterator.hasNext(); ) + { + Table next = tableIterator.next(); + allTables.put( Integer.valueOf( next.getStartOffset() ), next ); + } + + final ListTables listTables = wordDocument.getListTables(); + int currentListInfo = 0; + + final int paragraphs = range.numParagraphs(); + for ( int p = 
0; p < paragraphs; p++ ) + { + Paragraph paragraph = range.getParagraph( p ); + + if ( allTables.containsKey( Integer.valueOf( paragraph + .getStartOffset() ) ) ) + { + Table table = allTables.get( Integer.valueOf( paragraph + .getStartOffset() ) ); + processTable( wordDocument, flow, table, currentTableLevel + 1 ); + continue; + } + + if ( paragraph.isInTable() + && paragraph.getTableLevel() != currentTableLevel ) + { + continue; + } + + if ( paragraph.getIlfo() != currentListInfo ) + { + currentListInfo = paragraph.getIlfo(); + } + + if ( currentListInfo != 0 ) + { + if ( listTables != null ) + { + final ListFormatOverride listFormatOverride = listTables + .getOverride( paragraph.getIlfo() ); + + String label = WordToFoUtils.getBulletText( listTables, + paragraph, listFormatOverride.getLsid() ); + + processParagraph( wordDocument, flow, currentTableLevel, + paragraph, label ); + } + else + { + logger.log( POILogger.WARN, + "Paragraph #" + paragraph.getStartOffset() + "-" + + paragraph.getEndOffset() + + " has reference to list structure #" + + currentListInfo + + ", but listTables not defined in file" ); + + processParagraph( wordDocument, flow, currentTableLevel, + paragraph, WordToFoUtils.EMPTY ); + } + } + else + { + processParagraph( wordDocument, flow, currentTableLevel, + paragraph, WordToFoUtils.EMPTY ); + } + } + + } + + protected void processTable( HWPFDocumentCore wordDocument, Element flow, + Table table, int thisTableLevel ) + { + Element tableHeader = foDocumentFacade.createTableHeader(); + Element tableBody = foDocumentFacade.createTableBody(); + + final int tableRows = table.numRows(); + + int maxColumns = Integer.MIN_VALUE; + for ( int r = 0; r < tableRows; r++ ) + { + maxColumns = Math.max( maxColumns, table.getRow( r ).numCells() ); + } + + for ( int r = 0; r < tableRows; r++ ) + { + TableRow tableRow = table.getRow( r ); + + Element tableRowElement = foDocumentFacade.createTableRow(); + WordToFoUtils.setTableRowProperties( tableRow, 
tableRowElement ); + + final int rowCells = tableRow.numCells(); + for ( int c = 0; c < rowCells; c++ ) + { + TableCell tableCell = tableRow.getCell( c ); + + if ( tableCell.isMerged() && !tableCell.isFirstMerged() ) + continue; + + if ( tableCell.isVerticallyMerged() + && !tableCell.isFirstVerticallyMerged() ) + continue; + + Element tableCellElement = foDocumentFacade.createTableCell(); + WordToFoUtils.setTableCellProperties( tableRow, tableCell, + tableCellElement, r == 0, r == tableRows - 1, c == 0, + c == rowCells - 1 ); + + if ( tableCell.isFirstMerged() ) + { + int count = 0; + for ( int c1 = c; c1 < rowCells; c1++ ) + { + TableCell nextCell = tableRow.getCell( c1 ); + if ( nextCell.isMerged() ) + count++; + if ( !nextCell.isMerged() ) + break; + } + tableCellElement.setAttribute( "number-columns-spanned", "" + + count ); + } + else + { + if ( c == rowCells - 1 && c != maxColumns - 1 ) + { + tableCellElement.setAttribute( + "number-columns-spanned", "" + + ( maxColumns - c ) ); + } + } + + if ( tableCell.isFirstVerticallyMerged() ) + { + int count = 0; + for ( int r1 = r; r1 < tableRows; r1++ ) + { + TableRow nextRow = table.getRow( r1 ); + if ( nextRow.numCells() < c ) + break; + TableCell nextCell = nextRow.getCell( c ); + if ( nextCell.isVerticallyMerged() ) + count++; + if ( !nextCell.isVerticallyMerged() ) + break; + } + tableCellElement.setAttribute( "number-rows-spanned", "" + + count ); + } + + processSectionParagraphes( wordDocument, tableCellElement, + tableCell, thisTableLevel ); + + if ( !tableCellElement.hasChildNodes() ) + { + tableCellElement.appendChild( foDocumentFacade + .createBlock() ); + } + + tableRowElement.appendChild( tableCellElement ); + } + + if ( tableRow.isTableHeader() ) + { + tableHeader.appendChild( tableRowElement ); + } + else + { + tableBody.appendChild( tableRowElement ); + } + } + + final Element tableElement = foDocumentFacade.createTable(); + if ( tableHeader.hasChildNodes() ) + { + tableElement.appendChild( 
tableHeader ); + } + if ( tableBody.hasChildNodes() ) + { + tableElement.appendChild( tableBody ); + flow.appendChild( tableElement ); + } + else + { + logger.log( + POILogger.WARN, + "Table without body starting on offset " + + table.getStartOffset() + " -- " + + table.getEndOffset() ); + } + } + +} diff --git a/src/scratchpad/src/org/apache/poi/hwpf/converter/WordToFoUtils.java b/src/scratchpad/src/org/apache/poi/hwpf/converter/WordToFoUtils.java new file mode 100644 index 0000000000..65953708eb --- /dev/null +++ b/src/scratchpad/src/org/apache/poi/hwpf/converter/WordToFoUtils.java @@ -0,0 +1,323 @@ +/* ==================================================================== + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+==================================================================== */ +package org.apache.poi.hwpf.converter; + +import org.apache.poi.hwpf.usermodel.BorderCode; +import org.apache.poi.hwpf.usermodel.CharacterProperties; +import org.apache.poi.hwpf.usermodel.CharacterRun; +import org.apache.poi.hwpf.usermodel.Paragraph; +import org.apache.poi.hwpf.usermodel.Picture; +import org.apache.poi.hwpf.usermodel.TableCell; +import org.apache.poi.hwpf.usermodel.TableRow; +import org.w3c.dom.Element; + +public class WordToFoUtils extends AbstractWordUtils +{ + public static void setBold( final Element element, final boolean bold ) + { + element.setAttribute( "font-weight", bold ? "bold" : "normal" ); + } + + public static void setBorder( Element element, BorderCode borderCode, + String where ) + { + if ( element == null ) + throw new IllegalArgumentException( "element is null" ); + + if ( borderCode == null || borderCode.getBorderType() == 0 ) + return; + + if ( isEmpty( where ) ) + { + element.setAttribute( "border-style", getBorderType( borderCode ) ); + element.setAttribute( "border-color", + getColor( borderCode.getColor() ) ); + element.setAttribute( "border-width", getBorderWidth( borderCode ) ); + } + else + { + element.setAttribute( "border-" + where + "-style", + getBorderType( borderCode ) ); + element.setAttribute( "border-" + where + "-color", + getColor( borderCode.getColor() ) ); + element.setAttribute( "border-" + where + "-width", + getBorderWidth( borderCode ) ); + } + } + + public static void setCharactersProperties( + final CharacterRun characterRun, final Element inline ) + { + final CharacterProperties clonedProperties = characterRun + .cloneProperties(); + StringBuilder textDecorations = new StringBuilder(); + + setBorder( inline, clonedProperties.getBrc(), EMPTY ); + + if ( characterRun.isCapitalized() ) + { + inline.setAttribute( "text-transform", "uppercase" ); + } + if ( characterRun.isHighlighted() ) + { + inline.setAttribute( "background-color", 
+ getColor( clonedProperties.getIcoHighlight() ) ); + } + if ( characterRun.isStrikeThrough() ) + { + if ( textDecorations.length() > 0 ) + textDecorations.append( " " ); + textDecorations.append( "line-through" ); + } + if ( characterRun.isShadowed() ) + { + inline.setAttribute( "text-shadow", characterRun.getFontSize() / 24 + + "pt" ); + } + if ( characterRun.isSmallCaps() ) + { + inline.setAttribute( "font-variant", "small-caps" ); + } + if ( characterRun.getSubSuperScriptIndex() == 1 ) + { + inline.setAttribute( "baseline-shift", "super" ); + inline.setAttribute( "font-size", "smaller" ); + } + if ( characterRun.getSubSuperScriptIndex() == 2 ) + { + inline.setAttribute( "baseline-shift", "sub" ); + inline.setAttribute( "font-size", "smaller" ); + } + if ( characterRun.getUnderlineCode() > 0 ) + { + if ( textDecorations.length() > 0 ) + textDecorations.append( " " ); + textDecorations.append( "underline" ); + } + if ( characterRun.isVanished() ) + { + inline.setAttribute( "visibility", "hidden" ); + } + if ( textDecorations.length() > 0 ) + { + inline.setAttribute( "text-decoration", textDecorations.toString() ); + } + } + + public static void setFontFamily( final Element element, + final String fontFamily ) + { + if ( isEmpty( fontFamily ) ) + return; + + element.setAttribute( "font-family", fontFamily ); + } + + public static void setFontSize( final Element element, final int fontSize ) + { + element.setAttribute( "font-size", String.valueOf( fontSize ) ); + } + + public static void setIndent( Paragraph paragraph, Element block ) + { + if ( paragraph.getFirstLineIndent() != 0 ) + { + block.setAttribute( + "text-indent", + String.valueOf( paragraph.getFirstLineIndent() + / TWIPS_PER_PT ) + + "pt" ); + } + if ( paragraph.getIndentFromLeft() != 0 ) + { + block.setAttribute( + "start-indent", + String.valueOf( paragraph.getIndentFromLeft() + / TWIPS_PER_PT ) + + "pt" ); + } + if ( paragraph.getIndentFromRight() != 0 ) + { + block.setAttribute( + "end-indent", + 
String.valueOf( paragraph.getIndentFromRight() + / TWIPS_PER_PT ) + + "pt" ); + } + if ( paragraph.getSpacingBefore() != 0 ) + { + block.setAttribute( + "space-before", + String.valueOf( paragraph.getSpacingBefore() / TWIPS_PER_PT ) + + "pt" ); + } + if ( paragraph.getSpacingAfter() != 0 ) + { + block.setAttribute( "space-after", + String.valueOf( paragraph.getSpacingAfter() / TWIPS_PER_PT ) + + "pt" ); + } + } + + public static void setItalic( final Element element, final boolean italic ) + { + element.setAttribute( "font-style", italic ? "italic" : "normal" ); + } + + public static void setJustification( Paragraph paragraph, + final Element element ) + { + String justification = getJustification( paragraph.getJustification() ); + if ( isNotEmpty( justification ) ) + element.setAttribute( "text-align", justification ); + } + + public static void setParagraphProperties( Paragraph paragraph, + Element block ) + { + setIndent( paragraph, block ); + setJustification( paragraph, block ); + + setBorder( block, paragraph.getBottomBorder(), "bottom" ); + setBorder( block, paragraph.getLeftBorder(), "left" ); + setBorder( block, paragraph.getRightBorder(), "right" ); + setBorder( block, paragraph.getTopBorder(), "top" ); + + if ( paragraph.pageBreakBefore() ) + { + block.setAttribute( "break-before", "page" ); + } + + block.setAttribute( "hyphenate", + String.valueOf( paragraph.isAutoHyphenated() ) ); + + if ( paragraph.keepOnPage() ) + { + block.setAttribute( "keep-together.within-page", "always" ); + } + + if ( paragraph.keepWithNext() ) + { + block.setAttribute( "keep-with-next.within-page", "always" ); + } + + block.setAttribute( "linefeed-treatment", "preserve" ); + block.setAttribute( "white-space-collapse", "false" ); + } + + public static void setPictureProperties( Picture picture, + Element graphicElement ) + { + final int aspectRatioX = picture.getAspectRatioX(); + final int aspectRatioY = picture.getAspectRatioY(); + + if ( aspectRatioX > 0 ) + { + 
graphicElement + .setAttribute( "content-width", ( ( picture.getDxaGoal() + * aspectRatioX / 100 ) / TWIPS_PER_PT ) + + "pt" ); + } + else + graphicElement.setAttribute( "content-width", + ( picture.getDxaGoal() / TWIPS_PER_PT ) + "pt" ); + + if ( aspectRatioY > 0 ) + graphicElement + .setAttribute( "content-height", ( ( picture.getDyaGoal() + * aspectRatioY / 100 ) / TWIPS_PER_PT ) + + "pt" ); + else + graphicElement.setAttribute( "content-height", + ( picture.getDyaGoal() / TWIPS_PER_PT ) + "pt" ); + + if ( aspectRatioX <= 0 || aspectRatioY <= 0 ) + { + graphicElement.setAttribute( "scaling", "uniform" ); + } + else + { + graphicElement.setAttribute( "scaling", "non-uniform" ); + } + + graphicElement.setAttribute( "vertical-align", "text-bottom" ); + + if ( picture.getDyaCropTop() != 0 || picture.getDxaCropRight() != 0 + || picture.getDyaCropBottom() != 0 + || picture.getDxaCropLeft() != 0 ) + { + int rectTop = picture.getDyaCropTop() / TWIPS_PER_PT; + int rectRight = picture.getDxaCropRight() / TWIPS_PER_PT; + int rectBottom = picture.getDyaCropBottom() / TWIPS_PER_PT; + int rectLeft = picture.getDxaCropLeft() / TWIPS_PER_PT; + graphicElement.setAttribute( "clip", "rect(" + rectTop + "pt, " + + rectRight + "pt, " + rectBottom + "pt, " + rectLeft + + "pt)" ); + graphicElement.setAttribute( "oveerflow", "hidden" ); + } + } + + public static void setTableCellProperties( TableRow tableRow, + TableCell tableCell, Element element, boolean toppest, + boolean bottomest, boolean leftest, boolean rightest ) + { + element.setAttribute( "width", ( tableCell.getWidth() / TWIPS_PER_INCH ) + + "in" ); + element.setAttribute( "padding-start", + ( tableRow.getGapHalf() / TWIPS_PER_INCH ) + "in" ); + element.setAttribute( "padding-end", + ( tableRow.getGapHalf() / TWIPS_PER_INCH ) + "in" ); + + BorderCode top = tableCell.getBrcTop() != null + && tableCell.getBrcTop().getBorderType() != 0 ? tableCell + .getBrcTop() : toppest ? 
tableRow.getTopBorder() : tableRow + .getHorizontalBorder(); + BorderCode bottom = tableCell.getBrcBottom() != null + && tableCell.getBrcBottom().getBorderType() != 0 ? tableCell + .getBrcBottom() : bottomest ? tableRow.getBottomBorder() + : tableRow.getHorizontalBorder(); + + BorderCode left = tableCell.getBrcLeft() != null + && tableCell.getBrcLeft().getBorderType() != 0 ? tableCell + .getBrcLeft() : leftest ? tableRow.getLeftBorder() : tableRow + .getVerticalBorder(); + BorderCode right = tableCell.getBrcRight() != null + && tableCell.getBrcRight().getBorderType() != 0 ? tableCell + .getBrcRight() : rightest ? tableRow.getRightBorder() + : tableRow.getVerticalBorder(); + + setBorder( element, bottom, "bottom" ); + setBorder( element, left, "left" ); + setBorder( element, right, "right" ); + setBorder( element, top, "top" ); + } + + public static void setTableRowProperties( TableRow tableRow, + Element tableRowElement ) + { + if ( tableRow.getRowHeight() > 0 ) + { + tableRowElement.setAttribute( "height", + ( tableRow.getRowHeight() / TWIPS_PER_INCH ) + "in" ); + } + if ( !tableRow.cantSplit() ) + { + tableRowElement.setAttribute( "keep-together", "always" ); + } + } + +} diff --git a/src/scratchpad/src/org/apache/poi/hwpf/converter/WordToHtmlConverter.java b/src/scratchpad/src/org/apache/poi/hwpf/converter/WordToHtmlConverter.java new file mode 100644 index 0000000000..c931acb9c7 --- /dev/null +++ b/src/scratchpad/src/org/apache/poi/hwpf/converter/WordToHtmlConverter.java @@ -0,0 +1,481 @@ +/* ==================================================================== + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. 
You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +==================================================================== */ +package org.apache.poi.hwpf.converter; + +import java.io.File; +import java.io.FileWriter; +import java.util.List; +import java.util.Stack; + +import javax.xml.parsers.DocumentBuilderFactory; +import javax.xml.transform.OutputKeys; +import javax.xml.transform.Transformer; +import javax.xml.transform.TransformerFactory; +import javax.xml.transform.dom.DOMSource; +import javax.xml.transform.stream.StreamResult; + +import org.apache.poi.hwpf.HWPFDocument; +import org.apache.poi.hwpf.HWPFDocumentCore; +import org.apache.poi.hwpf.usermodel.CharacterRun; +import org.apache.poi.hwpf.usermodel.Paragraph; +import org.apache.poi.hwpf.usermodel.Picture; +import org.apache.poi.hwpf.usermodel.Section; +import org.apache.poi.hwpf.usermodel.SectionProperties; +import org.apache.poi.hwpf.usermodel.Table; +import org.apache.poi.hwpf.usermodel.TableCell; +import org.apache.poi.hwpf.usermodel.TableRow; +import org.apache.poi.util.POILogFactory; +import org.apache.poi.util.POILogger; +import org.w3c.dom.Document; +import org.w3c.dom.Element; +import org.w3c.dom.Text; + +import static org.apache.poi.hwpf.converter.AbstractWordUtils.TWIPS_PER_INCH; + +/** + * Converts Word files (95-2007) into HTML files. + *

+ * This implementation doesn't create images or links to them. This can be + * changed by overriding {@link #processImage(Element, boolean, Picture)} + * method. + * + * @author Sergey Vladimirov (vlsergey {at} gmail {dot} com) + */ +public class WordToHtmlConverter extends AbstractWordConverter +{ + + /** + * Holds properties values, applied to current p element. Those + * properties shall not be doubled in children span elements. + */ + private static class BlockProperies + { + final String pFontName; + final int pFontSize; + + public BlockProperies( String pFontName, int pFontSize ) + { + this.pFontName = pFontName; + this.pFontSize = pFontSize; + } + } + + private static final POILogger logger = POILogFactory + .getLogger( WordToHtmlConverter.class ); + + private static String getSectionStyle( Section section ) + { + SectionProperties sep = WordToHtmlUtils.getSectionProperties( section ); + + float leftMargin = sep.getDxaLeft() / TWIPS_PER_INCH; + float rightMargin = sep.getDxaRight() / TWIPS_PER_INCH; + float topMargin = sep.getDyaTop() / TWIPS_PER_INCH; + float bottomMargin = sep.getDyaBottom() / TWIPS_PER_INCH; + + String style = "margin: " + topMargin + "in " + rightMargin + "in " + + bottomMargin + "in " + leftMargin + "in; "; + + if ( sep.getCcolM1() > 0 ) + { + style += "column-count: " + ( sep.getCcolM1() + 1 ) + "; "; + if ( sep.getFEvenlySpaced() ) + { + style += "column-gap: " + + ( sep.getDxaColumns() / TWIPS_PER_INCH ) + "in; "; + } + else + { + style += "column-gap: 0.25in; "; + } + } + return style; + } + + /** + * Java main() interface to interact with {@link WordToHtmlConverter} + * + *

+ * Usage: WordToHtmlConverter infile outfile + *

+ * Where infile is an input .doc file ( Word 95-2007) which will be rendered + * as HTML into outfile + */ + public static void main( String[] args ) + { + if ( args.length < 2 ) + { + System.err + .println( "Usage: WordToHtmlConverter " ); + return; + } + + System.out.println( "Converting " + args[0] ); + System.out.println( "Saving output to " + args[1] ); + try + { + Document doc = WordToHtmlConverter.process( new File( args[0] ) ); + + FileWriter out = new FileWriter( args[1] ); + DOMSource domSource = new DOMSource( doc ); + StreamResult streamResult = new StreamResult( out ); + + TransformerFactory tf = TransformerFactory.newInstance(); + Transformer serializer = tf.newTransformer(); + // TODO set encoding from a command argument + serializer.setOutputProperty( OutputKeys.ENCODING, "UTF-8" ); + serializer.setOutputProperty( OutputKeys.INDENT, "yes" ); + serializer.setOutputProperty( OutputKeys.METHOD, "html" ); + serializer.transform( domSource, streamResult ); + out.close(); + } + catch ( Exception e ) + { + e.printStackTrace(); + } + } + + static Document process( File docFile ) throws Exception + { + final HWPFDocumentCore wordDocument = WordToHtmlUtils.loadDoc( docFile ); + WordToHtmlConverter wordToHtmlConverter = new WordToHtmlConverter( + DocumentBuilderFactory.newInstance().newDocumentBuilder() + .newDocument() ); + wordToHtmlConverter.processDocument( wordDocument ); + return wordToHtmlConverter.getDocument(); + } + + private final Stack blocksProperies = new Stack(); + + private final HtmlDocumentFacade htmlDocumentFacade; + + /** + * Creates new instance of {@link WordToHtmlConverter}. Can be used for + * output several {@link HWPFDocument}s into single HTML document. 
+ * + * @param document + * XML DOM Document used as HTML document + */ + public WordToHtmlConverter( Document document ) + { + this.htmlDocumentFacade = new HtmlDocumentFacade( document ); + } + + public Document getDocument() + { + return htmlDocumentFacade.getDocument(); + } + + @Override + protected void outputCharacters( Element pElement, + CharacterRun characterRun, String text ) + { + Element span = htmlDocumentFacade.document.createElement( "span" ); + pElement.appendChild( span ); + + StringBuilder style = new StringBuilder(); + BlockProperies blockProperies = this.blocksProperies.peek(); + if ( characterRun.getFontName() != null + && !WordToHtmlUtils.equals( characterRun.getFontName(), + blockProperies.pFontName ) ) + { + style.append( "font-family: " + characterRun.getFontName() + "; " ); + } + if ( characterRun.getFontSize() / 2 != blockProperies.pFontSize ) + { + style.append( "font-size: " + characterRun.getFontSize() / 2 + "; " ); + } + + WordToHtmlUtils.addCharactersProperties( characterRun, style ); + if ( style.length() != 0 ) + span.setAttribute( "style", style.toString() ); + + Text textNode = htmlDocumentFacade.createText( text ); + span.appendChild( textNode ); + } + + protected void processHyperlink( HWPFDocumentCore wordDocument, + Element currentBlock, Paragraph paragraph, + List characterRuns, int currentTableLevel, + String hyperlink, int beginTextInclusive, int endTextExclusive ) + { + Element basicLink = htmlDocumentFacade.createHyperlink( hyperlink ); + currentBlock.appendChild( basicLink ); + + if ( beginTextInclusive < endTextExclusive ) + processCharacters( wordDocument, currentTableLevel, paragraph, + basicLink, characterRuns, beginTextInclusive, + endTextExclusive ); + } + + /** + * This method shall store image bytes in external file and convert it if + * necessary. Images shall be stored using PNG format. Other formats may be + * not supported by user browser. + *

+ * Please note the + * {@link WordToHtmlUtils#setPictureProperties(Picture, Element)} method. + * + * @param currentBlock + * currently processed HTML element, like p. Shall be + * used as parent of newly created img + * @param inlined + * if image is inlined + * @param picture + * HWPF object, contained picture data and properties + */ + protected void processImage( Element currentBlock, boolean inlined, + Picture picture ) + { + // no default implementation -- skip + currentBlock.appendChild( htmlDocumentFacade.document + .createComment( "Image link to '" + + picture.suggestFullFileName() + "' can be here" ) ); + } + + protected void processPageref( HWPFDocumentCore hwpfDocument, + Element currentBlock, Paragraph paragraph, + List characterRuns, int currentTableLevel, + String pageref, int beginTextInclusive, int endTextExclusive ) + { + Element basicLink = htmlDocumentFacade.createHyperlink( "#" + pageref ); + currentBlock.appendChild( basicLink ); + + if ( beginTextInclusive < endTextExclusive ) + processCharacters( hwpfDocument, currentTableLevel, paragraph, + basicLink, characterRuns, beginTextInclusive, + endTextExclusive ); + } + + protected void processParagraph( HWPFDocumentCore hwpfDocument, + Element parentFopElement, int currentTableLevel, + Paragraph paragraph, String bulletText ) + { + final Element pElement = htmlDocumentFacade.createParagraph(); + parentFopElement.appendChild( pElement ); + + StringBuilder style = new StringBuilder(); + WordToHtmlUtils.addParagraphProperties( paragraph, style ); + + final int charRuns = paragraph.numCharacterRuns(); + + if ( charRuns == 0 ) + { + return; + } + + { + final String pFontName; + final int pFontSize; + final CharacterRun characterRun = paragraph.getCharacterRun( 0 ); + if ( characterRun != null ) + { + pFontSize = characterRun.getFontSize() / 2; + pFontName = characterRun.getFontName(); + WordToHtmlUtils.addFontFamily( pFontName, style ); + WordToHtmlUtils.addFontSize( pFontSize, style ); + } + else + 
{ + pFontSize = -1; + pFontName = WordToHtmlUtils.EMPTY; + } + blocksProperies.push( new BlockProperies( pFontName, pFontSize ) ); + } + try + { + if ( WordToHtmlUtils.isNotEmpty( bulletText ) ) + { + Text textNode = htmlDocumentFacade.createText( bulletText ); + pElement.appendChild( textNode ); + } + + List characterRuns = WordToHtmlUtils + .findCharacterRuns( paragraph ); + processCharacters( hwpfDocument, currentTableLevel, paragraph, + pElement, characterRuns, 0, characterRuns.size() ); + } + finally + { + blocksProperies.pop(); + } + + if ( style.length() > 0 ) + pElement.setAttribute( "style", style.toString() ); + + return; + } + + protected void processSection( HWPFDocumentCore wordDocument, + Section section, int sectionCounter ) + { + Element div = htmlDocumentFacade.document.createElement( "div" ); + div.setAttribute( "style", getSectionStyle( section ) ); + htmlDocumentFacade.body.appendChild( div ); + + processSectionParagraphes( wordDocument, div, section, 0 ); + } + + @Override + protected void processSingleSection( HWPFDocumentCore wordDocument, + Section section ) + { + htmlDocumentFacade.body.setAttribute( "style", + getSectionStyle( section ) ); + + processSectionParagraphes( wordDocument, htmlDocumentFacade.body, + section, 0 ); + } + + protected void processTable( HWPFDocumentCore hwpfDocument, Element flow, + Table table, int thisTableLevel ) + { + Element tableHeader = htmlDocumentFacade.createTableHeader(); + Element tableBody = htmlDocumentFacade.createTableBody(); + + final int tableRows = table.numRows(); + + int maxColumns = Integer.MIN_VALUE; + for ( int r = 0; r < tableRows; r++ ) + { + maxColumns = Math.max( maxColumns, table.getRow( r ).numCells() ); + } + + for ( int r = 0; r < tableRows; r++ ) + { + TableRow tableRow = table.getRow( r ); + + Element tableRowElement = htmlDocumentFacade.createTableRow(); + StringBuilder tableRowStyle = new StringBuilder(); + WordToHtmlUtils.addTableRowProperties( tableRow, tableRowStyle ); + + 
final int rowCells = tableRow.numCells(); + for ( int c = 0; c < rowCells; c++ ) + { + TableCell tableCell = tableRow.getCell( c ); + + if ( tableCell.isMerged() && !tableCell.isFirstMerged() ) + continue; + + if ( tableCell.isVerticallyMerged() + && !tableCell.isFirstVerticallyMerged() ) + continue; + + Element tableCellElement; + if ( tableRow.isTableHeader() ) + { + tableCellElement = htmlDocumentFacade + .createTableHeaderCell(); + } + else + { + tableCellElement = htmlDocumentFacade.createTableCell(); + } + StringBuilder tableCellStyle = new StringBuilder(); + WordToHtmlUtils.addTableCellProperties( tableRow, tableCell, + r == 0, r == tableRows - 1, c == 0, c == rowCells - 1, + tableCellStyle ); + + if ( tableCell.isFirstMerged() ) + { + int count = 0; + for ( int c1 = c; c1 < rowCells; c1++ ) + { + TableCell nextCell = tableRow.getCell( c1 ); + if ( nextCell.isMerged() ) + count++; + if ( !nextCell.isMerged() ) + break; + } + tableCellElement.setAttribute( "colspan", "" + count ); + } + else + { + if ( c == rowCells - 1 && c != maxColumns - 1 ) + { + tableCellElement.setAttribute( "colspan", "" + + ( maxColumns - c ) ); + } + } + + if ( tableCell.isFirstVerticallyMerged() ) + { + int count = 0; + for ( int r1 = r; r1 < tableRows; r1++ ) + { + TableRow nextRow = table.getRow( r1 ); + if ( nextRow.numCells() < c ) + break; + TableCell nextCell = nextRow.getCell( c ); + if ( nextCell.isVerticallyMerged() ) + count++; + if ( !nextCell.isVerticallyMerged() ) + break; + } + tableCellElement.setAttribute( "rowspan", "" + count ); + } + + processSectionParagraphes( hwpfDocument, tableCellElement, + tableCell, thisTableLevel ); + + if ( !tableCellElement.hasChildNodes() ) + { + tableCellElement.appendChild( htmlDocumentFacade + .createParagraph() ); + } + if ( tableCellStyle.length() > 0 ) + tableCellElement.setAttribute( "style", + tableCellStyle.toString() ); + + tableRowElement.appendChild( tableCellElement ); + } + + if ( tableRowStyle.length() > 0 ) + 
tableRowElement + .setAttribute( "style", tableRowStyle.toString() ); + + if ( tableRow.isTableHeader() ) + { + tableHeader.appendChild( tableRowElement ); + } + else + { + tableBody.appendChild( tableRowElement ); + } + + } + + final Element tableElement = htmlDocumentFacade.createTable(); + if ( tableHeader.hasChildNodes() ) + { + tableElement.appendChild( tableHeader ); + } + if ( tableBody.hasChildNodes() ) + { + tableElement.appendChild( tableBody ); + flow.appendChild( tableElement ); + } + else + { + logger.log( + POILogger.WARN, + "Table without body starting on offset " + + table.getStartOffset() + " -- " + + table.getEndOffset() ); + } + } + +} diff --git a/src/scratchpad/src/org/apache/poi/hwpf/converter/WordToHtmlUtils.java b/src/scratchpad/src/org/apache/poi/hwpf/converter/WordToHtmlUtils.java new file mode 100644 index 0000000000..598def658a --- /dev/null +++ b/src/scratchpad/src/org/apache/poi/hwpf/converter/WordToHtmlUtils.java @@ -0,0 +1,292 @@ +/* ==================================================================== + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+==================================================================== */ +package org.apache.poi.hwpf.converter; + +import org.apache.poi.hwpf.usermodel.BorderCode; +import org.apache.poi.hwpf.usermodel.CharacterProperties; +import org.apache.poi.hwpf.usermodel.CharacterRun; +import org.apache.poi.hwpf.usermodel.Paragraph; +import org.apache.poi.hwpf.usermodel.Picture; +import org.apache.poi.hwpf.usermodel.TableCell; +import org.apache.poi.hwpf.usermodel.TableRow; +import org.w3c.dom.Element; + +public class WordToHtmlUtils extends AbstractWordUtils +{ + public static void addBold( final boolean bold, StringBuilder style ) + { + style.append( "font-weight: " + ( bold ? "bold" : "normal" ) + ";" ); + } + + public static void addBorder( BorderCode borderCode, String where, + StringBuilder style ) + { + if ( borderCode == null || borderCode.getBorderType() == 0 ) + return; + + if ( isEmpty( where ) ) + { + style.append( "border-style: " + getBorderType( borderCode ) + "; " ); + style.append( "border-color: " + getColor( borderCode.getColor() ) + + "; " ); + style.append( "border-width: " + getBorderWidth( borderCode ) + + "; " ); + } + else + { + style.append( "border-" + where + "-style: " + + getBorderType( borderCode ) + "; " ); + style.append( "border-" + where + "-color: " + + getColor( borderCode.getColor() ) + "; " ); + style.append( "border-" + where + "-width: " + + getBorderWidth( borderCode ) + "; " ); + } + } + + public static void addCharactersProperties( + final CharacterRun characterRun, StringBuilder style ) + { + final CharacterProperties clonedProperties = characterRun + .cloneProperties(); + + if ( characterRun.isBold() ) + { + style.append( "font-weight: bold; " ); + } + if ( characterRun.isItalic() ) + { + style.append( "font-style: italic; " ); + } + + addBorder( clonedProperties.getBrc(), EMPTY, style ); + + if ( characterRun.isCapitalized() ) + { + style.append( "text-transform: uppercase; " ); + } + if ( characterRun.isHighlighted() ) + { + 
style.append( "background-color: " + + getColor( clonedProperties.getIcoHighlight() ) + "; " ); + } + if ( characterRun.isStrikeThrough() ) + { + style.append( "text-decoration: line-through; " ); + } + if ( characterRun.isShadowed() ) + { + style.append( "text-shadow: " + characterRun.getFontSize() / 24 + + "pt; " ); + } + if ( characterRun.isSmallCaps() ) + { + style.append( "font-variant: small-caps; " ); + } + if ( characterRun.getSubSuperScriptIndex() == 1 ) + { + style.append( "baseline-shift: super; " ); + style.append( "font-size: smaller; " ); + } + if ( characterRun.getSubSuperScriptIndex() == 2 ) + { + style.append( "baseline-shift: sub; " ); + style.append( "font-size: smaller; " ); + } + if ( characterRun.getUnderlineCode() > 0 ) + { + style.append( "text-decoration: underline; " ); + } + if ( characterRun.isVanished() ) + { + style.append( "visibility: hidden; " ); + } + } + + public static void addFontFamily( final String fontFamily, + StringBuilder style ) + { + if ( isEmpty( fontFamily ) ) + return; + + style.append( "font-family: " + fontFamily ); + } + + public static void addFontSize( final int fontSize, StringBuilder style ) + { + style.append( "font-size: " + fontSize ); + } + + public static void addIndent( Paragraph paragraph, StringBuilder style ) + { + addIndent( style, "text-indent", paragraph.getFirstLineIndent() ); + addIndent( style, "start-indent", paragraph.getIndentFromLeft() ); + addIndent( style, "end-indent", paragraph.getIndentFromRight() ); + addIndent( style, "space-before", paragraph.getSpacingBefore() ); + addIndent( style, "space-after", paragraph.getSpacingAfter() ); + } + + private static void addIndent( StringBuilder style, final String cssName, + final int twipsValue ) + { + if ( twipsValue == 0 ) + return; + + style.append( cssName + ": " + ( twipsValue / TWIPS_PER_PT ) + "pt; " ); + } + + public static void addJustification( Paragraph paragraph, + final StringBuilder style ) + { + String justification = 
getJustification( paragraph.getJustification() ); + if ( isNotEmpty( justification ) ) + style.append( "text-align: " + justification + "; " ); + } + + public static void addParagraphProperties( Paragraph paragraph, + StringBuilder style ) + { + addIndent( paragraph, style ); + addJustification( paragraph, style ); + + addBorder( paragraph.getBottomBorder(), "bottom", style ); + addBorder( paragraph.getLeftBorder(), "left", style ); + addBorder( paragraph.getRightBorder(), "right", style ); + addBorder( paragraph.getTopBorder(), "top", style ); + + if ( paragraph.pageBreakBefore() ) + { + style.append( "break-before: page; " ); + } + + style.append( "hyphenate: " + paragraph.isAutoHyphenated() + "; " ); + + if ( paragraph.keepOnPage() ) + { + style.append( "keep-together.within-page: always; " ); + } + + if ( paragraph.keepWithNext() ) + { + style.append( "keep-with-next.within-page: always; " ); + } + + style.append( "linefeed-treatment: preserve; " ); + style.append( "white-space-collapse: false; " ); + } + + public static void addTableCellProperties( TableRow tableRow, + TableCell tableCell, boolean toppest, boolean bottomest, + boolean leftest, boolean rightest, StringBuilder style ) + { + style.append( "width: " + ( tableCell.getWidth() / TWIPS_PER_INCH ) + + "in; " ); + style.append( "padding-start: " + + ( tableRow.getGapHalf() / TWIPS_PER_INCH ) + "in; " ); + style.append( "padding-end: " + + ( tableRow.getGapHalf() / TWIPS_PER_INCH ) + "in; " ); + + BorderCode top = tableCell.getBrcTop() != null + && tableCell.getBrcTop().getBorderType() != 0 ? tableCell + .getBrcTop() : toppest ? tableRow.getTopBorder() : tableRow + .getHorizontalBorder(); + BorderCode bottom = tableCell.getBrcBottom() != null + && tableCell.getBrcBottom().getBorderType() != 0 ? tableCell + .getBrcBottom() : bottomest ? tableRow.getBottomBorder() + : tableRow.getHorizontalBorder(); + + BorderCode left = tableCell.getBrcLeft() != null + && tableCell.getBrcLeft().getBorderType() != 0 ? 
tableCell + .getBrcLeft() : leftest ? tableRow.getLeftBorder() : tableRow + .getVerticalBorder(); + BorderCode right = tableCell.getBrcRight() != null + && tableCell.getBrcRight().getBorderType() != 0 ? tableCell + .getBrcRight() : rightest ? tableRow.getRightBorder() + : tableRow.getVerticalBorder(); + + addBorder( bottom, "bottom", style ); + addBorder( left, "left", style ); + addBorder( right, "right", style ); + addBorder( top, "top", style ); + } + + public static void addTableRowProperties( TableRow tableRow, + StringBuilder style ) + { + if ( tableRow.getRowHeight() > 0 ) + { + style.append( "height: " + + ( tableRow.getRowHeight() / TWIPS_PER_INCH ) + "in; " ); + } + if ( !tableRow.cantSplit() ) + { + style.append( "keep-together: always; " ); + } + } + + public static void setPictureProperties( Picture picture, + Element graphicElement ) + { + final int aspectRatioX = picture.getAspectRatioX(); + final int aspectRatioY = picture.getAspectRatioY(); + + if ( aspectRatioX > 0 ) + { + graphicElement + .setAttribute( "content-width", ( ( picture.getDxaGoal() + * aspectRatioX / 100 ) / TWIPS_PER_PT ) + + "pt" ); + } + else + graphicElement.setAttribute( "content-width", + ( picture.getDxaGoal() / TWIPS_PER_PT ) + "pt" ); + + if ( aspectRatioY > 0 ) + graphicElement + .setAttribute( "content-height", ( ( picture.getDyaGoal() + * aspectRatioY / 100 ) / TWIPS_PER_PT ) + + "pt" ); + else + graphicElement.setAttribute( "content-height", + ( picture.getDyaGoal() / TWIPS_PER_PT ) + "pt" ); + + if ( aspectRatioX <= 0 || aspectRatioY <= 0 ) + { + graphicElement.setAttribute( "scaling", "uniform" ); + } + else + { + graphicElement.setAttribute( "scaling", "non-uniform" ); + } + + graphicElement.setAttribute( "vertical-align", "text-bottom" ); + + if ( picture.getDyaCropTop() != 0 || picture.getDxaCropRight() != 0 + || picture.getDyaCropBottom() != 0 + || picture.getDxaCropLeft() != 0 ) + { + int rectTop = picture.getDyaCropTop() / TWIPS_PER_PT; + int rectRight = 
picture.getDxaCropRight() / TWIPS_PER_PT; + int rectBottom = picture.getDyaCropBottom() / TWIPS_PER_PT; + int rectLeft = picture.getDxaCropLeft() / TWIPS_PER_PT; + graphicElement.setAttribute( "clip", "rect(" + rectTop + "pt, " + + rectRight + "pt, " + rectBottom + "pt, " + rectLeft + + "pt)" ); + graphicElement.setAttribute( "oveerflow", "hidden" ); + } + } + +} diff --git a/src/scratchpad/src/org/apache/poi/hwpf/extractor/AbstractWordExtractor.java b/src/scratchpad/src/org/apache/poi/hwpf/extractor/AbstractWordExtractor.java deleted file mode 100644 index f13d9a1f6c..0000000000 --- a/src/scratchpad/src/org/apache/poi/hwpf/extractor/AbstractWordExtractor.java +++ /dev/null @@ -1,365 +0,0 @@ -/* ==================================================================== - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-==================================================================== */ -package org.apache.poi.hwpf.extractor; - -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import java.util.regex.Matcher; -import java.util.regex.Pattern; - -import org.apache.poi.hwpf.HWPFDocument; -import org.apache.poi.hwpf.HWPFDocumentCore; -import org.apache.poi.hwpf.model.ListFormatOverride; -import org.apache.poi.hwpf.model.ListTables; -import org.apache.poi.hwpf.usermodel.CharacterRun; -import org.apache.poi.hwpf.usermodel.Paragraph; -import org.apache.poi.hwpf.usermodel.Picture; -import org.apache.poi.hwpf.usermodel.Range; -import org.apache.poi.hwpf.usermodel.Section; -import org.apache.poi.hwpf.usermodel.Table; -import org.apache.poi.hwpf.usermodel.TableIterator; -import org.apache.poi.util.POILogFactory; -import org.apache.poi.util.POILogger; -import org.w3c.dom.Document; -import org.w3c.dom.Element; - -public abstract class AbstractWordExtractor -{ - private static final byte BEL_MARK = 7; - - private static final byte FIELD_BEGIN_MARK = 19; - - private static final byte FIELD_END_MARK = 21; - - private static final byte FIELD_SEPARATOR_MARK = 20; - - private static final POILogger logger = POILogFactory - .getLogger( AbstractWordExtractor.class ); - - public abstract Document getDocument(); - - protected abstract void outputCharacters( Element block, - CharacterRun characterRun, String text ); - - protected boolean processCharacters( HWPFDocumentCore hwpfDocument, - int currentTableLevel, Paragraph paragraph, final Element block, - List characterRuns, final int start, final int end ) - { - boolean haveAnyText = false; - - for ( int c = start; c < end; c++ ) - { - CharacterRun characterRun = characterRuns.get( c ); - - if ( characterRun == null ) - throw new AssertionError(); - - if ( hwpfDocument instanceof HWPFDocument - && ( (HWPFDocument) hwpfDocument ).getPicturesTable() - .hasPicture( characterRun ) ) - { - HWPFDocument newFormat = (HWPFDocument) 
hwpfDocument; - Picture picture = newFormat.getPicturesTable().extractPicture( - characterRun, true ); - - processImage( block, characterRun.text().charAt( 0 ) == 0x01, - picture ); - continue; - } - - String text = characterRun.text(); - if ( text.getBytes().length == 0 ) - continue; - - if ( text.getBytes()[0] == FIELD_BEGIN_MARK ) - { - int skipTo = tryField( hwpfDocument, paragraph, - currentTableLevel, characterRuns, c, block ); - - if ( skipTo != c ) - { - c = skipTo; - continue; - } - - continue; - } - if ( text.getBytes()[0] == FIELD_SEPARATOR_MARK ) - { - // shall not appear without FIELD_BEGIN_MARK - continue; - } - if ( text.getBytes()[0] == FIELD_END_MARK ) - { - // shall not appear without FIELD_BEGIN_MARK - continue; - } - - if ( characterRun.isSpecialCharacter() || characterRun.isObj() - || characterRun.isOle2() ) - { - continue; - } - - if ( text.endsWith( "\r" ) - || ( text.charAt( text.length() - 1 ) == BEL_MARK && currentTableLevel != 0 ) ) - text = text.substring( 0, text.length() - 1 ); - - outputCharacters( block, characterRun, text ); - - haveAnyText |= text.trim().length() != 0; - } - - return haveAnyText; - } - - public void processDocument( HWPFDocumentCore wordDocument ) - { - final Range range = wordDocument.getRange(); - for ( int s = 0; s < range.numSections(); s++ ) - { - processSection( wordDocument, range.getSection( s ), s ); - } - } - - protected void processField( HWPFDocumentCore wordDocument, - Element currentBlock, Paragraph paragraph, int currentTableLevel, - List characterRuns, int beginMark, int separatorMark, - int endMark ) - { - - Pattern hyperlinkPattern = Pattern - .compile( "[ \\t\\r\\n]*HYPERLINK \"(.*)\"[ \\t\\r\\n]*" ); - Pattern pagerefPattern = Pattern - .compile( "[ \\t\\r\\n]*PAGEREF ([^ ]*)[ \\t\\r\\n]*\\\\h[ \\t\\r\\n]*" ); - - if ( separatorMark - beginMark > 1 ) - { - int index = beginMark + 1; - CharacterRun firstAfterBegin = null; - while ( index < separatorMark ) - { - firstAfterBegin = 
paragraph.getCharacterRun( index ); - if ( firstAfterBegin == null ) - { - logger.log( POILogger.WARN, - "Paragraph " + paragraph.getStartOffset() + "--" - + paragraph.getEndOffset() - + " contains null CharacterRun #" + index ); - index++; - continue; - } - break; - } - - if ( firstAfterBegin != null ) - { - final Matcher hyperlinkMatcher = hyperlinkPattern - .matcher( firstAfterBegin.text() ); - if ( hyperlinkMatcher.matches() ) - { - String hyperlink = hyperlinkMatcher.group( 1 ); - processHyperlink( wordDocument, currentBlock, paragraph, - characterRuns, currentTableLevel, hyperlink, - separatorMark + 1, endMark ); - return; - } - - final Matcher pagerefMatcher = pagerefPattern - .matcher( firstAfterBegin.text() ); - if ( pagerefMatcher.matches() ) - { - String pageref = pagerefMatcher.group( 1 ); - processPageref( wordDocument, currentBlock, paragraph, - characterRuns, currentTableLevel, pageref, - separatorMark + 1, endMark ); - return; - } - } - } - - StringBuilder debug = new StringBuilder( "Unsupported field type: \n" ); - for ( int i = beginMark; i <= endMark; i++ ) - { - debug.append( "\t" ); - debug.append( paragraph.getCharacterRun( i ) ); - debug.append( "\n" ); - } - logger.log( POILogger.WARN, debug ); - - // just output field value - if ( separatorMark + 1 < endMark ) - processCharacters( wordDocument, currentTableLevel, paragraph, - currentBlock, characterRuns, separatorMark + 1, endMark ); - - return; - } - - protected abstract void processHyperlink( HWPFDocumentCore wordDocument, - Element currentBlock, Paragraph paragraph, - List characterRuns, int currentTableLevel, - String hyperlink, int i, int endMark ); - - protected abstract void processImage( Element currentBlock, - boolean inlined, Picture picture ); - - protected abstract void processPageref( HWPFDocumentCore wordDocument, - Element currentBlock, Paragraph paragraph, - List characterRuns, int currentTableLevel, - String pageref, int beginTextInclusive, int endTextExclusive ); - - 
protected abstract void processParagraph( HWPFDocumentCore wordDocument, - Element parentFopElement, int currentTableLevel, - Paragraph paragraph, String bulletText ); - - protected abstract void processSection( HWPFDocumentCore wordDocument, - Section section, int s ); - - protected void processSectionParagraphes( HWPFDocumentCore wordDocument, - Element flow, Range range, int currentTableLevel ) - { - final Map allTables = new HashMap(); - for ( TableIterator tableIterator = AbstractWordUtils.newTableIterator( - range, currentTableLevel + 1 ); tableIterator.hasNext(); ) - { - Table next = tableIterator.next(); - allTables.put( Integer.valueOf( next.getStartOffset() ), next ); - } - - final ListTables listTables = wordDocument.getListTables(); - int currentListInfo = 0; - - final int paragraphs = range.numParagraphs(); - for ( int p = 0; p < paragraphs; p++ ) - { - Paragraph paragraph = range.getParagraph( p ); - - if ( allTables.containsKey( Integer.valueOf( paragraph - .getStartOffset() ) ) ) - { - Table table = allTables.get( Integer.valueOf( paragraph - .getStartOffset() ) ); - processTable( wordDocument, flow, table, currentTableLevel + 1 ); - continue; - } - - if ( paragraph.isInTable() - && paragraph.getTableLevel() != currentTableLevel ) - { - continue; - } - - if ( paragraph.getIlfo() != currentListInfo ) - { - currentListInfo = paragraph.getIlfo(); - } - - if ( currentListInfo != 0 ) - { - if ( listTables != null ) - { - final ListFormatOverride listFormatOverride = listTables - .getOverride( paragraph.getIlfo() ); - - String label = AbstractWordUtils.getBulletText( listTables, - paragraph, listFormatOverride.getLsid() ); - - processParagraph( wordDocument, flow, currentTableLevel, - paragraph, label ); - } - else - { - logger.log( POILogger.WARN, - "Paragraph #" + paragraph.getStartOffset() + "-" - + paragraph.getEndOffset() - + " has reference to list structure #" - + currentListInfo - + ", but listTables not defined in file" ); - - processParagraph( 
wordDocument, flow, currentTableLevel, - paragraph, AbstractWordUtils.EMPTY ); - } - } - else - { - processParagraph( wordDocument, flow, currentTableLevel, - paragraph, AbstractWordUtils.EMPTY ); - } - } - - } - - protected void processSingleSection( HWPFDocumentCore wordDocument, - Section section ) - { - processSection( wordDocument, section, 0 ); - } - - protected abstract void processTable( HWPFDocumentCore wordDocument, - Element flow, Table table, int newTableLevel ); - - protected int tryField( HWPFDocumentCore wordDocument, Paragraph paragraph, - int currentTableLevel, List characterRuns, - int beginMark, Element currentBlock ) - { - int separatorMark = -1; - int endMark = -1; - for ( int c = beginMark + 1; c < paragraph.numCharacterRuns(); c++ ) - { - CharacterRun characterRun = paragraph.getCharacterRun( c ); - - String text = characterRun.text(); - if ( text.getBytes().length == 0 ) - continue; - - if ( text.getBytes()[0] == FIELD_SEPARATOR_MARK ) - { - if ( separatorMark != -1 ) - { - // double; - return beginMark; - } - - separatorMark = c; - continue; - } - - if ( text.getBytes()[0] == FIELD_END_MARK ) - { - if ( endMark != -1 ) - { - // double; - return beginMark; - } - - endMark = c; - break; - } - - } - - if ( separatorMark == -1 || endMark == -1 ) - return beginMark; - - processField( wordDocument, currentBlock, paragraph, currentTableLevel, - characterRuns, beginMark, separatorMark, endMark ); - - return endMark; - } - -} diff --git a/src/scratchpad/src/org/apache/poi/hwpf/extractor/AbstractWordUtils.java b/src/scratchpad/src/org/apache/poi/hwpf/extractor/AbstractWordUtils.java deleted file mode 100644 index 89849c15eb..0000000000 --- a/src/scratchpad/src/org/apache/poi/hwpf/extractor/AbstractWordUtils.java +++ /dev/null @@ -1,404 +0,0 @@ -/* ==================================================================== - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. 
See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -==================================================================== */ -package org.apache.poi.hwpf.extractor; - -import java.io.Closeable; -import java.io.File; -import java.io.FileInputStream; -import java.io.IOException; -import java.io.InputStream; -import java.lang.reflect.Constructor; -import java.lang.reflect.Field; -import java.lang.reflect.Method; -import java.util.ArrayList; -import java.util.List; - -import org.apache.poi.hwpf.HWPFDocument; -import org.apache.poi.hwpf.HWPFDocumentCore; -import org.apache.poi.hwpf.HWPFOldDocument; -import org.apache.poi.hwpf.OldWordFileFormatException; -import org.apache.poi.hwpf.model.CHPX; -import org.apache.poi.hwpf.model.ListLevel; -import org.apache.poi.hwpf.model.ListTables; -import org.apache.poi.hwpf.usermodel.BorderCode; -import org.apache.poi.hwpf.usermodel.CharacterRun; -import org.apache.poi.hwpf.usermodel.Paragraph; -import org.apache.poi.hwpf.usermodel.Range; -import org.apache.poi.hwpf.usermodel.Section; -import org.apache.poi.hwpf.usermodel.SectionProperties; -import org.apache.poi.hwpf.usermodel.TableIterator; -import org.apache.poi.poifs.filesystem.POIFSFileSystem; -import org.apache.poi.util.POILogFactory; -import org.apache.poi.util.POILogger; - -public class AbstractWordUtils -{ - static final String EMPTY = ""; - - private static final 
POILogger logger = POILogFactory - .getLogger( AbstractWordUtils.class ); - - public static final float TWIPS_PER_INCH = 1440.0f; - public static final int TWIPS_PER_PT = 20; - - static void closeQuietly( final Closeable closeable ) - { - try - { - closeable.close(); - } - catch ( Exception exc ) - { - logger.log( POILogger.ERROR, "Unable to close resource: " + exc, - exc ); - } - } - - static boolean equals( String str1, String str2 ) - { - return str1 == null ? str2 == null : str1.equals( str2 ); - } - - // XXX incorporate into Range - static List findCharacterRuns( Range range ) - { - final int min = range.getStartOffset(); - final int max = range.getEndOffset(); - - List result = new ArrayList(); - List chpxs = getCharacters( range ); - for ( int i = 0; i < chpxs.size(); i++ ) - { - CHPX chpx = chpxs.get( i ); - if ( chpx == null ) - continue; - - if ( Math.max( min, chpx.getStart() ) <= Math.min( max, - chpx.getEnd() ) ) - { - final CharacterRun characterRun = getCharacterRun( range, chpx ); - - if ( characterRun == null ) - continue; - - result.add( characterRun ); - } - } - - return result; - } - - public static String getBorderType( BorderCode borderCode ) - { - if ( borderCode == null ) - throw new IllegalArgumentException( "borderCode is null" ); - - switch ( borderCode.getBorderType() ) - { - case 1: - case 2: - return "solid"; - case 3: - return "double"; - case 5: - return "solid"; - case 6: - return "dotted"; - case 7: - case 8: - return "dashed"; - case 9: - return "dotted"; - case 10: - case 11: - case 12: - case 13: - case 14: - case 15: - case 16: - case 17: - case 18: - case 19: - return "double"; - case 20: - return "solid"; - case 21: - return "double"; - case 22: - return "dashed"; - case 23: - return "dashed"; - case 24: - return "ridge"; - case 25: - return "grooved"; - default: - return "solid"; - } - } - - public static String getBorderWidth( BorderCode borderCode ) - { - int lineWidth = borderCode.getLineWidth(); - int pt = lineWidth / 8; 
- int pte = lineWidth - pt * 8; - - StringBuilder stringBuilder = new StringBuilder(); - stringBuilder.append( pt ); - stringBuilder.append( "." ); - stringBuilder.append( 1000 / 8 * pte ); - stringBuilder.append( "pt" ); - return stringBuilder.toString(); - } - - public static String getBulletText( ListTables listTables, - Paragraph paragraph, int listId ) - { - final ListLevel listLevel = listTables.getLevel( listId, - paragraph.getIlvl() ); - - if ( listLevel.getNumberText() == null ) - return EMPTY; - - StringBuffer bulletBuffer = new StringBuffer(); - char[] xst = listLevel.getNumberText().toCharArray(); - for ( char element : xst ) - { - if ( element < 9 ) - { - ListLevel numLevel = listTables.getLevel( listId, element ); - - int num = numLevel.getStartAt(); - bulletBuffer.append( NumberFormatter.getNumber( num, - listLevel.getNumberFormat() ) ); - - if ( numLevel == listLevel ) - { - numLevel.setStartAt( numLevel.getStartAt() + 1 ); - } - - } - else - { - bulletBuffer.append( element ); - } - } - - byte follow = getIxchFollow( listLevel ); - switch ( follow ) - { - case 0: - bulletBuffer.append( "\t" ); - break; - case 1: - bulletBuffer.append( " " ); - break; - default: - break; - } - - return bulletBuffer.toString(); - } - - private static CharacterRun getCharacterRun( Range range, CHPX chpx ) - { - try - { - Method method = Range.class.getDeclaredMethod( "getCharacterRun", - CHPX.class ); - method.setAccessible( true ); - return (CharacterRun) method.invoke( range, chpx ); - } - catch ( Exception exc ) - { - throw new Error( exc ); - } - } - - private static List getCharacters( Range range ) - { - try - { - Field field = Range.class.getDeclaredField( "_characters" ); - field.setAccessible( true ); - return (List) field.get( range ); - } - catch ( Exception exc ) - { - throw new Error( exc ); - } - } - - public static String getColor( int ico ) - { - switch ( ico ) - { - case 1: - return "black"; - case 2: - return "blue"; - case 3: - return "cyan"; - case 
4: - return "green"; - case 5: - return "magenta"; - case 6: - return "red"; - case 7: - return "yellow"; - case 8: - return "white"; - case 9: - return "darkblue"; - case 10: - return "darkcyan"; - case 11: - return "darkgreen"; - case 12: - return "darkmagenta"; - case 13: - return "darkred"; - case 14: - return "darkyellow"; - case 15: - return "darkgray"; - case 16: - return "lightgray"; - default: - return "black"; - } - } - - public static byte getIxchFollow( ListLevel listLevel ) - { - try - { - Field field = ListLevel.class.getDeclaredField( "_ixchFollow" ); - field.setAccessible( true ); - return ( (Byte) field.get( listLevel ) ).byteValue(); - } - catch ( Exception exc ) - { - throw new Error( exc ); - } - } - - public static String getJustification( int js ) - { - switch ( js ) - { - case 0: - return "start"; - case 1: - return "center"; - case 2: - return "end"; - case 3: - case 4: - return "justify"; - case 5: - return "center"; - case 6: - return "left"; - case 7: - return "start"; - case 8: - return "end"; - case 9: - return "justify"; - } - return ""; - } - - public static String getListItemNumberLabel( int number, int format ) - { - - if ( format != 0 ) - System.err.println( "NYI: toListItemNumberLabel(): " + format ); - - return String.valueOf( number ); - } - - public static SectionProperties getSectionProperties( Section section ) - { - try - { - Field field = Section.class.getDeclaredField( "_props" ); - field.setAccessible( true ); - return (SectionProperties) field.get( section ); - } - catch ( Exception exc ) - { - throw new Error( exc ); - } - } - - static boolean isEmpty( String str ) - { - return str == null || str.length() == 0; - } - - static boolean isNotEmpty( String str ) - { - return !isEmpty( str ); - } - - public static HWPFDocumentCore loadDoc( File docFile ) throws IOException - { - final FileInputStream istream = new FileInputStream( docFile ); - try - { - return loadDoc( istream ); - } - finally - { - closeQuietly( istream ); 
- } - } - - public static HWPFDocumentCore loadDoc( InputStream inputStream ) - throws IOException - { - final POIFSFileSystem poifsFileSystem = HWPFDocumentCore - .verifyAndBuildPOIFS( inputStream ); - try - { - return new HWPFDocument( poifsFileSystem ); - } - catch ( OldWordFileFormatException exc ) - { - return new HWPFOldDocument( poifsFileSystem ); - } - } - - public static TableIterator newTableIterator( Range range, int level ) - { - try - { - Constructor constructor = TableIterator.class - .getDeclaredConstructor( Range.class, int.class ); - constructor.setAccessible( true ); - return constructor.newInstance( range, Integer.valueOf( level ) ); - } - catch ( Exception exc ) - { - throw new Error( exc ); - } - } - -} diff --git a/src/scratchpad/src/org/apache/poi/hwpf/extractor/FoDocumentFacade.java b/src/scratchpad/src/org/apache/poi/hwpf/extractor/FoDocumentFacade.java deleted file mode 100644 index 5e474bf254..0000000000 --- a/src/scratchpad/src/org/apache/poi/hwpf/extractor/FoDocumentFacade.java +++ /dev/null @@ -1,201 +0,0 @@ -/* ==================================================================== - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
/**
 * Thin factory around a DOM {@link Document} that produces XSL-FO elements.
 * On construction it installs the <code>fo:root</code> element and its
 * <code>fo:layout-master-set</code> child; <code>add*</code> methods attach
 * the new element to a parent, <code>create*</code> methods return it
 * detached.
 */
public class FoDocumentFacade
{
    private static final String NS_XSLFO = "http://www.w3.org/1999/XSL/Format";

    protected final Document document;
    protected final Element layoutMasterSet;
    protected final Element root;

    /**
     * @param document
     *            DOM document that receives the <code>fo:root</code> element
     */
    public FoDocumentFacade( Document document )
    {
        this.document = document;

        this.root = foElement( "fo:root" );
        document.appendChild( this.root );

        this.layoutMasterSet = foElement( "fo:layout-master-set" );
        this.root.appendChild( this.layoutMasterSet );
    }

    /** Creates a detached element with the given name in the XSL-FO namespace. */
    private Element foElement( String qualifiedName )
    {
        return document.createElementNS( NS_XSLFO, qualifiedName );
    }

    /** Appends a named <code>fo:flow</code> to the page sequence and returns it. */
    public Element addFlowToPageSequence( final Element pageSequence,
            String flowName )
    {
        final Element created = foElement( "fo:flow" );
        created.setAttribute( "flow-name", flowName );
        pageSequence.appendChild( created );
        return created;
    }

    /** Appends a new <code>fo:list-item</code> to the list block and returns it. */
    public Element addListItem( Element listBlock )
    {
        final Element item = createListItem();
        listBlock.appendChild( item );
        return item;
    }

    /** Appends a new <code>fo:list-item-body</code> to the list item and returns it. */
    public Element addListItemBody( Element listItem )
    {
        final Element itemBody = createListItemBody();
        listItem.appendChild( itemBody );
        return itemBody;
    }

    /** Appends a new label carrying <code>text</code> to the list item and returns it. */
    public Element addListItemLabel( Element listItem, String text )
    {
        final Element itemLabel = createListItemLabel( text );
        listItem.appendChild( itemLabel );
        return itemLabel;
    }

    /** Appends a <code>fo:page-sequence</code> referencing the master to the root. */
    public Element addPageSequence( String pageMaster )
    {
        final Element sequence = foElement( "fo:page-sequence" );
        sequence.setAttribute( "master-reference", pageMaster );
        root.appendChild( sequence );
        return sequence;
    }

    /** Appends a <code>fo:region-body</code> to the page master and returns it. */
    public Element addRegionBody( Element pageMaster )
    {
        final Element region = foElement( "fo:region-body" );
        pageMaster.appendChild( region );
        return region;
    }

    /** Appends a named <code>fo:simple-page-master</code> to the layout master set. */
    public Element addSimplePageMaster( String masterName )
    {
        final Element master = foElement( "fo:simple-page-master" );
        master.setAttribute( "master-name", masterName );
        layoutMasterSet.appendChild( master );
        return master;
    }

    /** Creates a detached <code>fo:basic-link</code> with an external destination. */
    protected Element createBasicLinkExternal( String externalDestination )
    {
        final Element link = foElement( "fo:basic-link" );
        link.setAttribute( "external-destination", externalDestination );
        return link;
    }

    /** Creates a detached <code>fo:basic-link</code> with an internal destination. */
    public Element createBasicLinkInternal( String internalDestination )
    {
        final Element link = foElement( "fo:basic-link" );
        link.setAttribute( "internal-destination", internalDestination );
        return link;
    }

    /** Creates a detached <code>fo:block</code>. */
    public Element createBlock()
    {
        return foElement( "fo:block" );
    }

    /** Creates a detached <code>fo:external-graphic</code> whose src is <code>url('source')</code>. */
    public Element createExternalGraphic( String source )
    {
        final Element graphic = foElement( "fo:external-graphic" );
        graphic.setAttribute( "src", "url('" + source + "')" );
        return graphic;
    }

    /** Creates a detached <code>fo:inline</code>. */
    public Element createInline()
    {
        return foElement( "fo:inline" );
    }

    /** Creates a detached <code>fo:leader</code>. */
    public Element createLeader()
    {
        return foElement( "fo:leader" );
    }

    /** Creates a detached <code>fo:list-block</code>. */
    public Element createListBlock()
    {
        return foElement( "fo:list-block" );
    }

    /** Creates a detached <code>fo:list-item</code>. */
    public Element createListItem()
    {
        return foElement( "fo:list-item" );
    }

    /** Creates a detached <code>fo:list-item-body</code>. */
    public Element createListItemBody()
    {
        return foElement( "fo:list-item-body" );
    }

    /** Creates a detached <code>fo:list-item-label</code> wrapping a block with <code>text</code>. */
    public Element createListItemLabel( String text )
    {
        final Element label = foElement( "fo:list-item-label" );
        final Element textBlock = createBlock();
        textBlock.appendChild( document.createTextNode( text ) );
        label.appendChild( textBlock );
        return label;
    }

    /** Creates a detached <code>fo:table</code>. */
    protected Element createTable()
    {
        return foElement( "fo:table" );
    }

    /** Creates a detached <code>fo:table-body</code>. */
    protected Element createTableBody()
    {
        return foElement( "fo:table-body" );
    }

    /** Creates a detached <code>fo:table-cell</code>. */
    protected Element createTableCell()
    {
        return foElement( "fo:table-cell" );
    }

    /** Creates a detached <code>fo:table-header</code>. */
    protected Element createTableHeader()
    {
        return foElement( "fo:table-header" );
    }

    /** Creates a detached <code>fo:table-row</code>. */
    protected Element createTableRow()
    {
        return foElement( "fo:table-row" );
    }

    /** Creates a detached text node holding <code>data</code>. */
    protected Text createText( String data )
    {
        return document.createTextNode( data );
    }

    /** @return the DOM document passed to the constructor */
    public Document getDocument()
    {
        return document;
    }

}
-==================================================================== */ -package org.apache.poi.hwpf.extractor; - -import org.w3c.dom.Document; -import org.w3c.dom.Element; -import org.w3c.dom.Text; - -public class HtmlDocumentFacade -{ - - protected final Element body; - protected final Document document; - protected final Element head; - protected final Element html; - - public HtmlDocumentFacade( Document document ) - { - this.document = document; - - html = document.createElement( "html" ); - document.appendChild( html ); - - body = document.createElement( "body" ); - head = document.createElement( "head" ); - - html.appendChild( head ); - html.appendChild( body ); - } - - public Element createHyperlink( String internalDestination ) - { - final Element basicLink = document.createElement( "a" ); - basicLink.setAttribute( "href", internalDestination ); - return basicLink; - } - - public Element createListItem() - { - return document.createElement( "li" ); - } - - public Element createParagraph() - { - return document.createElement( "p" ); - } - - public Element createTable() - { - return document.createElement( "table" ); - } - - public Element createTableBody() - { - return document.createElement( "tbody" ); - } - - public Element createTableCell() - { - return document.createElement( "td" ); - } - - public Element createTableHeader() - { - return document.createElement( "thead" ); - } - - public Element createTableHeaderCell() - { - return document.createElement( "th" ); - } - - public Element createTableRow() - { - return document.createElement( "tr" ); - } - - public Text createText( String data ) - { - return document.createTextNode( data ); - } - - public Element createUnorderedList() - { - return document.createElement( "ul" ); - } - - public Document getDocument() - { - return document; - } - -} diff --git a/src/scratchpad/src/org/apache/poi/hwpf/extractor/NumberFormatter.java b/src/scratchpad/src/org/apache/poi/hwpf/extractor/NumberFormatter.java 
deleted file mode 100644 index d4a2cc726b..0000000000 --- a/src/scratchpad/src/org/apache/poi/hwpf/extractor/NumberFormatter.java +++ /dev/null @@ -1,62 +0,0 @@ -/* - * ==================================================================== - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
/**
 * Formats list item counters as arabic numbers, roman numerals or single
 * latin letters.
 *
 * @author Ryan Ackley
 */
public final class NumberFormatter {

  /** Roman symbols in descending value order, including subtractive pairs. */
  private static final String[] ROMAN_SYMBOLS = { "m", "cm", "d", "cd", "c", "xc", "l", "xl",
      "x", "ix", "v", "iv", "i" };

  /** Values matching {@link #ROMAN_SYMBOLS} position by position. */
  private static final int[] ROMAN_VALUES = { 1000, 900, 500, 400, 100, 90, 50, 40, 10, 9, 5,
      4, 1 };

  /** Largest value expressible with the symbols above. */
  private static final int MAX_ROMAN = 3999;

  /** Number of letters in the latin alphabet ('a'..'z'). */
  private static final int LETTER_COUNT = 26;

  private static final int T_ARABIC = 0;
  private static final int T_LOWER_LETTER = 4;
  private static final int T_LOWER_ROMAN = 2;
  private static final int T_ORDINAL = 5;
  private static final int T_UPPER_LETTER = 3;
  private static final int T_UPPER_ROMAN = 1;

  /**
   * Formats a counter value in the requested style.
   *
   * @param num
   *          counter value, 1-based for roman and letter styles
   * @param style
   *          0 arabic, 1 upper roman, 2 lower roman, 3 upper letter, 4 lower
   *          letter, 5 ordinal; ordinal and any other value are rendered as
   *          arabic
   * @return formatted value; a value that the requested style cannot represent
   *         (roman outside 1..3999, letter outside 1..26) is rendered as a
   *         decimal number instead of failing
   */
  public static String getNumber(int num, int style) {
    switch (style) {
      case T_UPPER_ROMAN:
        // Locale.ROOT keeps "i" -> "I" regardless of the default locale
        return toRoman(num).toUpperCase(java.util.Locale.ROOT);
      case T_LOWER_ROMAN:
        return toRoman(num);
      case T_UPPER_LETTER:
        return toLetter(num).toUpperCase(java.util.Locale.ROOT);
      case T_LOWER_LETTER:
        return toLetter(num);
      case T_ARABIC:
      case T_ORDINAL:
      default:
        return String.valueOf(num);
    }
  }

  /** Lower-case roman numeral for 1..3999, decimal text otherwise. */
  private static String toRoman(int num) {
    if (num < 1 || num > MAX_ROMAN) {
      return String.valueOf(num);
    }
    StringBuilder roman = new StringBuilder();
    int remaining = num;
    for (int i = 0; i < ROMAN_VALUES.length; i++) {
      while (remaining >= ROMAN_VALUES[i]) {
        roman.append(ROMAN_SYMBOLS[i]);
        remaining -= ROMAN_VALUES[i];
      }
    }
    return roman.toString();
  }

  /** Lower-case letter for 1..26 ('a'..'z'), decimal text otherwise. */
  private static String toLetter(int num) {
    if (num < 1 || num > LETTER_COUNT) {
      return String.valueOf(num);
    }
    return String.valueOf((char) ('a' + num - 1));
  }
}
license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -==================================================================== */ -package org.apache.poi.hwpf.extractor; - -import java.io.File; -import java.io.FileWriter; -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import java.util.Stack; - -import javax.xml.parsers.DocumentBuilderFactory; -import javax.xml.transform.OutputKeys; -import javax.xml.transform.Transformer; -import javax.xml.transform.TransformerFactory; -import javax.xml.transform.dom.DOMSource; -import javax.xml.transform.stream.StreamResult; - -import org.apache.poi.hwpf.HWPFDocument; -import org.apache.poi.hwpf.HWPFDocumentCore; -import org.apache.poi.hwpf.model.ListFormatOverride; -import org.apache.poi.hwpf.model.ListTables; -import org.apache.poi.hwpf.usermodel.BorderCode; -import org.apache.poi.hwpf.usermodel.CharacterRun; -import org.apache.poi.hwpf.usermodel.Paragraph; -import org.apache.poi.hwpf.usermodel.Picture; -import org.apache.poi.hwpf.usermodel.Range; -import org.apache.poi.hwpf.usermodel.Section; -import org.apache.poi.hwpf.usermodel.SectionProperties; -import org.apache.poi.hwpf.usermodel.Table; -import org.apache.poi.hwpf.usermodel.TableCell; -import org.apache.poi.hwpf.usermodel.TableIterator; -import org.apache.poi.hwpf.usermodel.TableRow; -import 
org.apache.poi.util.POILogFactory; -import org.apache.poi.util.POILogger; -import org.w3c.dom.Document; -import org.w3c.dom.Element; -import org.w3c.dom.Text; - -/** - * @author Sergey Vladimirov (vlsergey {at} gmail {dot} com) - */ -public class WordToFoExtractor extends AbstractWordExtractor -{ - - /** - * Holds properties values, applied to current fo:block element. - * Those properties shall not be doubled in children fo:inline - * elements. - */ - private static class BlockProperies - { - final boolean pBold; - final String pFontName; - final int pFontSize; - final boolean pItalic; - - public BlockProperies( String pFontName, int pFontSize, boolean pBold, - boolean pItalic ) - { - this.pFontName = pFontName; - this.pFontSize = pFontSize; - this.pBold = pBold; - this.pItalic = pItalic; - } - } - - private static final POILogger logger = POILogFactory - .getLogger( WordToFoExtractor.class ); - - public static String getBorderType( BorderCode borderCode ) - { - if ( borderCode == null ) - throw new IllegalArgumentException( "borderCode is null" ); - - switch ( borderCode.getBorderType() ) - { - case 1: - case 2: - return "solid"; - case 3: - return "double"; - case 5: - return "solid"; - case 6: - return "dotted"; - case 7: - case 8: - return "dashed"; - case 9: - return "dotted"; - case 10: - case 11: - case 12: - case 13: - case 14: - case 15: - case 16: - case 17: - case 18: - case 19: - return "double"; - case 20: - return "solid"; - case 21: - return "double"; - case 22: - return "dashed"; - case 23: - return "dashed"; - case 24: - return "ridge"; - case 25: - return "grooved"; - default: - return "solid"; - } - } - - /** - * Java main() interface to interact with WordToFoExtractor - * - *

- * Usage: WordToFoExtractor infile outfile - *

- * Where infile is an input .doc file ( Word 97-2007) which will be rendered - * as XSL-FO into outfile - * - */ - public static void main( String[] args ) - { - if ( args.length < 2 ) - { - System.err - .println( "Usage: WordToFoExtractor " ); - return; - } - - System.out.println( "Converting " + args[0] ); - System.out.println( "Saving output to " + args[1] ); - try - { - Document doc = WordToFoExtractor.process( new File( args[0] ) ); - - FileWriter out = new FileWriter( args[1] ); - DOMSource domSource = new DOMSource( doc ); - StreamResult streamResult = new StreamResult( out ); - TransformerFactory tf = TransformerFactory.newInstance(); - Transformer serializer = tf.newTransformer(); - // TODO set encoding from a command argument - serializer.setOutputProperty( OutputKeys.ENCODING, "UTF-8" ); - serializer.setOutputProperty( OutputKeys.INDENT, "yes" ); - serializer.transform( domSource, streamResult ); - out.close(); - } - catch ( Exception e ) - { - e.printStackTrace(); - } - } - - static Document process( File docFile ) throws Exception - { - final HWPFDocumentCore hwpfDocument = WordToFoUtils.loadDoc( docFile ); - WordToFoExtractor wordToFoExtractor = new WordToFoExtractor( - DocumentBuilderFactory.newInstance().newDocumentBuilder() - .newDocument() ); - wordToFoExtractor.processDocument( hwpfDocument ); - return wordToFoExtractor.getDocument(); - } - - private final Stack blocksProperies = new Stack(); - - protected final FoDocumentFacade foDocumentFacade; - - /** - * Creates new instance of {@link WordToFoExtractor}. Can be used for output - * several {@link HWPFDocument}s into single FO document. - * - * @param document - * XML DOM Document used as XSL FO document. 
Shall support - * namespaces - */ - public WordToFoExtractor( Document document ) - { - this.foDocumentFacade = new FoDocumentFacade( document ); - } - - protected String createPageMaster( SectionProperties sep, String type, - int section ) - { - float height = sep.getYaPage() / WordToFoUtils.TWIPS_PER_INCH; - float width = sep.getXaPage() / WordToFoUtils.TWIPS_PER_INCH; - float leftMargin = sep.getDxaLeft() / WordToFoUtils.TWIPS_PER_INCH; - float rightMargin = sep.getDxaRight() / WordToFoUtils.TWIPS_PER_INCH; - float topMargin = sep.getDyaTop() / WordToFoUtils.TWIPS_PER_INCH; - float bottomMargin = sep.getDyaBottom() / WordToFoUtils.TWIPS_PER_INCH; - - // add these to the header - String pageMasterName = type + "-page" + section; - - Element pageMaster = foDocumentFacade - .addSimplePageMaster( pageMasterName ); - pageMaster.setAttribute( "page-height", height + "in" ); - pageMaster.setAttribute( "page-width", width + "in" ); - - Element regionBody = foDocumentFacade.addRegionBody( pageMaster ); - regionBody.setAttribute( "margin", topMargin + "in " + rightMargin - + "in " + bottomMargin + "in " + leftMargin + "in" ); - - /* - * 6.4.14 fo:region-body - * - * The values of the padding and border-width traits must be "0". 
- */ - // WordToFoUtils.setBorder(regionBody, sep.getBrcTop(), "top"); - // WordToFoUtils.setBorder(regionBody, sep.getBrcBottom(), "bottom"); - // WordToFoUtils.setBorder(regionBody, sep.getBrcLeft(), "left"); - // WordToFoUtils.setBorder(regionBody, sep.getBrcRight(), "right"); - - if ( sep.getCcolM1() > 0 ) - { - regionBody.setAttribute( "column-count", "" - + ( sep.getCcolM1() + 1 ) ); - if ( sep.getFEvenlySpaced() ) - { - regionBody.setAttribute( "column-gap", - ( sep.getDxaColumns() / WordToFoUtils.TWIPS_PER_INCH ) - + "in" ); - } - else - { - regionBody.setAttribute( "column-gap", "0.25in" ); - } - } - - return pageMasterName; - } - - public Document getDocument() - { - return foDocumentFacade.getDocument(); - } - - @Override - protected void outputCharacters( Element block, CharacterRun characterRun, - String text ) - { - BlockProperies blockProperies = this.blocksProperies.peek(); - Element inline = foDocumentFacade.createInline(); - if ( characterRun.isBold() != blockProperies.pBold ) - { - WordToFoUtils.setBold( inline, characterRun.isBold() ); - } - if ( characterRun.isItalic() != blockProperies.pItalic ) - { - WordToFoUtils.setItalic( inline, characterRun.isItalic() ); - } - if ( characterRun.getFontName() != null - && !AbstractWordUtils.equals( characterRun.getFontName(), - blockProperies.pFontName ) ) - { - WordToFoUtils.setFontFamily( inline, characterRun.getFontName() ); - } - if ( characterRun.getFontSize() / 2 != blockProperies.pFontSize ) - { - WordToFoUtils.setFontSize( inline, characterRun.getFontSize() / 2 ); - } - WordToFoUtils.setCharactersProperties( characterRun, inline ); - block.appendChild( inline ); - - Text textNode = foDocumentFacade.createText( text ); - inline.appendChild( textNode ); - } - - protected void processHyperlink( HWPFDocumentCore hwpfDocument, - Element currentBlock, Paragraph paragraph, - List characterRuns, int currentTableLevel, - String hyperlink, int beginTextInclusive, int endTextExclusive ) - { - Element 
basicLink = foDocumentFacade - .createBasicLinkExternal( hyperlink ); - currentBlock.appendChild( basicLink ); - - if ( beginTextInclusive < endTextExclusive ) - processCharacters( hwpfDocument, currentTableLevel, paragraph, - basicLink, characterRuns, beginTextInclusive, - endTextExclusive ); - } - - /** - * This method shall store image bytes in external file and convert it if - * necessary. Images shall be stored using PNG format (for bitmap) or SVG - * (for vector). Other formats may be not supported by your XSL FO - * processor. - *

- * Please note the - * {@link WordToFoUtils#setPictureProperties(Picture, Element)} method. - * - * @param currentBlock - * currently processed FO element, like fo:block. Shall - * be used as parent of newly created - * fo:external-graphic or - * fo:instream-foreign-object - * @param inlined - * if image is inlined - * @param picture - * HWPF object, contained picture data and properties - */ - protected void processImage( Element currentBlock, boolean inlined, - Picture picture ) - { - // no default implementation -- skip - currentBlock.appendChild( foDocumentFacade.getDocument().createComment( - "Image link to '" + picture.suggestFullFileName() - + "' can be here" ) ); - } - - protected void processPageref( HWPFDocumentCore hwpfDocument, - Element currentBlock, Paragraph paragraph, - List characterRuns, int currentTableLevel, - String pageref, int beginTextInclusive, int endTextExclusive ) - { - Element basicLink = foDocumentFacade.createBasicLinkInternal( pageref ); - currentBlock.appendChild( basicLink ); - - if ( beginTextInclusive < endTextExclusive ) - processCharacters( hwpfDocument, currentTableLevel, paragraph, - basicLink, characterRuns, beginTextInclusive, - endTextExclusive ); - } - - protected void processParagraph( HWPFDocumentCore hwpfDocument, - Element parentFopElement, int currentTableLevel, - Paragraph paragraph, String bulletText ) - { - final Element block = foDocumentFacade.createBlock(); - parentFopElement.appendChild( block ); - - WordToFoUtils.setParagraphProperties( paragraph, block ); - - final int charRuns = paragraph.numCharacterRuns(); - - if ( charRuns == 0 ) - { - return; - } - - { - final String pFontName; - final int pFontSize; - final boolean pBold; - final boolean pItalic; - { - CharacterRun characterRun = paragraph.getCharacterRun( 0 ); - pFontSize = characterRun.getFontSize() / 2; - pFontName = characterRun.getFontName(); - pBold = characterRun.isBold(); - pItalic = characterRun.isItalic(); - } - WordToFoUtils.setFontFamily( 
block, pFontName ); - WordToFoUtils.setFontSize( block, pFontSize ); - WordToFoUtils.setBold( block, pBold ); - WordToFoUtils.setItalic( block, pItalic ); - - blocksProperies.push( new BlockProperies( pFontName, pFontSize, - pBold, pItalic ) ); - } - try - { - boolean haveAnyText = false; - - if ( WordToFoUtils.isNotEmpty( bulletText ) ) - { - Element inline = foDocumentFacade.createInline(); - block.appendChild( inline ); - - Text textNode = foDocumentFacade.createText( bulletText ); - inline.appendChild( textNode ); - - haveAnyText |= bulletText.trim().length() != 0; - } - - List characterRuns = WordToFoUtils - .findCharacterRuns( paragraph ); - haveAnyText = processCharacters( hwpfDocument, currentTableLevel, - paragraph, block, characterRuns, 0, characterRuns.size() ); - - if ( !haveAnyText ) - { - Element leader = foDocumentFacade.createLeader(); - block.appendChild( leader ); - } - } - finally - { - blocksProperies.pop(); - } - - return; - } - - protected void processSection( HWPFDocumentCore wordDocument, - Section section, int sectionCounter ) - { - String regularPage = createPageMaster( - WordToFoUtils.getSectionProperties( section ), "page", - sectionCounter ); - - Element pageSequence = foDocumentFacade.addPageSequence( regularPage ); - Element flow = foDocumentFacade.addFlowToPageSequence( pageSequence, - "xsl-region-body" ); - - processSectionParagraphes( wordDocument, flow, section, 0 ); - } - - protected void processSectionParagraphes( HWPFDocument wordDocument, - Element flow, Range range, int currentTableLevel ) - { - final Map allTables = new HashMap(); - for ( TableIterator tableIterator = WordToFoUtils.newTableIterator( - range, currentTableLevel + 1 ); tableIterator.hasNext(); ) - { - Table next = tableIterator.next(); - allTables.put( Integer.valueOf( next.getStartOffset() ), next ); - } - - final ListTables listTables = wordDocument.getListTables(); - int currentListInfo = 0; - - final int paragraphs = range.numParagraphs(); - for ( int p = 
0; p < paragraphs; p++ ) - { - Paragraph paragraph = range.getParagraph( p ); - - if ( allTables.containsKey( Integer.valueOf( paragraph - .getStartOffset() ) ) ) - { - Table table = allTables.get( Integer.valueOf( paragraph - .getStartOffset() ) ); - processTable( wordDocument, flow, table, currentTableLevel + 1 ); - continue; - } - - if ( paragraph.isInTable() - && paragraph.getTableLevel() != currentTableLevel ) - { - continue; - } - - if ( paragraph.getIlfo() != currentListInfo ) - { - currentListInfo = paragraph.getIlfo(); - } - - if ( currentListInfo != 0 ) - { - if ( listTables != null ) - { - final ListFormatOverride listFormatOverride = listTables - .getOverride( paragraph.getIlfo() ); - - String label = WordToFoUtils.getBulletText( listTables, - paragraph, listFormatOverride.getLsid() ); - - processParagraph( wordDocument, flow, currentTableLevel, - paragraph, label ); - } - else - { - logger.log( POILogger.WARN, - "Paragraph #" + paragraph.getStartOffset() + "-" - + paragraph.getEndOffset() - + " has reference to list structure #" - + currentListInfo - + ", but listTables not defined in file" ); - - processParagraph( wordDocument, flow, currentTableLevel, - paragraph, WordToFoUtils.EMPTY ); - } - } - else - { - processParagraph( wordDocument, flow, currentTableLevel, - paragraph, WordToFoUtils.EMPTY ); - } - } - - } - - protected void processTable( HWPFDocumentCore wordDocument, Element flow, - Table table, int thisTableLevel ) - { - Element tableHeader = foDocumentFacade.createTableHeader(); - Element tableBody = foDocumentFacade.createTableBody(); - - final int tableRows = table.numRows(); - - int maxColumns = Integer.MIN_VALUE; - for ( int r = 0; r < tableRows; r++ ) - { - maxColumns = Math.max( maxColumns, table.getRow( r ).numCells() ); - } - - for ( int r = 0; r < tableRows; r++ ) - { - TableRow tableRow = table.getRow( r ); - - Element tableRowElement = foDocumentFacade.createTableRow(); - WordToFoUtils.setTableRowProperties( tableRow, 
tableRowElement ); - - final int rowCells = tableRow.numCells(); - for ( int c = 0; c < rowCells; c++ ) - { - TableCell tableCell = tableRow.getCell( c ); - - if ( tableCell.isMerged() && !tableCell.isFirstMerged() ) - continue; - - if ( tableCell.isVerticallyMerged() - && !tableCell.isFirstVerticallyMerged() ) - continue; - - Element tableCellElement = foDocumentFacade.createTableCell(); - WordToFoUtils.setTableCellProperties( tableRow, tableCell, - tableCellElement, r == 0, r == tableRows - 1, c == 0, - c == rowCells - 1 ); - - if ( tableCell.isFirstMerged() ) - { - int count = 0; - for ( int c1 = c; c1 < rowCells; c1++ ) - { - TableCell nextCell = tableRow.getCell( c1 ); - if ( nextCell.isMerged() ) - count++; - if ( !nextCell.isMerged() ) - break; - } - tableCellElement.setAttribute( "number-columns-spanned", "" - + count ); - } - else - { - if ( c == rowCells - 1 && c != maxColumns - 1 ) - { - tableCellElement.setAttribute( - "number-columns-spanned", "" - + ( maxColumns - c ) ); - } - } - - if ( tableCell.isFirstVerticallyMerged() ) - { - int count = 0; - for ( int r1 = r; r1 < tableRows; r1++ ) - { - TableRow nextRow = table.getRow( r1 ); - if ( nextRow.numCells() < c ) - break; - TableCell nextCell = nextRow.getCell( c ); - if ( nextCell.isVerticallyMerged() ) - count++; - if ( !nextCell.isVerticallyMerged() ) - break; - } - tableCellElement.setAttribute( "number-rows-spanned", "" - + count ); - } - - processSectionParagraphes( wordDocument, tableCellElement, - tableCell, thisTableLevel ); - - if ( !tableCellElement.hasChildNodes() ) - { - tableCellElement.appendChild( foDocumentFacade - .createBlock() ); - } - - tableRowElement.appendChild( tableCellElement ); - } - - if ( tableRow.isTableHeader() ) - { - tableHeader.appendChild( tableRowElement ); - } - else - { - tableBody.appendChild( tableRowElement ); - } - } - - final Element tableElement = foDocumentFacade.createTable(); - if ( tableHeader.hasChildNodes() ) - { - tableElement.appendChild( 
tableHeader ); - } - if ( tableBody.hasChildNodes() ) - { - tableElement.appendChild( tableBody ); - flow.appendChild( tableElement ); - } - else - { - logger.log( - POILogger.WARN, - "Table without body starting on offset " - + table.getStartOffset() + " -- " - + table.getEndOffset() ); - } - } - -} diff --git a/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordToFoUtils.java b/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordToFoUtils.java deleted file mode 100644 index 1b3447f006..0000000000 --- a/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordToFoUtils.java +++ /dev/null @@ -1,323 +0,0 @@ -/* ==================================================================== - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-==================================================================== */ -package org.apache.poi.hwpf.extractor; - -import org.apache.poi.hwpf.usermodel.BorderCode; -import org.apache.poi.hwpf.usermodel.CharacterProperties; -import org.apache.poi.hwpf.usermodel.CharacterRun; -import org.apache.poi.hwpf.usermodel.Paragraph; -import org.apache.poi.hwpf.usermodel.Picture; -import org.apache.poi.hwpf.usermodel.TableCell; -import org.apache.poi.hwpf.usermodel.TableRow; -import org.w3c.dom.Element; - -public class WordToFoUtils extends AbstractWordUtils -{ - public static void setBold( final Element element, final boolean bold ) - { - element.setAttribute( "font-weight", bold ? "bold" : "normal" ); - } - - public static void setBorder( Element element, BorderCode borderCode, - String where ) - { - if ( element == null ) - throw new IllegalArgumentException( "element is null" ); - - if ( borderCode == null || borderCode.getBorderType() == 0 ) - return; - - if ( isEmpty( where ) ) - { - element.setAttribute( "border-style", getBorderType( borderCode ) ); - element.setAttribute( "border-color", - getColor( borderCode.getColor() ) ); - element.setAttribute( "border-width", getBorderWidth( borderCode ) ); - } - else - { - element.setAttribute( "border-" + where + "-style", - getBorderType( borderCode ) ); - element.setAttribute( "border-" + where + "-color", - getColor( borderCode.getColor() ) ); - element.setAttribute( "border-" + where + "-width", - getBorderWidth( borderCode ) ); - } - } - - public static void setCharactersProperties( - final CharacterRun characterRun, final Element inline ) - { - final CharacterProperties clonedProperties = characterRun - .cloneProperties(); - StringBuilder textDecorations = new StringBuilder(); - - setBorder( inline, clonedProperties.getBrc(), EMPTY ); - - if ( characterRun.isCapitalized() ) - { - inline.setAttribute( "text-transform", "uppercase" ); - } - if ( characterRun.isHighlighted() ) - { - inline.setAttribute( "background-color", 
- getColor( clonedProperties.getIcoHighlight() ) ); - } - if ( characterRun.isStrikeThrough() ) - { - if ( textDecorations.length() > 0 ) - textDecorations.append( " " ); - textDecorations.append( "line-through" ); - } - if ( characterRun.isShadowed() ) - { - inline.setAttribute( "text-shadow", characterRun.getFontSize() / 24 - + "pt" ); - } - if ( characterRun.isSmallCaps() ) - { - inline.setAttribute( "font-variant", "small-caps" ); - } - if ( characterRun.getSubSuperScriptIndex() == 1 ) - { - inline.setAttribute( "baseline-shift", "super" ); - inline.setAttribute( "font-size", "smaller" ); - } - if ( characterRun.getSubSuperScriptIndex() == 2 ) - { - inline.setAttribute( "baseline-shift", "sub" ); - inline.setAttribute( "font-size", "smaller" ); - } - if ( characterRun.getUnderlineCode() > 0 ) - { - if ( textDecorations.length() > 0 ) - textDecorations.append( " " ); - textDecorations.append( "underline" ); - } - if ( characterRun.isVanished() ) - { - inline.setAttribute( "visibility", "hidden" ); - } - if ( textDecorations.length() > 0 ) - { - inline.setAttribute( "text-decoration", textDecorations.toString() ); - } - } - - public static void setFontFamily( final Element element, - final String fontFamily ) - { - if ( isEmpty( fontFamily ) ) - return; - - element.setAttribute( "font-family", fontFamily ); - } - - public static void setFontSize( final Element element, final int fontSize ) - { - element.setAttribute( "font-size", String.valueOf( fontSize ) ); - } - - public static void setIndent( Paragraph paragraph, Element block ) - { - if ( paragraph.getFirstLineIndent() != 0 ) - { - block.setAttribute( - "text-indent", - String.valueOf( paragraph.getFirstLineIndent() - / TWIPS_PER_PT ) - + "pt" ); - } - if ( paragraph.getIndentFromLeft() != 0 ) - { - block.setAttribute( - "start-indent", - String.valueOf( paragraph.getIndentFromLeft() - / TWIPS_PER_PT ) - + "pt" ); - } - if ( paragraph.getIndentFromRight() != 0 ) - { - block.setAttribute( - "end-indent", - 
String.valueOf( paragraph.getIndentFromRight() - / TWIPS_PER_PT ) - + "pt" ); - } - if ( paragraph.getSpacingBefore() != 0 ) - { - block.setAttribute( - "space-before", - String.valueOf( paragraph.getSpacingBefore() / TWIPS_PER_PT ) - + "pt" ); - } - if ( paragraph.getSpacingAfter() != 0 ) - { - block.setAttribute( "space-after", - String.valueOf( paragraph.getSpacingAfter() / TWIPS_PER_PT ) - + "pt" ); - } - } - - public static void setItalic( final Element element, final boolean italic ) - { - element.setAttribute( "font-style", italic ? "italic" : "normal" ); - } - - public static void setJustification( Paragraph paragraph, - final Element element ) - { - String justification = getJustification( paragraph.getJustification() ); - if ( isNotEmpty( justification ) ) - element.setAttribute( "text-align", justification ); - } - - public static void setParagraphProperties( Paragraph paragraph, - Element block ) - { - setIndent( paragraph, block ); - setJustification( paragraph, block ); - - setBorder( block, paragraph.getBottomBorder(), "bottom" ); - setBorder( block, paragraph.getLeftBorder(), "left" ); - setBorder( block, paragraph.getRightBorder(), "right" ); - setBorder( block, paragraph.getTopBorder(), "top" ); - - if ( paragraph.pageBreakBefore() ) - { - block.setAttribute( "break-before", "page" ); - } - - block.setAttribute( "hyphenate", - String.valueOf( paragraph.isAutoHyphenated() ) ); - - if ( paragraph.keepOnPage() ) - { - block.setAttribute( "keep-together.within-page", "always" ); - } - - if ( paragraph.keepWithNext() ) - { - block.setAttribute( "keep-with-next.within-page", "always" ); - } - - block.setAttribute( "linefeed-treatment", "preserve" ); - block.setAttribute( "white-space-collapse", "false" ); - } - - public static void setPictureProperties( Picture picture, - Element graphicElement ) - { - final int aspectRatioX = picture.getAspectRatioX(); - final int aspectRatioY = picture.getAspectRatioY(); - - if ( aspectRatioX > 0 ) - { - 
graphicElement - .setAttribute( "content-width", ( ( picture.getDxaGoal() - * aspectRatioX / 100 ) / TWIPS_PER_PT ) - + "pt" ); - } - else - graphicElement.setAttribute( "content-width", - ( picture.getDxaGoal() / TWIPS_PER_PT ) + "pt" ); - - if ( aspectRatioY > 0 ) - graphicElement - .setAttribute( "content-height", ( ( picture.getDyaGoal() - * aspectRatioY / 100 ) / TWIPS_PER_PT ) - + "pt" ); - else - graphicElement.setAttribute( "content-height", - ( picture.getDyaGoal() / TWIPS_PER_PT ) + "pt" ); - - if ( aspectRatioX <= 0 || aspectRatioY <= 0 ) - { - graphicElement.setAttribute( "scaling", "uniform" ); - } - else - { - graphicElement.setAttribute( "scaling", "non-uniform" ); - } - - graphicElement.setAttribute( "vertical-align", "text-bottom" ); - - if ( picture.getDyaCropTop() != 0 || picture.getDxaCropRight() != 0 - || picture.getDyaCropBottom() != 0 - || picture.getDxaCropLeft() != 0 ) - { - int rectTop = picture.getDyaCropTop() / TWIPS_PER_PT; - int rectRight = picture.getDxaCropRight() / TWIPS_PER_PT; - int rectBottom = picture.getDyaCropBottom() / TWIPS_PER_PT; - int rectLeft = picture.getDxaCropLeft() / TWIPS_PER_PT; - graphicElement.setAttribute( "clip", "rect(" + rectTop + "pt, " - + rectRight + "pt, " + rectBottom + "pt, " + rectLeft - + "pt)" ); - graphicElement.setAttribute( "oveerflow", "hidden" ); - } - } - - public static void setTableCellProperties( TableRow tableRow, - TableCell tableCell, Element element, boolean toppest, - boolean bottomest, boolean leftest, boolean rightest ) - { - element.setAttribute( "width", ( tableCell.getWidth() / TWIPS_PER_INCH ) - + "in" ); - element.setAttribute( "padding-start", - ( tableRow.getGapHalf() / TWIPS_PER_INCH ) + "in" ); - element.setAttribute( "padding-end", - ( tableRow.getGapHalf() / TWIPS_PER_INCH ) + "in" ); - - BorderCode top = tableCell.getBrcTop() != null - && tableCell.getBrcTop().getBorderType() != 0 ? tableCell - .getBrcTop() : toppest ? 
tableRow.getTopBorder() : tableRow - .getHorizontalBorder(); - BorderCode bottom = tableCell.getBrcBottom() != null - && tableCell.getBrcBottom().getBorderType() != 0 ? tableCell - .getBrcBottom() : bottomest ? tableRow.getBottomBorder() - : tableRow.getHorizontalBorder(); - - BorderCode left = tableCell.getBrcLeft() != null - && tableCell.getBrcLeft().getBorderType() != 0 ? tableCell - .getBrcLeft() : leftest ? tableRow.getLeftBorder() : tableRow - .getVerticalBorder(); - BorderCode right = tableCell.getBrcRight() != null - && tableCell.getBrcRight().getBorderType() != 0 ? tableCell - .getBrcRight() : rightest ? tableRow.getRightBorder() - : tableRow.getVerticalBorder(); - - setBorder( element, bottom, "bottom" ); - setBorder( element, left, "left" ); - setBorder( element, right, "right" ); - setBorder( element, top, "top" ); - } - - public static void setTableRowProperties( TableRow tableRow, - Element tableRowElement ) - { - if ( tableRow.getRowHeight() > 0 ) - { - tableRowElement.setAttribute( "height", - ( tableRow.getRowHeight() / TWIPS_PER_INCH ) + "in" ); - } - if ( !tableRow.cantSplit() ) - { - tableRowElement.setAttribute( "keep-together", "always" ); - } - } - -} diff --git a/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordToHtmlExtractor.java b/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordToHtmlExtractor.java deleted file mode 100644 index 6f27e443c8..0000000000 --- a/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordToHtmlExtractor.java +++ /dev/null @@ -1,475 +0,0 @@ -/* ==================================================================== - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. 
You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -==================================================================== */ -package org.apache.poi.hwpf.extractor; - -import java.io.File; -import java.io.FileWriter; -import java.util.List; -import java.util.Stack; - -import javax.xml.parsers.DocumentBuilderFactory; -import javax.xml.transform.OutputKeys; -import javax.xml.transform.Transformer; -import javax.xml.transform.TransformerFactory; -import javax.xml.transform.dom.DOMSource; -import javax.xml.transform.stream.StreamResult; - -import org.apache.poi.hwpf.HWPFDocument; -import org.apache.poi.hwpf.HWPFDocumentCore; -import org.apache.poi.hwpf.usermodel.CharacterRun; -import org.apache.poi.hwpf.usermodel.Paragraph; -import org.apache.poi.hwpf.usermodel.Picture; -import org.apache.poi.hwpf.usermodel.Section; -import org.apache.poi.hwpf.usermodel.SectionProperties; -import org.apache.poi.hwpf.usermodel.Table; -import org.apache.poi.hwpf.usermodel.TableCell; -import org.apache.poi.hwpf.usermodel.TableRow; -import org.apache.poi.util.POILogFactory; -import org.apache.poi.util.POILogger; -import org.w3c.dom.Document; -import org.w3c.dom.Element; -import org.w3c.dom.Text; - -import static org.apache.poi.hwpf.extractor.AbstractWordUtils.TWIPS_PER_INCH; - -/** - * @author Sergey Vladimirov (vlsergey {at} gmail {dot} com) - */ -public class WordToHtmlExtractor extends AbstractWordExtractor -{ - - /** - * Holds properties values, applied to current p element. Those - * properties shall not be doubled in children span elements. 
- */ - private static class BlockProperies - { - final String pFontName; - final int pFontSize; - - public BlockProperies( String pFontName, int pFontSize ) - { - this.pFontName = pFontName; - this.pFontSize = pFontSize; - } - } - - private static final POILogger logger = POILogFactory - .getLogger( WordToHtmlExtractor.class ); - - private static String getSectionStyle( Section section ) - { - SectionProperties sep = WordToHtmlUtils.getSectionProperties( section ); - - float leftMargin = sep.getDxaLeft() / TWIPS_PER_INCH; - float rightMargin = sep.getDxaRight() / TWIPS_PER_INCH; - float topMargin = sep.getDyaTop() / TWIPS_PER_INCH; - float bottomMargin = sep.getDyaBottom() / TWIPS_PER_INCH; - - String style = "margin: " + topMargin + "in " + rightMargin + "in " - + bottomMargin + "in " + leftMargin + "in; "; - - if ( sep.getCcolM1() > 0 ) - { - style += "column-count: " + ( sep.getCcolM1() + 1 ) + "; "; - if ( sep.getFEvenlySpaced() ) - { - style += "column-gap: " - + ( sep.getDxaColumns() / TWIPS_PER_INCH ) + "in; "; - } - else - { - style += "column-gap: 0.25in; "; - } - } - return style; - } - - /** - * Java main() interface to interact with WordToHtmlExtractor - * - *

- * Usage: WordToHtmlExtractor infile outfile - *

- * Where infile is an input .doc file ( Word 95-2007) which will be rendered - * as HTML into outfile - */ - public static void main( String[] args ) - { - if ( args.length < 2 ) - { - System.err - .println( "Usage: WordToHtmlExtractor " ); - return; - } - - System.out.println( "Converting " + args[0] ); - System.out.println( "Saving output to " + args[1] ); - try - { - Document doc = WordToHtmlExtractor.process( new File( args[0] ) ); - - FileWriter out = new FileWriter( args[1] ); - DOMSource domSource = new DOMSource( doc ); - StreamResult streamResult = new StreamResult( out ); - - TransformerFactory tf = TransformerFactory.newInstance(); - Transformer serializer = tf.newTransformer(); - // TODO set encoding from a command argument - serializer.setOutputProperty( OutputKeys.ENCODING, "UTF-8" ); - serializer.setOutputProperty( OutputKeys.INDENT, "yes" ); - serializer.setOutputProperty( OutputKeys.METHOD, "html" ); - serializer.transform( domSource, streamResult ); - out.close(); - } - catch ( Exception e ) - { - e.printStackTrace(); - } - } - - static Document process( File docFile ) throws Exception - { - final HWPFDocumentCore wordDocument = WordToHtmlUtils.loadDoc( docFile ); - WordToHtmlExtractor wordToHtmlExtractor = new WordToHtmlExtractor( - DocumentBuilderFactory.newInstance().newDocumentBuilder() - .newDocument() ); - wordToHtmlExtractor.processDocument( wordDocument ); - return wordToHtmlExtractor.getDocument(); - } - - private final Stack blocksProperies = new Stack(); - - private final HtmlDocumentFacade htmlDocumentFacade; - - /** - * Creates new instance of {@link WordToHtmlExtractor}. Can be used for - * output several {@link HWPFDocument}s into single HTML document. 
- * - * @param document - * XML DOM Document used as HTML document - */ - public WordToHtmlExtractor( Document document ) - { - this.htmlDocumentFacade = new HtmlDocumentFacade( document ); - } - - public Document getDocument() - { - return htmlDocumentFacade.getDocument(); - } - - @Override - protected void outputCharacters( Element pElement, - CharacterRun characterRun, String text ) - { - Element span = htmlDocumentFacade.document.createElement( "span" ); - pElement.appendChild( span ); - - StringBuilder style = new StringBuilder(); - BlockProperies blockProperies = this.blocksProperies.peek(); - if ( characterRun.getFontName() != null - && !WordToHtmlUtils.equals( characterRun.getFontName(), - blockProperies.pFontName ) ) - { - style.append( "font-family: " + characterRun.getFontName() + "; " ); - } - if ( characterRun.getFontSize() / 2 != blockProperies.pFontSize ) - { - style.append( "font-size: " + characterRun.getFontSize() / 2 + "; " ); - } - - WordToHtmlUtils.addCharactersProperties( characterRun, style ); - if ( style.length() != 0 ) - span.setAttribute( "style", style.toString() ); - - Text textNode = htmlDocumentFacade.createText( text ); - span.appendChild( textNode ); - } - - protected void processHyperlink( HWPFDocumentCore wordDocument, - Element currentBlock, Paragraph paragraph, - List characterRuns, int currentTableLevel, - String hyperlink, int beginTextInclusive, int endTextExclusive ) - { - Element basicLink = htmlDocumentFacade.createHyperlink( hyperlink ); - currentBlock.appendChild( basicLink ); - - if ( beginTextInclusive < endTextExclusive ) - processCharacters( wordDocument, currentTableLevel, paragraph, - basicLink, characterRuns, beginTextInclusive, - endTextExclusive ); - } - - /** - * This method shall store image bytes in external file and convert it if - * necessary. Images shall be stored using PNG format. Other formats may be - * not supported by user browser. - *

- * Please note the - * {@link WordToHtmlUtils#setPictureProperties(Picture, Element)} method. - * - * @param currentBlock - * currently processed HTML element, like p. Shall be - * used as parent of newly created img - * @param inlined - * if image is inlined - * @param picture - * HWPF object, contained picture data and properties - */ - protected void processImage( Element currentBlock, boolean inlined, - Picture picture ) - { - // no default implementation -- skip - currentBlock.appendChild( htmlDocumentFacade.document - .createComment( "Image link to '" - + picture.suggestFullFileName() + "' can be here" ) ); - } - - protected void processPageref( HWPFDocumentCore hwpfDocument, - Element currentBlock, Paragraph paragraph, - List characterRuns, int currentTableLevel, - String pageref, int beginTextInclusive, int endTextExclusive ) - { - Element basicLink = htmlDocumentFacade.createHyperlink( "#" + pageref ); - currentBlock.appendChild( basicLink ); - - if ( beginTextInclusive < endTextExclusive ) - processCharacters( hwpfDocument, currentTableLevel, paragraph, - basicLink, characterRuns, beginTextInclusive, - endTextExclusive ); - } - - protected void processParagraph( HWPFDocumentCore hwpfDocument, - Element parentFopElement, int currentTableLevel, - Paragraph paragraph, String bulletText ) - { - final Element pElement = htmlDocumentFacade.createParagraph(); - parentFopElement.appendChild( pElement ); - - StringBuilder style = new StringBuilder(); - WordToHtmlUtils.addParagraphProperties( paragraph, style ); - - final int charRuns = paragraph.numCharacterRuns(); - - if ( charRuns == 0 ) - { - return; - } - - { - final String pFontName; - final int pFontSize; - final CharacterRun characterRun = paragraph.getCharacterRun( 0 ); - if ( characterRun != null ) - { - pFontSize = characterRun.getFontSize() / 2; - pFontName = characterRun.getFontName(); - WordToHtmlUtils.addFontFamily( pFontName, style ); - WordToHtmlUtils.addFontSize( pFontSize, style ); - } - else - 
{ - pFontSize = -1; - pFontName = WordToHtmlUtils.EMPTY; - } - blocksProperies.push( new BlockProperies( pFontName, pFontSize ) ); - } - try - { - if ( WordToHtmlUtils.isNotEmpty( bulletText ) ) - { - Text textNode = htmlDocumentFacade.createText( bulletText ); - pElement.appendChild( textNode ); - } - - List characterRuns = WordToHtmlUtils - .findCharacterRuns( paragraph ); - processCharacters( hwpfDocument, currentTableLevel, paragraph, - pElement, characterRuns, 0, characterRuns.size() ); - } - finally - { - blocksProperies.pop(); - } - - if ( style.length() > 0 ) - pElement.setAttribute( "style", style.toString() ); - - return; - } - - protected void processSection( HWPFDocumentCore wordDocument, - Section section, int sectionCounter ) - { - Element div = htmlDocumentFacade.document.createElement( "div" ); - div.setAttribute( "style", getSectionStyle( section ) ); - htmlDocumentFacade.body.appendChild( div ); - - processSectionParagraphes( wordDocument, div, section, 0 ); - } - - @Override - protected void processSingleSection( HWPFDocumentCore wordDocument, - Section section ) - { - htmlDocumentFacade.body.setAttribute( "style", - getSectionStyle( section ) ); - - processSectionParagraphes( wordDocument, htmlDocumentFacade.body, - section, 0 ); - } - - protected void processTable( HWPFDocumentCore hwpfDocument, Element flow, - Table table, int thisTableLevel ) - { - Element tableHeader = htmlDocumentFacade.createTableHeader(); - Element tableBody = htmlDocumentFacade.createTableBody(); - - final int tableRows = table.numRows(); - - int maxColumns = Integer.MIN_VALUE; - for ( int r = 0; r < tableRows; r++ ) - { - maxColumns = Math.max( maxColumns, table.getRow( r ).numCells() ); - } - - for ( int r = 0; r < tableRows; r++ ) - { - TableRow tableRow = table.getRow( r ); - - Element tableRowElement = htmlDocumentFacade.createTableRow(); - StringBuilder tableRowStyle = new StringBuilder(); - WordToHtmlUtils.addTableRowProperties( tableRow, tableRowStyle ); - - 
final int rowCells = tableRow.numCells(); - for ( int c = 0; c < rowCells; c++ ) - { - TableCell tableCell = tableRow.getCell( c ); - - if ( tableCell.isMerged() && !tableCell.isFirstMerged() ) - continue; - - if ( tableCell.isVerticallyMerged() - && !tableCell.isFirstVerticallyMerged() ) - continue; - - Element tableCellElement; - if ( tableRow.isTableHeader() ) - { - tableCellElement = htmlDocumentFacade - .createTableHeaderCell(); - } - else - { - tableCellElement = htmlDocumentFacade.createTableCell(); - } - StringBuilder tableCellStyle = new StringBuilder(); - WordToHtmlUtils.addTableCellProperties( tableRow, tableCell, - r == 0, r == tableRows - 1, c == 0, c == rowCells - 1, - tableCellStyle ); - - if ( tableCell.isFirstMerged() ) - { - int count = 0; - for ( int c1 = c; c1 < rowCells; c1++ ) - { - TableCell nextCell = tableRow.getCell( c1 ); - if ( nextCell.isMerged() ) - count++; - if ( !nextCell.isMerged() ) - break; - } - tableCellElement.setAttribute( "colspan", "" + count ); - } - else - { - if ( c == rowCells - 1 && c != maxColumns - 1 ) - { - tableCellElement.setAttribute( "colspan", "" - + ( maxColumns - c ) ); - } - } - - if ( tableCell.isFirstVerticallyMerged() ) - { - int count = 0; - for ( int r1 = r; r1 < tableRows; r1++ ) - { - TableRow nextRow = table.getRow( r1 ); - if ( nextRow.numCells() < c ) - break; - TableCell nextCell = nextRow.getCell( c ); - if ( nextCell.isVerticallyMerged() ) - count++; - if ( !nextCell.isVerticallyMerged() ) - break; - } - tableCellElement.setAttribute( "rowspan", "" + count ); - } - - processSectionParagraphes( hwpfDocument, tableCellElement, - tableCell, thisTableLevel ); - - if ( !tableCellElement.hasChildNodes() ) - { - tableCellElement.appendChild( htmlDocumentFacade - .createParagraph() ); - } - if ( tableCellStyle.length() > 0 ) - tableCellElement.setAttribute( "style", - tableCellStyle.toString() ); - - tableRowElement.appendChild( tableCellElement ); - } - - if ( tableRowStyle.length() > 0 ) - 
tableRowElement - .setAttribute( "style", tableRowStyle.toString() ); - - if ( tableRow.isTableHeader() ) - { - tableHeader.appendChild( tableRowElement ); - } - else - { - tableBody.appendChild( tableRowElement ); - } - - } - - final Element tableElement = htmlDocumentFacade.createTable(); - if ( tableHeader.hasChildNodes() ) - { - tableElement.appendChild( tableHeader ); - } - if ( tableBody.hasChildNodes() ) - { - tableElement.appendChild( tableBody ); - flow.appendChild( tableElement ); - } - else - { - logger.log( - POILogger.WARN, - "Table without body starting on offset " - + table.getStartOffset() + " -- " - + table.getEndOffset() ); - } - } - -} diff --git a/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordToHtmlUtils.java b/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordToHtmlUtils.java deleted file mode 100644 index 4417f62017..0000000000 --- a/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordToHtmlUtils.java +++ /dev/null @@ -1,292 +0,0 @@ -/* ==================================================================== - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-==================================================================== */ -package org.apache.poi.hwpf.extractor; - -import org.apache.poi.hwpf.usermodel.BorderCode; -import org.apache.poi.hwpf.usermodel.CharacterProperties; -import org.apache.poi.hwpf.usermodel.CharacterRun; -import org.apache.poi.hwpf.usermodel.Paragraph; -import org.apache.poi.hwpf.usermodel.Picture; -import org.apache.poi.hwpf.usermodel.TableCell; -import org.apache.poi.hwpf.usermodel.TableRow; -import org.w3c.dom.Element; - -public class WordToHtmlUtils extends AbstractWordUtils -{ - public static void addBold( final boolean bold, StringBuilder style ) - { - style.append( "font-weight: " + ( bold ? "bold" : "normal" ) + ";" ); - } - - public static void addBorder( BorderCode borderCode, String where, - StringBuilder style ) - { - if ( borderCode == null || borderCode.getBorderType() == 0 ) - return; - - if ( isEmpty( where ) ) - { - style.append( "border-style: " + getBorderType( borderCode ) + "; " ); - style.append( "border-color: " + getColor( borderCode.getColor() ) - + "; " ); - style.append( "border-width: " + getBorderWidth( borderCode ) - + "; " ); - } - else - { - style.append( "border-" + where + "-style: " - + getBorderType( borderCode ) + "; " ); - style.append( "border-" + where + "-color: " - + getColor( borderCode.getColor() ) + "; " ); - style.append( "border-" + where + "-width: " - + getBorderWidth( borderCode ) + "; " ); - } - } - - public static void addCharactersProperties( - final CharacterRun characterRun, StringBuilder style ) - { - final CharacterProperties clonedProperties = characterRun - .cloneProperties(); - - if ( characterRun.isBold() ) - { - style.append( "font-weight: bold; " ); - } - if ( characterRun.isItalic() ) - { - style.append( "font-style: italic; " ); - } - - addBorder( clonedProperties.getBrc(), EMPTY, style ); - - if ( characterRun.isCapitalized() ) - { - style.append( "text-transform: uppercase; " ); - } - if ( characterRun.isHighlighted() ) - { - 
style.append( "background-color: " - + getColor( clonedProperties.getIcoHighlight() ) + "; " ); - } - if ( characterRun.isStrikeThrough() ) - { - style.append( "text-decoration: line-through; " ); - } - if ( characterRun.isShadowed() ) - { - style.append( "text-shadow: " + characterRun.getFontSize() / 24 - + "pt; " ); - } - if ( characterRun.isSmallCaps() ) - { - style.append( "font-variant: small-caps; " ); - } - if ( characterRun.getSubSuperScriptIndex() == 1 ) - { - style.append( "baseline-shift: super; " ); - style.append( "font-size: smaller; " ); - } - if ( characterRun.getSubSuperScriptIndex() == 2 ) - { - style.append( "baseline-shift: sub; " ); - style.append( "font-size: smaller; " ); - } - if ( characterRun.getUnderlineCode() > 0 ) - { - style.append( "text-decoration: underline; " ); - } - if ( characterRun.isVanished() ) - { - style.append( "visibility: hidden; " ); - } - } - - public static void addFontFamily( final String fontFamily, - StringBuilder style ) - { - if ( isEmpty( fontFamily ) ) - return; - - style.append( "font-family: " + fontFamily ); - } - - public static void addFontSize( final int fontSize, StringBuilder style ) - { - style.append( "font-size: " + fontSize ); - } - - public static void addIndent( Paragraph paragraph, StringBuilder style ) - { - addIndent( style, "text-indent", paragraph.getFirstLineIndent() ); - addIndent( style, "start-indent", paragraph.getIndentFromLeft() ); - addIndent( style, "end-indent", paragraph.getIndentFromRight() ); - addIndent( style, "space-before", paragraph.getSpacingBefore() ); - addIndent( style, "space-after", paragraph.getSpacingAfter() ); - } - - private static void addIndent( StringBuilder style, final String cssName, - final int twipsValue ) - { - if ( twipsValue == 0 ) - return; - - style.append( cssName + ": " + ( twipsValue / TWIPS_PER_PT ) + "pt; " ); - } - - public static void addJustification( Paragraph paragraph, - final StringBuilder style ) - { - String justification = 
getJustification( paragraph.getJustification() ); - if ( isNotEmpty( justification ) ) - style.append( "text-align: " + justification + "; " ); - } - - public static void addParagraphProperties( Paragraph paragraph, - StringBuilder style ) - { - addIndent( paragraph, style ); - addJustification( paragraph, style ); - - addBorder( paragraph.getBottomBorder(), "bottom", style ); - addBorder( paragraph.getLeftBorder(), "left", style ); - addBorder( paragraph.getRightBorder(), "right", style ); - addBorder( paragraph.getTopBorder(), "top", style ); - - if ( paragraph.pageBreakBefore() ) - { - style.append( "break-before: page; " ); - } - - style.append( "hyphenate: " + paragraph.isAutoHyphenated() + "; " ); - - if ( paragraph.keepOnPage() ) - { - style.append( "keep-together.within-page: always; " ); - } - - if ( paragraph.keepWithNext() ) - { - style.append( "keep-with-next.within-page: always; " ); - } - - style.append( "linefeed-treatment: preserve; " ); - style.append( "white-space-collapse: false; " ); - } - - public static void addTableCellProperties( TableRow tableRow, - TableCell tableCell, boolean toppest, boolean bottomest, - boolean leftest, boolean rightest, StringBuilder style ) - { - style.append( "width: " + ( tableCell.getWidth() / TWIPS_PER_INCH ) - + "in; " ); - style.append( "padding-start: " - + ( tableRow.getGapHalf() / TWIPS_PER_INCH ) + "in; " ); - style.append( "padding-end: " - + ( tableRow.getGapHalf() / TWIPS_PER_INCH ) + "in; " ); - - BorderCode top = tableCell.getBrcTop() != null - && tableCell.getBrcTop().getBorderType() != 0 ? tableCell - .getBrcTop() : toppest ? tableRow.getTopBorder() : tableRow - .getHorizontalBorder(); - BorderCode bottom = tableCell.getBrcBottom() != null - && tableCell.getBrcBottom().getBorderType() != 0 ? tableCell - .getBrcBottom() : bottomest ? tableRow.getBottomBorder() - : tableRow.getHorizontalBorder(); - - BorderCode left = tableCell.getBrcLeft() != null - && tableCell.getBrcLeft().getBorderType() != 0 ? 
tableCell - .getBrcLeft() : leftest ? tableRow.getLeftBorder() : tableRow - .getVerticalBorder(); - BorderCode right = tableCell.getBrcRight() != null - && tableCell.getBrcRight().getBorderType() != 0 ? tableCell - .getBrcRight() : rightest ? tableRow.getRightBorder() - : tableRow.getVerticalBorder(); - - addBorder( bottom, "bottom", style ); - addBorder( left, "left", style ); - addBorder( right, "right", style ); - addBorder( top, "top", style ); - } - - public static void addTableRowProperties( TableRow tableRow, - StringBuilder style ) - { - if ( tableRow.getRowHeight() > 0 ) - { - style.append( "height: " - + ( tableRow.getRowHeight() / TWIPS_PER_INCH ) + "in; " ); - } - if ( !tableRow.cantSplit() ) - { - style.append( "keep-together: always; " ); - } - } - - public static void setPictureProperties( Picture picture, - Element graphicElement ) - { - final int aspectRatioX = picture.getAspectRatioX(); - final int aspectRatioY = picture.getAspectRatioY(); - - if ( aspectRatioX > 0 ) - { - graphicElement - .setAttribute( "content-width", ( ( picture.getDxaGoal() - * aspectRatioX / 100 ) / TWIPS_PER_PT ) - + "pt" ); - } - else - graphicElement.setAttribute( "content-width", - ( picture.getDxaGoal() / TWIPS_PER_PT ) + "pt" ); - - if ( aspectRatioY > 0 ) - graphicElement - .setAttribute( "content-height", ( ( picture.getDyaGoal() - * aspectRatioY / 100 ) / TWIPS_PER_PT ) - + "pt" ); - else - graphicElement.setAttribute( "content-height", - ( picture.getDyaGoal() / TWIPS_PER_PT ) + "pt" ); - - if ( aspectRatioX <= 0 || aspectRatioY <= 0 ) - { - graphicElement.setAttribute( "scaling", "uniform" ); - } - else - { - graphicElement.setAttribute( "scaling", "non-uniform" ); - } - - graphicElement.setAttribute( "vertical-align", "text-bottom" ); - - if ( picture.getDyaCropTop() != 0 || picture.getDxaCropRight() != 0 - || picture.getDyaCropBottom() != 0 - || picture.getDxaCropLeft() != 0 ) - { - int rectTop = picture.getDyaCropTop() / TWIPS_PER_PT; - int rectRight = 
picture.getDxaCropRight() / TWIPS_PER_PT; - int rectBottom = picture.getDyaCropBottom() / TWIPS_PER_PT; - int rectLeft = picture.getDxaCropLeft() / TWIPS_PER_PT; - graphicElement.setAttribute( "clip", "rect(" + rectTop + "pt, " - + rectRight + "pt, " + rectBottom + "pt, " + rectLeft - + "pt)" ); - graphicElement.setAttribute( "oveerflow", "hidden" ); - } - } - -} diff --git a/src/scratchpad/testcases/org/apache/poi/hwpf/converter/TestWordToConverterSuite.java b/src/scratchpad/testcases/org/apache/poi/hwpf/converter/TestWordToConverterSuite.java new file mode 100644 index 0000000000..570c8d155d --- /dev/null +++ b/src/scratchpad/testcases/org/apache/poi/hwpf/converter/TestWordToConverterSuite.java @@ -0,0 +1,114 @@ +/* ==================================================================== + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+==================================================================== */ +package org.apache.poi.hwpf.converter; + +import java.io.File; +import java.io.FilenameFilter; +import java.io.StringWriter; +import java.util.Arrays; +import java.util.List; + +import javax.xml.parsers.DocumentBuilderFactory; +import javax.xml.transform.OutputKeys; +import javax.xml.transform.Transformer; +import javax.xml.transform.TransformerFactory; +import javax.xml.transform.dom.DOMSource; +import javax.xml.transform.stream.StreamResult; + +import junit.framework.Test; +import junit.framework.TestCase; +import junit.framework.TestSuite; +import org.apache.poi.POIDataSamples; +import org.apache.poi.hwpf.HWPFDocumentCore; + +public class TestWordToConverterSuite +{ + /** + * YK: a quick hack to exclude failing documents from the suite. + */ + private static List failingFiles = Arrays.asList(); + + public static Test suite() + { + TestSuite suite = new TestSuite(); + + File directory = POIDataSamples.getDocumentInstance().getFile( + "../document" ); + for ( final File child : directory.listFiles( new FilenameFilter() + { + public boolean accept( File dir, String name ) + { + return name.endsWith( ".doc" ) && !failingFiles.contains( name ); + } + } ) ) + { + final String name = child.getName(); + + suite.addTest( new TestCase( name + " [FO]" ) + { + public void runTest() throws Exception + { + test( child, false ); + } + } ); + suite.addTest( new TestCase( name + " [HTML]" ) + { + public void runTest() throws Exception + { + test( child, true ); + } + } ); + + } + + return suite; + } + + protected static void test( File child, boolean html ) throws Exception + { + HWPFDocumentCore hwpfDocument; + try + { + hwpfDocument = AbstractWordUtils.loadDoc( child ); + } + catch ( Exception exc ) + { + // unable to parse file -- not WordToFoConverter fault + return; + } + + WordToFoConverter wordToFoConverter = new WordToFoConverter( + DocumentBuilderFactory.newInstance().newDocumentBuilder() + 
.newDocument() ); + wordToFoConverter.processDocument( hwpfDocument ); + + StringWriter stringWriter = new StringWriter(); + + Transformer transformer = TransformerFactory.newInstance() + .newTransformer(); + transformer.setOutputProperty( OutputKeys.ENCODING, "utf-8" ); + transformer.setOutputProperty( OutputKeys.INDENT, "yes" ); + transformer.transform( + new DOMSource( wordToFoConverter.getDocument() ), + new StreamResult( stringWriter ) ); + + if ( html ) + transformer.setOutputProperty( OutputKeys.METHOD, "html" ); + + // no exceptions + } +} diff --git a/src/scratchpad/testcases/org/apache/poi/hwpf/converter/TestWordToFoConverter.java b/src/scratchpad/testcases/org/apache/poi/hwpf/converter/TestWordToFoConverter.java new file mode 100644 index 0000000000..33321987f1 --- /dev/null +++ b/src/scratchpad/testcases/org/apache/poi/hwpf/converter/TestWordToFoConverter.java @@ -0,0 +1,95 @@ +/* + * ==================================================================== + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * ==================================================================== + */ +package org.apache.poi.hwpf.converter; + +import java.io.StringWriter; + +import javax.xml.parsers.DocumentBuilderFactory; +import javax.xml.transform.OutputKeys; +import javax.xml.transform.Transformer; +import javax.xml.transform.TransformerFactory; +import javax.xml.transform.dom.DOMSource; +import javax.xml.transform.stream.StreamResult; + +import junit.framework.TestCase; +import org.apache.poi.POIDataSamples; +import org.apache.poi.hwpf.HWPFDocument; + +/** + * Test cases for {@link WordToFoConverter} + * + * @author Sergey Vladimirov (vlsergey {at} gmail {dot} com) + */ +public class TestWordToFoConverter extends TestCase +{ + private static String getFoText( final String sampleFileName ) + throws Exception + { + HWPFDocument hwpfDocument = new HWPFDocument( POIDataSamples + .getDocumentInstance().openResourceAsStream( sampleFileName ) ); + + WordToFoConverter wordToFoConverter = new WordToFoConverter( + DocumentBuilderFactory.newInstance().newDocumentBuilder() + .newDocument() ); + wordToFoConverter.processDocument( hwpfDocument ); + + StringWriter stringWriter = new StringWriter(); + + Transformer transformer = TransformerFactory.newInstance() + .newTransformer(); + transformer.setOutputProperty( OutputKeys.INDENT, "yes" ); + transformer.transform( + new DOMSource( wordToFoConverter.getDocument() ), + new StreamResult( stringWriter ) ); + + String result = stringWriter.toString(); + return result; + } + + public void testEquation() throws Exception + { + final String sampleFileName = "equation.doc"; + String result = getFoText( sampleFileName ); + + assertTrue( result + .contains( "" ) ); + } + + public void testHyperlink() throws Exception + { + final String sampleFileName = "hyperlink.doc"; + String result = getFoText( sampleFileName ); + + assertTrue( result + .contains( "" ) ); + assertTrue( result.contains( "Hyperlink text" ) ); + } + + public void testPageref() throws 
Exception + { + final String sampleFileName = "pageref.doc"; + String result = getFoText( sampleFileName ); + + System.out.println( result ); + + assertTrue( result + .contains( "" ) ); + assertTrue( result.contains( "1" ) ); + } +} diff --git a/src/scratchpad/testcases/org/apache/poi/hwpf/converter/TestWordToHtmlConverter.java b/src/scratchpad/testcases/org/apache/poi/hwpf/converter/TestWordToHtmlConverter.java new file mode 100644 index 0000000000..890bce6e6c --- /dev/null +++ b/src/scratchpad/testcases/org/apache/poi/hwpf/converter/TestWordToHtmlConverter.java @@ -0,0 +1,95 @@ +/* ==================================================================== + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+==================================================================== */ +package org.apache.poi.hwpf.converter; + +import java.io.StringWriter; + +import javax.xml.parsers.DocumentBuilderFactory; +import javax.xml.transform.OutputKeys; +import javax.xml.transform.Transformer; +import javax.xml.transform.TransformerFactory; +import javax.xml.transform.dom.DOMSource; +import javax.xml.transform.stream.StreamResult; + +import junit.framework.TestCase; +import org.apache.poi.POIDataSamples; +import org.apache.poi.hwpf.HWPFDocument; + +/** + * Test cases for {@link WordToFoConverter} + * + * @author Sergey Vladimirov (vlsergey {at} gmail {dot} com) + */ +public class TestWordToHtmlConverter extends TestCase +{ + private static String getHtmlText( final String sampleFileName ) + throws Exception + { + HWPFDocument hwpfDocument = new HWPFDocument( POIDataSamples + .getDocumentInstance().openResourceAsStream( sampleFileName ) ); + + WordToHtmlConverter wordToHtmlConverter = new WordToHtmlConverter( + DocumentBuilderFactory.newInstance().newDocumentBuilder() + .newDocument() ); + wordToHtmlConverter.processDocument( hwpfDocument ); + + StringWriter stringWriter = new StringWriter(); + + Transformer transformer = TransformerFactory.newInstance() + .newTransformer(); + transformer.setOutputProperty( OutputKeys.INDENT, "yes" ); + transformer.setOutputProperty( OutputKeys.ENCODING, "utf-8" ); + transformer.setOutputProperty( OutputKeys.METHOD, "html" ); + transformer.transform( + new DOMSource( wordToHtmlConverter.getDocument() ), + new StreamResult( stringWriter ) ); + + String result = stringWriter.toString(); + return result; + } + + public void testBug46610_2() throws Exception + { + String result = getHtmlText( "Bug46610_2.doc" ); + assertTrue( result + .contains( "012345678911234567892123456789312345678941234567890123456789112345678921234567893123456789412345678" ) ); + } + + public void testEquation() throws Exception + { + String result = getHtmlText( "equation.doc" ); 
+ + assertTrue( result + .contains( "" ) ); + } + + public void testHyperlink() throws Exception + { + String result = getHtmlText( "hyperlink.doc" ); + + assertTrue( result.contains( "" ) ); + assertTrue( result.contains( "Hyperlink text" ) ); + } + + public void testPageref() throws Exception + { + String result = getHtmlText( "pageref.doc" ); + + assertTrue( result.contains( "" ) ); + assertTrue( result.contains( "1" ) ); + } +} diff --git a/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordToExtractorSuite.java b/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordToExtractorSuite.java deleted file mode 100644 index 62cfb999bc..0000000000 --- a/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordToExtractorSuite.java +++ /dev/null @@ -1,114 +0,0 @@ -/* ==================================================================== - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-==================================================================== */ -package org.apache.poi.hwpf.extractor; - -import java.io.File; -import java.io.FilenameFilter; -import java.io.StringWriter; -import java.util.Arrays; -import java.util.List; - -import javax.xml.parsers.DocumentBuilderFactory; -import javax.xml.transform.OutputKeys; -import javax.xml.transform.Transformer; -import javax.xml.transform.TransformerFactory; -import javax.xml.transform.dom.DOMSource; -import javax.xml.transform.stream.StreamResult; - -import junit.framework.Test; -import junit.framework.TestCase; -import junit.framework.TestSuite; -import org.apache.poi.POIDataSamples; -import org.apache.poi.hwpf.HWPFDocumentCore; - -public class TestWordToExtractorSuite -{ - /** - * YK: a quick hack to exclude failing documents from the suite. - */ - private static List failingFiles = Arrays.asList(); - - public static Test suite() - { - TestSuite suite = new TestSuite(); - - File directory = POIDataSamples.getDocumentInstance().getFile( - "../document" ); - for ( final File child : directory.listFiles( new FilenameFilter() - { - public boolean accept( File dir, String name ) - { - return name.endsWith( ".doc" ) && !failingFiles.contains( name ); - } - } ) ) - { - final String name = child.getName(); - - suite.addTest( new TestCase( name + " [FO]" ) - { - public void runTest() throws Exception - { - test( child, false ); - } - } ); - suite.addTest( new TestCase( name + " [HTML]" ) - { - public void runTest() throws Exception - { - test( child, true ); - } - } ); - - } - - return suite; - } - - protected static void test( File child, boolean html ) throws Exception - { - HWPFDocumentCore hwpfDocument; - try - { - hwpfDocument = AbstractWordUtils.loadDoc( child ); - } - catch ( Exception exc ) - { - // unable to parse file -- not WordToFoExtractor fault - return; - } - - WordToFoExtractor wordToFoExtractor = new WordToFoExtractor( - DocumentBuilderFactory.newInstance().newDocumentBuilder() - 
.newDocument() ); - wordToFoExtractor.processDocument( hwpfDocument ); - - StringWriter stringWriter = new StringWriter(); - - Transformer transformer = TransformerFactory.newInstance() - .newTransformer(); - transformer.setOutputProperty( OutputKeys.ENCODING, "utf-8" ); - transformer.setOutputProperty( OutputKeys.INDENT, "yes" ); - transformer.transform( - new DOMSource( wordToFoExtractor.getDocument() ), - new StreamResult( stringWriter ) ); - - if ( html ) - transformer.setOutputProperty( OutputKeys.METHOD, "html" ); - - // no exceptions - } -} diff --git a/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordToFoExtractor.java b/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordToFoExtractor.java deleted file mode 100644 index 8bcd5bb21c..0000000000 --- a/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordToFoExtractor.java +++ /dev/null @@ -1,95 +0,0 @@ -/* - * ==================================================================== - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- * ==================================================================== - */ -package org.apache.poi.hwpf.extractor; - -import java.io.StringWriter; - -import javax.xml.parsers.DocumentBuilderFactory; -import javax.xml.transform.OutputKeys; -import javax.xml.transform.Transformer; -import javax.xml.transform.TransformerFactory; -import javax.xml.transform.dom.DOMSource; -import javax.xml.transform.stream.StreamResult; - -import junit.framework.TestCase; -import org.apache.poi.POIDataSamples; -import org.apache.poi.hwpf.HWPFDocument; - -/** - * Test cases for {@link WordToFoExtractor} - * - * @author Sergey Vladimirov (vlsergey {at} gmail {dot} com) - */ -public class TestWordToFoExtractor extends TestCase -{ - private static String getFoText( final String sampleFileName ) - throws Exception - { - HWPFDocument hwpfDocument = new HWPFDocument( POIDataSamples - .getDocumentInstance().openResourceAsStream( sampleFileName ) ); - - WordToFoExtractor wordToFoExtractor = new WordToFoExtractor( - DocumentBuilderFactory.newInstance().newDocumentBuilder() - .newDocument() ); - wordToFoExtractor.processDocument( hwpfDocument ); - - StringWriter stringWriter = new StringWriter(); - - Transformer transformer = TransformerFactory.newInstance() - .newTransformer(); - transformer.setOutputProperty( OutputKeys.INDENT, "yes" ); - transformer.transform( - new DOMSource( wordToFoExtractor.getDocument() ), - new StreamResult( stringWriter ) ); - - String result = stringWriter.toString(); - return result; - } - - public void testHyperlink() throws Exception - { - final String sampleFileName = "hyperlink.doc"; - String result = getFoText( sampleFileName ); - - assertTrue( result - .contains( "" ) ); - assertTrue( result.contains( "Hyperlink text" ) ); - } - - public void testEquation() throws Exception - { - final String sampleFileName = "equation.doc"; - String result = getFoText( sampleFileName ); - - assertTrue( result - .contains( "" ) ); - } - - public void testPageref() throws 
Exception - { - final String sampleFileName = "pageref.doc"; - String result = getFoText( sampleFileName ); - - System.out.println( result ); - - assertTrue( result - .contains( "" ) ); - assertTrue( result.contains( "1" ) ); - } -} diff --git a/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordToHtmlExtractor.java b/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordToHtmlExtractor.java deleted file mode 100644 index f758e6fe24..0000000000 --- a/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordToHtmlExtractor.java +++ /dev/null @@ -1,95 +0,0 @@ -/* ==================================================================== - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-==================================================================== */ -package org.apache.poi.hwpf.extractor; - -import java.io.StringWriter; - -import javax.xml.parsers.DocumentBuilderFactory; -import javax.xml.transform.OutputKeys; -import javax.xml.transform.Transformer; -import javax.xml.transform.TransformerFactory; -import javax.xml.transform.dom.DOMSource; -import javax.xml.transform.stream.StreamResult; - -import junit.framework.TestCase; -import org.apache.poi.POIDataSamples; -import org.apache.poi.hwpf.HWPFDocument; - -/** - * Test cases for {@link WordToFoExtractor} - * - * @author Sergey Vladimirov (vlsergey {at} gmail {dot} com) - */ -public class TestWordToHtmlExtractor extends TestCase -{ - private static String getHtmlText( final String sampleFileName ) - throws Exception - { - HWPFDocument hwpfDocument = new HWPFDocument( POIDataSamples - .getDocumentInstance().openResourceAsStream( sampleFileName ) ); - - WordToHtmlExtractor wordToHtmlExtractor = new WordToHtmlExtractor( - DocumentBuilderFactory.newInstance().newDocumentBuilder() - .newDocument() ); - wordToHtmlExtractor.processDocument( hwpfDocument ); - - StringWriter stringWriter = new StringWriter(); - - Transformer transformer = TransformerFactory.newInstance() - .newTransformer(); - transformer.setOutputProperty( OutputKeys.INDENT, "yes" ); - transformer.setOutputProperty( OutputKeys.ENCODING, "utf-8" ); - transformer.setOutputProperty( OutputKeys.METHOD, "html" ); - transformer.transform( - new DOMSource( wordToHtmlExtractor.getDocument() ), - new StreamResult( stringWriter ) ); - - String result = stringWriter.toString(); - return result; - } - - public void testBug46610_2() throws Exception - { - String result = getHtmlText( "Bug46610_2.doc" ); - assertTrue( result - .contains( "012345678911234567892123456789312345678941234567890123456789112345678921234567893123456789412345678" ) ); - } - - public void testEquation() throws Exception - { - String result = getHtmlText( "equation.doc" ); 
- - assertTrue( result - .contains( "" ) ); - } - - public void testHyperlink() throws Exception - { - String result = getHtmlText( "hyperlink.doc" ); - - assertTrue( result.contains( "" ) ); - assertTrue( result.contains( "Hyperlink text" ) ); - } - - public void testPageref() throws Exception - { - String result = getHtmlText( "pageref.doc" ); - - assertTrue( result.contains( "" ) ); - assertTrue( result.contains( "1" ) ); - } -}