From: Sergey Vladimirov Date: Mon, 4 Jul 2011 19:08:06 +0000 (+0000) Subject: add Word-to-HTML extractor X-Git-Tag: REL_3_8_BETA4~339 X-Git-Url: https://source.dussan.org/?a=commitdiff_plain;h=1bee2b0cdb673256fb62271b3eee3eb909742052;p=poi.git add Word-to-HTML extractor git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1142765 13f79535-47bb-0310-9956-ffa450edef68 --- diff --git a/src/scratchpad/src/org/apache/poi/hwpf/extractor/AbstractToFoExtractor.java b/src/scratchpad/src/org/apache/poi/hwpf/extractor/AbstractToFoExtractor.java deleted file mode 100644 index 19608d80a5..0000000000 --- a/src/scratchpad/src/org/apache/poi/hwpf/extractor/AbstractToFoExtractor.java +++ /dev/null @@ -1,204 +0,0 @@ -/* - * ==================================================================== - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- * ==================================================================== - */ -package org.apache.poi.hwpf.extractor; - -import org.w3c.dom.Document; -import org.w3c.dom.Element; -import org.w3c.dom.Text; - -public abstract class AbstractToFoExtractor -{ - - private static final String NS_XSLFO = "http://www.w3.org/1999/XSL/Format"; - - protected final Document document; - protected final Element layoutMasterSet; - protected final Element root; - - public AbstractToFoExtractor( Document document ) - { - this.document = document; - - root = document.createElementNS( NS_XSLFO, "fo:root" ); - document.appendChild( root ); - - layoutMasterSet = document.createElementNS( NS_XSLFO, - "fo:layout-master-set" ); - root.appendChild( layoutMasterSet ); - } - - protected Element addFlowToPageSequence( final Element pageSequence, - String flowName ) - { - final Element flow = document.createElementNS( NS_XSLFO, "fo:flow" ); - flow.setAttribute( "flow-name", flowName ); - pageSequence.appendChild( flow ); - - return flow; - } - - protected Element addListItem( Element listBlock ) - { - Element result = createListItem(); - listBlock.appendChild( result ); - return result; - } - - protected Element addListItemBody( Element listItem ) - { - Element result = createListItemBody(); - listItem.appendChild( result ); - return result; - } - - protected Element addListItemLabel( Element listItem, String text ) - { - Element result = createListItemLabel( text ); - listItem.appendChild( result ); - return result; - } - - protected Element addPageSequence( String pageMaster ) - { - final Element pageSequence = document.createElementNS( NS_XSLFO, - "fo:page-sequence" ); - pageSequence.setAttribute( "master-reference", pageMaster ); - root.appendChild( pageSequence ); - return pageSequence; - } - - protected Element addRegionBody( Element pageMaster ) - { - final Element regionBody = document.createElementNS( NS_XSLFO, - "fo:region-body" ); - pageMaster.appendChild( regionBody ); - - return 
regionBody; - } - - protected Element addSimplePageMaster( String masterName ) - { - final Element simplePageMaster = document.createElementNS( NS_XSLFO, - "fo:simple-page-master" ); - simplePageMaster.setAttribute( "master-name", masterName ); - layoutMasterSet.appendChild( simplePageMaster ); - - return simplePageMaster; - } - - protected Element createBasicLinkExternal( String externalDestination ) - { - final Element basicLink = document.createElementNS( NS_XSLFO, - "fo:basic-link" ); - basicLink.setAttribute( "external-destination", externalDestination ); - return basicLink; - } - - protected Element createBasicLinkInternal( String internalDestination ) - { - final Element basicLink = document.createElementNS( NS_XSLFO, - "fo:basic-link" ); - basicLink.setAttribute( "internal-destination", internalDestination ); - return basicLink; - } - - protected Element createBlock() - { - return document.createElementNS( NS_XSLFO, "fo:block" ); - } - - protected Element createExternalGraphic( String source ) - { - Element result = document.createElementNS( NS_XSLFO, - "fo:external-graphic" ); - result.setAttribute( "src", "url('" + source + "')" ); - return result; - } - - protected Element createInline() - { - return document.createElementNS( NS_XSLFO, "fo:inline" ); - } - - protected Element createLeader() - { - return document.createElementNS( NS_XSLFO, "fo:leader" ); - } - - protected Element createListBlock() - { - return document.createElementNS( NS_XSLFO, "fo:list-block" ); - } - - protected Element createListItem() - { - return document.createElementNS( NS_XSLFO, "fo:list-item" ); - } - - protected Element createListItemBody() - { - return document.createElementNS( NS_XSLFO, "fo:list-item-body" ); - } - - protected Element createListItemLabel( String text ) - { - Element result = document.createElementNS( NS_XSLFO, - "fo:list-item-label" ); - Element block = createBlock(); - block.appendChild( document.createTextNode( text ) ); - result.appendChild( block ); - 
return result; - } - - protected Element createTable() - { - return document.createElementNS( NS_XSLFO, "fo:table" ); - } - - protected Element createTableBody() - { - return document.createElementNS( NS_XSLFO, "fo:table-body" ); - } - - protected Element createTableCell() - { - return document.createElementNS( NS_XSLFO, "fo:table-cell" ); - } - - protected Element createTableHeader() - { - return document.createElementNS( NS_XSLFO, "fo:table-header" ); - } - - protected Element createTableRow() - { - return document.createElementNS( NS_XSLFO, "fo:table-row" ); - } - - protected Text createText( String data ) - { - return document.createTextNode( data ); - } - - public Document getDocument() - { - return document; - } - -} diff --git a/src/scratchpad/src/org/apache/poi/hwpf/extractor/AbstractWordExtractor.java b/src/scratchpad/src/org/apache/poi/hwpf/extractor/AbstractWordExtractor.java new file mode 100644 index 0000000000..f13d9a1f6c --- /dev/null +++ b/src/scratchpad/src/org/apache/poi/hwpf/extractor/AbstractWordExtractor.java @@ -0,0 +1,365 @@ +/* ==================================================================== + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+==================================================================== */ +package org.apache.poi.hwpf.extractor; + +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.apache.poi.hwpf.HWPFDocument; +import org.apache.poi.hwpf.HWPFDocumentCore; +import org.apache.poi.hwpf.model.ListFormatOverride; +import org.apache.poi.hwpf.model.ListTables; +import org.apache.poi.hwpf.usermodel.CharacterRun; +import org.apache.poi.hwpf.usermodel.Paragraph; +import org.apache.poi.hwpf.usermodel.Picture; +import org.apache.poi.hwpf.usermodel.Range; +import org.apache.poi.hwpf.usermodel.Section; +import org.apache.poi.hwpf.usermodel.Table; +import org.apache.poi.hwpf.usermodel.TableIterator; +import org.apache.poi.util.POILogFactory; +import org.apache.poi.util.POILogger; +import org.w3c.dom.Document; +import org.w3c.dom.Element; + +public abstract class AbstractWordExtractor +{ + private static final byte BEL_MARK = 7; + + private static final byte FIELD_BEGIN_MARK = 19; + + private static final byte FIELD_END_MARK = 21; + + private static final byte FIELD_SEPARATOR_MARK = 20; + + private static final POILogger logger = POILogFactory + .getLogger( AbstractWordExtractor.class ); + + public abstract Document getDocument(); + + protected abstract void outputCharacters( Element block, + CharacterRun characterRun, String text ); + + protected boolean processCharacters( HWPFDocumentCore hwpfDocument, + int currentTableLevel, Paragraph paragraph, final Element block, + List characterRuns, final int start, final int end ) + { + boolean haveAnyText = false; + + for ( int c = start; c < end; c++ ) + { + CharacterRun characterRun = characterRuns.get( c ); + + if ( characterRun == null ) + throw new AssertionError(); + + if ( hwpfDocument instanceof HWPFDocument + && ( (HWPFDocument) hwpfDocument ).getPicturesTable() + .hasPicture( characterRun ) ) + { + HWPFDocument newFormat = (HWPFDocument) 
hwpfDocument; + Picture picture = newFormat.getPicturesTable().extractPicture( + characterRun, true ); + + processImage( block, characterRun.text().charAt( 0 ) == 0x01, + picture ); + continue; + } + + String text = characterRun.text(); + if ( text.getBytes().length == 0 ) + continue; + + if ( text.getBytes()[0] == FIELD_BEGIN_MARK ) + { + int skipTo = tryField( hwpfDocument, paragraph, + currentTableLevel, characterRuns, c, block ); + + if ( skipTo != c ) + { + c = skipTo; + continue; + } + + continue; + } + if ( text.getBytes()[0] == FIELD_SEPARATOR_MARK ) + { + // shall not appear without FIELD_BEGIN_MARK + continue; + } + if ( text.getBytes()[0] == FIELD_END_MARK ) + { + // shall not appear without FIELD_BEGIN_MARK + continue; + } + + if ( characterRun.isSpecialCharacter() || characterRun.isObj() + || characterRun.isOle2() ) + { + continue; + } + + if ( text.endsWith( "\r" ) + || ( text.charAt( text.length() - 1 ) == BEL_MARK && currentTableLevel != 0 ) ) + text = text.substring( 0, text.length() - 1 ); + + outputCharacters( block, characterRun, text ); + + haveAnyText |= text.trim().length() != 0; + } + + return haveAnyText; + } + + public void processDocument( HWPFDocumentCore wordDocument ) + { + final Range range = wordDocument.getRange(); + for ( int s = 0; s < range.numSections(); s++ ) + { + processSection( wordDocument, range.getSection( s ), s ); + } + } + + protected void processField( HWPFDocumentCore wordDocument, + Element currentBlock, Paragraph paragraph, int currentTableLevel, + List characterRuns, int beginMark, int separatorMark, + int endMark ) + { + + Pattern hyperlinkPattern = Pattern + .compile( "[ \\t\\r\\n]*HYPERLINK \"(.*)\"[ \\t\\r\\n]*" ); + Pattern pagerefPattern = Pattern + .compile( "[ \\t\\r\\n]*PAGEREF ([^ ]*)[ \\t\\r\\n]*\\\\h[ \\t\\r\\n]*" ); + + if ( separatorMark - beginMark > 1 ) + { + int index = beginMark + 1; + CharacterRun firstAfterBegin = null; + while ( index < separatorMark ) + { + firstAfterBegin = 
paragraph.getCharacterRun( index ); + if ( firstAfterBegin == null ) + { + logger.log( POILogger.WARN, + "Paragraph " + paragraph.getStartOffset() + "--" + + paragraph.getEndOffset() + + " contains null CharacterRun #" + index ); + index++; + continue; + } + break; + } + + if ( firstAfterBegin != null ) + { + final Matcher hyperlinkMatcher = hyperlinkPattern + .matcher( firstAfterBegin.text() ); + if ( hyperlinkMatcher.matches() ) + { + String hyperlink = hyperlinkMatcher.group( 1 ); + processHyperlink( wordDocument, currentBlock, paragraph, + characterRuns, currentTableLevel, hyperlink, + separatorMark + 1, endMark ); + return; + } + + final Matcher pagerefMatcher = pagerefPattern + .matcher( firstAfterBegin.text() ); + if ( pagerefMatcher.matches() ) + { + String pageref = pagerefMatcher.group( 1 ); + processPageref( wordDocument, currentBlock, paragraph, + characterRuns, currentTableLevel, pageref, + separatorMark + 1, endMark ); + return; + } + } + } + + StringBuilder debug = new StringBuilder( "Unsupported field type: \n" ); + for ( int i = beginMark; i <= endMark; i++ ) + { + debug.append( "\t" ); + debug.append( paragraph.getCharacterRun( i ) ); + debug.append( "\n" ); + } + logger.log( POILogger.WARN, debug ); + + // just output field value + if ( separatorMark + 1 < endMark ) + processCharacters( wordDocument, currentTableLevel, paragraph, + currentBlock, characterRuns, separatorMark + 1, endMark ); + + return; + } + + protected abstract void processHyperlink( HWPFDocumentCore wordDocument, + Element currentBlock, Paragraph paragraph, + List characterRuns, int currentTableLevel, + String hyperlink, int i, int endMark ); + + protected abstract void processImage( Element currentBlock, + boolean inlined, Picture picture ); + + protected abstract void processPageref( HWPFDocumentCore wordDocument, + Element currentBlock, Paragraph paragraph, + List characterRuns, int currentTableLevel, + String pageref, int beginTextInclusive, int endTextExclusive ); + + 
protected abstract void processParagraph( HWPFDocumentCore wordDocument, + Element parentFopElement, int currentTableLevel, + Paragraph paragraph, String bulletText ); + + protected abstract void processSection( HWPFDocumentCore wordDocument, + Section section, int s ); + + protected void processSectionParagraphes( HWPFDocumentCore wordDocument, + Element flow, Range range, int currentTableLevel ) + { + final Map allTables = new HashMap(); + for ( TableIterator tableIterator = AbstractWordUtils.newTableIterator( + range, currentTableLevel + 1 ); tableIterator.hasNext(); ) + { + Table next = tableIterator.next(); + allTables.put( Integer.valueOf( next.getStartOffset() ), next ); + } + + final ListTables listTables = wordDocument.getListTables(); + int currentListInfo = 0; + + final int paragraphs = range.numParagraphs(); + for ( int p = 0; p < paragraphs; p++ ) + { + Paragraph paragraph = range.getParagraph( p ); + + if ( allTables.containsKey( Integer.valueOf( paragraph + .getStartOffset() ) ) ) + { + Table table = allTables.get( Integer.valueOf( paragraph + .getStartOffset() ) ); + processTable( wordDocument, flow, table, currentTableLevel + 1 ); + continue; + } + + if ( paragraph.isInTable() + && paragraph.getTableLevel() != currentTableLevel ) + { + continue; + } + + if ( paragraph.getIlfo() != currentListInfo ) + { + currentListInfo = paragraph.getIlfo(); + } + + if ( currentListInfo != 0 ) + { + if ( listTables != null ) + { + final ListFormatOverride listFormatOverride = listTables + .getOverride( paragraph.getIlfo() ); + + String label = AbstractWordUtils.getBulletText( listTables, + paragraph, listFormatOverride.getLsid() ); + + processParagraph( wordDocument, flow, currentTableLevel, + paragraph, label ); + } + else + { + logger.log( POILogger.WARN, + "Paragraph #" + paragraph.getStartOffset() + "-" + + paragraph.getEndOffset() + + " has reference to list structure #" + + currentListInfo + + ", but listTables not defined in file" ); + + processParagraph( 
wordDocument, flow, currentTableLevel, + paragraph, AbstractWordUtils.EMPTY ); + } + } + else + { + processParagraph( wordDocument, flow, currentTableLevel, + paragraph, AbstractWordUtils.EMPTY ); + } + } + + } + + protected void processSingleSection( HWPFDocumentCore wordDocument, + Section section ) + { + processSection( wordDocument, section, 0 ); + } + + protected abstract void processTable( HWPFDocumentCore wordDocument, + Element flow, Table table, int newTableLevel ); + + protected int tryField( HWPFDocumentCore wordDocument, Paragraph paragraph, + int currentTableLevel, List characterRuns, + int beginMark, Element currentBlock ) + { + int separatorMark = -1; + int endMark = -1; + for ( int c = beginMark + 1; c < paragraph.numCharacterRuns(); c++ ) + { + CharacterRun characterRun = paragraph.getCharacterRun( c ); + + String text = characterRun.text(); + if ( text.getBytes().length == 0 ) + continue; + + if ( text.getBytes()[0] == FIELD_SEPARATOR_MARK ) + { + if ( separatorMark != -1 ) + { + // double; + return beginMark; + } + + separatorMark = c; + continue; + } + + if ( text.getBytes()[0] == FIELD_END_MARK ) + { + if ( endMark != -1 ) + { + // double; + return beginMark; + } + + endMark = c; + break; + } + + } + + if ( separatorMark == -1 || endMark == -1 ) + return beginMark; + + processField( wordDocument, currentBlock, paragraph, currentTableLevel, + characterRuns, beginMark, separatorMark, endMark ); + + return endMark; + } + +} diff --git a/src/scratchpad/src/org/apache/poi/hwpf/extractor/AbstractWordUtils.java b/src/scratchpad/src/org/apache/poi/hwpf/extractor/AbstractWordUtils.java new file mode 100644 index 0000000000..89849c15eb --- /dev/null +++ b/src/scratchpad/src/org/apache/poi/hwpf/extractor/AbstractWordUtils.java @@ -0,0 +1,404 @@ +/* ==================================================================== + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. 
See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +==================================================================== */ +package org.apache.poi.hwpf.extractor; + +import java.io.Closeable; +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.lang.reflect.Constructor; +import java.lang.reflect.Field; +import java.lang.reflect.Method; +import java.util.ArrayList; +import java.util.List; + +import org.apache.poi.hwpf.HWPFDocument; +import org.apache.poi.hwpf.HWPFDocumentCore; +import org.apache.poi.hwpf.HWPFOldDocument; +import org.apache.poi.hwpf.OldWordFileFormatException; +import org.apache.poi.hwpf.model.CHPX; +import org.apache.poi.hwpf.model.ListLevel; +import org.apache.poi.hwpf.model.ListTables; +import org.apache.poi.hwpf.usermodel.BorderCode; +import org.apache.poi.hwpf.usermodel.CharacterRun; +import org.apache.poi.hwpf.usermodel.Paragraph; +import org.apache.poi.hwpf.usermodel.Range; +import org.apache.poi.hwpf.usermodel.Section; +import org.apache.poi.hwpf.usermodel.SectionProperties; +import org.apache.poi.hwpf.usermodel.TableIterator; +import org.apache.poi.poifs.filesystem.POIFSFileSystem; +import org.apache.poi.util.POILogFactory; +import org.apache.poi.util.POILogger; + +public class AbstractWordUtils +{ + static final String EMPTY = ""; + + private static final 
POILogger logger = POILogFactory + .getLogger( AbstractWordUtils.class ); + + public static final float TWIPS_PER_INCH = 1440.0f; + public static final int TWIPS_PER_PT = 20; + + static void closeQuietly( final Closeable closeable ) + { + try + { + closeable.close(); + } + catch ( Exception exc ) + { + logger.log( POILogger.ERROR, "Unable to close resource: " + exc, + exc ); + } + } + + static boolean equals( String str1, String str2 ) + { + return str1 == null ? str2 == null : str1.equals( str2 ); + } + + // XXX incorporate into Range + static List findCharacterRuns( Range range ) + { + final int min = range.getStartOffset(); + final int max = range.getEndOffset(); + + List result = new ArrayList(); + List chpxs = getCharacters( range ); + for ( int i = 0; i < chpxs.size(); i++ ) + { + CHPX chpx = chpxs.get( i ); + if ( chpx == null ) + continue; + + if ( Math.max( min, chpx.getStart() ) <= Math.min( max, + chpx.getEnd() ) ) + { + final CharacterRun characterRun = getCharacterRun( range, chpx ); + + if ( characterRun == null ) + continue; + + result.add( characterRun ); + } + } + + return result; + } + + public static String getBorderType( BorderCode borderCode ) + { + if ( borderCode == null ) + throw new IllegalArgumentException( "borderCode is null" ); + + switch ( borderCode.getBorderType() ) + { + case 1: + case 2: + return "solid"; + case 3: + return "double"; + case 5: + return "solid"; + case 6: + return "dotted"; + case 7: + case 8: + return "dashed"; + case 9: + return "dotted"; + case 10: + case 11: + case 12: + case 13: + case 14: + case 15: + case 16: + case 17: + case 18: + case 19: + return "double"; + case 20: + return "solid"; + case 21: + return "double"; + case 22: + return "dashed"; + case 23: + return "dashed"; + case 24: + return "ridge"; + case 25: + return "grooved"; + default: + return "solid"; + } + } + + public static String getBorderWidth( BorderCode borderCode ) + { + int lineWidth = borderCode.getLineWidth(); + int pt = lineWidth / 8; 
+ int pte = lineWidth - pt * 8; + + StringBuilder stringBuilder = new StringBuilder(); + stringBuilder.append( pt ); + stringBuilder.append( "." ); + stringBuilder.append( 1000 / 8 * pte ); + stringBuilder.append( "pt" ); + return stringBuilder.toString(); + } + + public static String getBulletText( ListTables listTables, + Paragraph paragraph, int listId ) + { + final ListLevel listLevel = listTables.getLevel( listId, + paragraph.getIlvl() ); + + if ( listLevel.getNumberText() == null ) + return EMPTY; + + StringBuffer bulletBuffer = new StringBuffer(); + char[] xst = listLevel.getNumberText().toCharArray(); + for ( char element : xst ) + { + if ( element < 9 ) + { + ListLevel numLevel = listTables.getLevel( listId, element ); + + int num = numLevel.getStartAt(); + bulletBuffer.append( NumberFormatter.getNumber( num, + listLevel.getNumberFormat() ) ); + + if ( numLevel == listLevel ) + { + numLevel.setStartAt( numLevel.getStartAt() + 1 ); + } + + } + else + { + bulletBuffer.append( element ); + } + } + + byte follow = getIxchFollow( listLevel ); + switch ( follow ) + { + case 0: + bulletBuffer.append( "\t" ); + break; + case 1: + bulletBuffer.append( " " ); + break; + default: + break; + } + + return bulletBuffer.toString(); + } + + private static CharacterRun getCharacterRun( Range range, CHPX chpx ) + { + try + { + Method method = Range.class.getDeclaredMethod( "getCharacterRun", + CHPX.class ); + method.setAccessible( true ); + return (CharacterRun) method.invoke( range, chpx ); + } + catch ( Exception exc ) + { + throw new Error( exc ); + } + } + + private static List getCharacters( Range range ) + { + try + { + Field field = Range.class.getDeclaredField( "_characters" ); + field.setAccessible( true ); + return (List) field.get( range ); + } + catch ( Exception exc ) + { + throw new Error( exc ); + } + } + + public static String getColor( int ico ) + { + switch ( ico ) + { + case 1: + return "black"; + case 2: + return "blue"; + case 3: + return "cyan"; + case 
4: + return "green"; + case 5: + return "magenta"; + case 6: + return "red"; + case 7: + return "yellow"; + case 8: + return "white"; + case 9: + return "darkblue"; + case 10: + return "darkcyan"; + case 11: + return "darkgreen"; + case 12: + return "darkmagenta"; + case 13: + return "darkred"; + case 14: + return "darkyellow"; + case 15: + return "darkgray"; + case 16: + return "lightgray"; + default: + return "black"; + } + } + + public static byte getIxchFollow( ListLevel listLevel ) + { + try + { + Field field = ListLevel.class.getDeclaredField( "_ixchFollow" ); + field.setAccessible( true ); + return ( (Byte) field.get( listLevel ) ).byteValue(); + } + catch ( Exception exc ) + { + throw new Error( exc ); + } + } + + public static String getJustification( int js ) + { + switch ( js ) + { + case 0: + return "start"; + case 1: + return "center"; + case 2: + return "end"; + case 3: + case 4: + return "justify"; + case 5: + return "center"; + case 6: + return "left"; + case 7: + return "start"; + case 8: + return "end"; + case 9: + return "justify"; + } + return ""; + } + + public static String getListItemNumberLabel( int number, int format ) + { + + if ( format != 0 ) + System.err.println( "NYI: toListItemNumberLabel(): " + format ); + + return String.valueOf( number ); + } + + public static SectionProperties getSectionProperties( Section section ) + { + try + { + Field field = Section.class.getDeclaredField( "_props" ); + field.setAccessible( true ); + return (SectionProperties) field.get( section ); + } + catch ( Exception exc ) + { + throw new Error( exc ); + } + } + + static boolean isEmpty( String str ) + { + return str == null || str.length() == 0; + } + + static boolean isNotEmpty( String str ) + { + return !isEmpty( str ); + } + + public static HWPFDocumentCore loadDoc( File docFile ) throws IOException + { + final FileInputStream istream = new FileInputStream( docFile ); + try + { + return loadDoc( istream ); + } + finally + { + closeQuietly( istream ); 
+ } + } + + public static HWPFDocumentCore loadDoc( InputStream inputStream ) + throws IOException + { + final POIFSFileSystem poifsFileSystem = HWPFDocumentCore + .verifyAndBuildPOIFS( inputStream ); + try + { + return new HWPFDocument( poifsFileSystem ); + } + catch ( OldWordFileFormatException exc ) + { + return new HWPFOldDocument( poifsFileSystem ); + } + } + + public static TableIterator newTableIterator( Range range, int level ) + { + try + { + Constructor constructor = TableIterator.class + .getDeclaredConstructor( Range.class, int.class ); + constructor.setAccessible( true ); + return constructor.newInstance( range, Integer.valueOf( level ) ); + } + catch ( Exception exc ) + { + throw new Error( exc ); + } + } + +} diff --git a/src/scratchpad/src/org/apache/poi/hwpf/extractor/FoDocumentFacade.java b/src/scratchpad/src/org/apache/poi/hwpf/extractor/FoDocumentFacade.java new file mode 100644 index 0000000000..5e474bf254 --- /dev/null +++ b/src/scratchpad/src/org/apache/poi/hwpf/extractor/FoDocumentFacade.java @@ -0,0 +1,201 @@ +/* ==================================================================== + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+==================================================================== */ +package org.apache.poi.hwpf.extractor; + +import org.w3c.dom.Document; +import org.w3c.dom.Element; +import org.w3c.dom.Text; + +public class FoDocumentFacade +{ + private static final String NS_XSLFO = "http://www.w3.org/1999/XSL/Format"; + + protected final Document document; + protected final Element layoutMasterSet; + protected final Element root; + + public FoDocumentFacade( Document document ) + { + this.document = document; + + root = document.createElementNS( NS_XSLFO, "fo:root" ); + document.appendChild( root ); + + layoutMasterSet = document.createElementNS( NS_XSLFO, + "fo:layout-master-set" ); + root.appendChild( layoutMasterSet ); + } + + public Element addFlowToPageSequence( final Element pageSequence, + String flowName ) + { + final Element flow = document.createElementNS( NS_XSLFO, "fo:flow" ); + flow.setAttribute( "flow-name", flowName ); + pageSequence.appendChild( flow ); + + return flow; + } + + public Element addListItem( Element listBlock ) + { + Element result = createListItem(); + listBlock.appendChild( result ); + return result; + } + + public Element addListItemBody( Element listItem ) + { + Element result = createListItemBody(); + listItem.appendChild( result ); + return result; + } + + public Element addListItemLabel( Element listItem, String text ) + { + Element result = createListItemLabel( text ); + listItem.appendChild( result ); + return result; + } + + public Element addPageSequence( String pageMaster ) + { + final Element pageSequence = document.createElementNS( NS_XSLFO, + "fo:page-sequence" ); + pageSequence.setAttribute( "master-reference", pageMaster ); + root.appendChild( pageSequence ); + return pageSequence; + } + + public Element addRegionBody( Element pageMaster ) + { + final Element regionBody = document.createElementNS( NS_XSLFO, + "fo:region-body" ); + pageMaster.appendChild( regionBody ); + + return regionBody; + } + + public Element 
addSimplePageMaster( String masterName ) + { + final Element simplePageMaster = document.createElementNS( NS_XSLFO, + "fo:simple-page-master" ); + simplePageMaster.setAttribute( "master-name", masterName ); + layoutMasterSet.appendChild( simplePageMaster ); + + return simplePageMaster; + } + + protected Element createBasicLinkExternal( String externalDestination ) + { + final Element basicLink = document.createElementNS( NS_XSLFO, + "fo:basic-link" ); + basicLink.setAttribute( "external-destination", externalDestination ); + return basicLink; + } + + public Element createBasicLinkInternal( String internalDestination ) + { + final Element basicLink = document.createElementNS( NS_XSLFO, + "fo:basic-link" ); + basicLink.setAttribute( "internal-destination", internalDestination ); + return basicLink; + } + + public Element createBlock() + { + return document.createElementNS( NS_XSLFO, "fo:block" ); + } + + public Element createExternalGraphic( String source ) + { + Element result = document.createElementNS( NS_XSLFO, + "fo:external-graphic" ); + result.setAttribute( "src", "url('" + source + "')" ); + return result; + } + + public Element createInline() + { + return document.createElementNS( NS_XSLFO, "fo:inline" ); + } + + public Element createLeader() + { + return document.createElementNS( NS_XSLFO, "fo:leader" ); + } + + public Element createListBlock() + { + return document.createElementNS( NS_XSLFO, "fo:list-block" ); + } + + public Element createListItem() + { + return document.createElementNS( NS_XSLFO, "fo:list-item" ); + } + + public Element createListItemBody() + { + return document.createElementNS( NS_XSLFO, "fo:list-item-body" ); + } + + public Element createListItemLabel( String text ) + { + Element result = document.createElementNS( NS_XSLFO, + "fo:list-item-label" ); + Element block = createBlock(); + block.appendChild( document.createTextNode( text ) ); + result.appendChild( block ); + return result; + } + + protected Element createTable() + { + return 
document.createElementNS( NS_XSLFO, "fo:table" ); + } + + protected Element createTableBody() + { + return document.createElementNS( NS_XSLFO, "fo:table-body" ); + } + + protected Element createTableCell() + { + return document.createElementNS( NS_XSLFO, "fo:table-cell" ); + } + + protected Element createTableHeader() + { + return document.createElementNS( NS_XSLFO, "fo:table-header" ); + } + + protected Element createTableRow() + { + return document.createElementNS( NS_XSLFO, "fo:table-row" ); + } + + protected Text createText( String data ) + { + return document.createTextNode( data ); + } + + public Document getDocument() + { + return document; + } + +} diff --git a/src/scratchpad/src/org/apache/poi/hwpf/extractor/HtmlDocumentFacade.java b/src/scratchpad/src/org/apache/poi/hwpf/extractor/HtmlDocumentFacade.java new file mode 100644 index 0000000000..5e2b1f0166 --- /dev/null +++ b/src/scratchpad/src/org/apache/poi/hwpf/extractor/HtmlDocumentFacade.java @@ -0,0 +1,107 @@ +/* ==================================================================== + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+==================================================================== */ +package org.apache.poi.hwpf.extractor; + +import org.w3c.dom.Document; +import org.w3c.dom.Element; +import org.w3c.dom.Text; + +public class HtmlDocumentFacade +{ + + protected final Element body; + protected final Document document; + protected final Element head; + protected final Element html; + + public HtmlDocumentFacade( Document document ) + { + this.document = document; + + html = document.createElement( "html" ); + document.appendChild( html ); + + body = document.createElement( "body" ); + head = document.createElement( "head" ); + + html.appendChild( head ); + html.appendChild( body ); + } + + public Element createHyperlink( String internalDestination ) + { + final Element basicLink = document.createElement( "a" ); + basicLink.setAttribute( "href", internalDestination ); + return basicLink; + } + + public Element createListItem() + { + return document.createElement( "li" ); + } + + public Element createParagraph() + { + return document.createElement( "p" ); + } + + public Element createTable() + { + return document.createElement( "table" ); + } + + public Element createTableBody() + { + return document.createElement( "tbody" ); + } + + public Element createTableCell() + { + return document.createElement( "td" ); + } + + public Element createTableHeader() + { + return document.createElement( "thead" ); + } + + public Element createTableHeaderCell() + { + return document.createElement( "th" ); + } + + public Element createTableRow() + { + return document.createElement( "tr" ); + } + + public Text createText( String data ) + { + return document.createTextNode( data ); + } + + public Element createUnorderedList() + { + return document.createElement( "ul" ); + } + + public Document getDocument() + { + return document; + } + +} diff --git a/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordToFoExtractor.java b/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordToFoExtractor.java 
index 4189d7c32d..67f6bb17d1 100644 --- a/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordToFoExtractor.java +++ b/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordToFoExtractor.java @@ -1,32 +1,27 @@ -/* - * ==================================================================== - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - * ==================================================================== - */ +/* ==================================================================== + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ See the License for the specific language governing permissions and + limitations under the License. +==================================================================== */ package org.apache.poi.hwpf.extractor; import java.io.File; -import java.io.FileInputStream; import java.io.FileWriter; -import java.io.IOException; import java.util.HashMap; +import java.util.List; import java.util.Map; import java.util.Stack; -import java.util.regex.Matcher; -import java.util.regex.Pattern; import javax.xml.parsers.DocumentBuilderFactory; import javax.xml.transform.OutputKeys; @@ -36,8 +31,10 @@ import javax.xml.transform.dom.DOMSource; import javax.xml.transform.stream.StreamResult; import org.apache.poi.hwpf.HWPFDocument; +import org.apache.poi.hwpf.HWPFDocumentCore; import org.apache.poi.hwpf.model.ListFormatOverride; import org.apache.poi.hwpf.model.ListTables; +import org.apache.poi.hwpf.usermodel.BorderCode; import org.apache.poi.hwpf.usermodel.CharacterRun; import org.apache.poi.hwpf.usermodel.Paragraph; import org.apache.poi.hwpf.usermodel.Picture; @@ -54,12 +51,10 @@ import org.w3c.dom.Document; import org.w3c.dom.Element; import org.w3c.dom.Text; -import static org.apache.poi.hwpf.extractor.WordToFoUtils.TWIPS_PER_INCH; - /** * @author Sergey Vladimirov (vlsergey {at} gmail {dot} com) */ -public class WordToFoExtractor extends AbstractToFoExtractor +public class WordToFoExtractor extends AbstractWordExtractor { /** @@ -84,35 +79,55 @@ public class WordToFoExtractor extends AbstractToFoExtractor } } - private static final byte BEL_MARK = 7; - - private static final byte FIELD_BEGIN_MARK = 19; - - private static final byte FIELD_END_MARK = 21; - - private static final byte FIELD_SEPARATOR_MARK = 20; - private static final POILogger logger = POILogFactory .getLogger( WordToFoExtractor.class ); - private static HWPFDocument loadDoc( File docFile ) throws IOException + /** Maps the border type code of the given BorderCode to a border-style keyword; unlisted codes fall back to "solid". Throws IllegalArgumentException when borderCode is null. */ public static String getBorderType( BorderCode borderCode ) { - final FileInputStream istream = new
FileInputStream( docFile ); - try + if ( borderCode == null ) + throw new IllegalArgumentException( "borderCode is null" ); + + switch ( borderCode.getBorderType() ) { - return new HWPFDocument( istream ); - } - finally - { - try - { - istream.close(); - } - catch ( Exception exc ) - { - logger.log( POILogger.ERROR, - "Unable to close FileInputStream: " + exc, exc ); - } + case 1: + case 2: + return "solid"; + case 3: + return "double"; + case 5: + return "solid"; + case 6: + return "dotted"; + case 7: + case 8: + return "dashed"; + case 9: + return "dotted"; + case 10: + case 11: + case 12: + case 13: + case 14: + case 15: + case 16: + case 17: + case 18: + case 19: + return "double"; + case 20: + return "solid"; + case 21: + return "double"; + case 22: + return "dashed"; + case 23: + return "dashed"; + case 24: + return "ridge"; + case 25: + return "groove"; /* the border-style keyword is "groove"; "grooved" is not a valid value */ + default: + return "solid"; } } @@ -160,7 +175,7 @@ public class WordToFoExtractor extends AbstractToFoExtractor static Document process( File docFile ) throws Exception { - final HWPFDocument hwpfDocument = loadDoc( docFile ); + final HWPFDocumentCore hwpfDocument = WordToFoUtils.loadDoc( docFile ); WordToFoExtractor wordToFoExtractor = new WordToFoExtractor( DocumentBuilderFactory.newInstance().newDocumentBuilder() .newDocument() ); @@ -170,6 +185,8 @@ public class WordToFoExtractor extends AbstractToFoExtractor private final Stack blocksProperies = new Stack(); + protected final FoDocumentFacade foDocumentFacade; + /** * Creates new instance of {@link WordToFoExtractor}. Can be used for output * several {@link HWPFDocument}s into single FO document.
@@ -180,27 +197,28 @@ public class WordToFoExtractor extends AbstractToFoExtractor */ public WordToFoExtractor( Document document ) { - super( document ); + this.foDocumentFacade = new FoDocumentFacade( document ); } protected String createPageMaster( SectionProperties sep, String type, int section ) { - float height = sep.getYaPage() / TWIPS_PER_INCH; - float width = sep.getXaPage() / TWIPS_PER_INCH; - float leftMargin = sep.getDxaLeft() / TWIPS_PER_INCH; - float rightMargin = sep.getDxaRight() / TWIPS_PER_INCH; - float topMargin = sep.getDyaTop() / TWIPS_PER_INCH; - float bottomMargin = sep.getDyaBottom() / TWIPS_PER_INCH; + float height = sep.getYaPage() / WordToFoUtils.TWIPS_PER_INCH; + float width = sep.getXaPage() / WordToFoUtils.TWIPS_PER_INCH; + float leftMargin = sep.getDxaLeft() / WordToFoUtils.TWIPS_PER_INCH; + float rightMargin = sep.getDxaRight() / WordToFoUtils.TWIPS_PER_INCH; + float topMargin = sep.getDyaTop() / WordToFoUtils.TWIPS_PER_INCH; + float bottomMargin = sep.getDyaBottom() / WordToFoUtils.TWIPS_PER_INCH; // add these to the header String pageMasterName = type + "-page" + section; - Element pageMaster = addSimplePageMaster( pageMasterName ); + Element pageMaster = foDocumentFacade + .addSimplePageMaster( pageMasterName ); pageMaster.setAttribute( "page-height", height + "in" ); pageMaster.setAttribute( "page-width", width + "in" ); - Element regionBody = addRegionBody( pageMaster ); + Element regionBody = foDocumentFacade.addRegionBody( pageMaster ); regionBody.setAttribute( "margin", topMargin + "in " + rightMargin + "in " + bottomMargin + "in " + leftMargin + "in" ); @@ -216,12 +234,13 @@ public class WordToFoExtractor extends AbstractToFoExtractor if ( sep.getCcolM1() > 0 ) { - regionBody - .setAttribute( "column-count", "" + (sep.getCcolM1() + 1) ); + regionBody.setAttribute( "column-count", "" + + ( sep.getCcolM1() + 1 ) ); if ( sep.getFEvenlySpaced() ) { regionBody.setAttribute( "column-gap", - (sep.getDxaColumns() / TWIPS_PER_INCH) + 
"in" ); + ( sep.getDxaColumns() / WordToFoUtils.TWIPS_PER_INCH ) + + "in" ); } else { @@ -232,171 +251,55 @@ public class WordToFoExtractor extends AbstractToFoExtractor return pageMasterName; } - protected boolean processCharacters( HWPFDocument hwpfDocument, - int currentTableLevel, Paragraph paragraph, final Element block, - final int start, final int end ) + public Document getDocument() { - boolean haveAnyText = false; - - for ( int c = start; c < end; c++ ) - { - CharacterRun characterRun = paragraph.getCharacterRun( c ); - - if ( hwpfDocument.getPicturesTable().hasPicture( characterRun ) ) - { - Picture picture = hwpfDocument.getPicturesTable() - .extractPicture( characterRun, true ); - - processImage( block, characterRun.text().charAt( 0 ) == 0x01, - picture ); - continue; - } - - String text = characterRun.text(); - if ( text.getBytes().length == 0 ) - continue; - - if ( text.getBytes()[0] == FIELD_BEGIN_MARK ) - { - int skipTo = tryField( hwpfDocument, paragraph, - currentTableLevel, c, block ); - - if ( skipTo != c ) - { - c = skipTo; - continue; - } - - continue; - } - if ( text.getBytes()[0] == FIELD_SEPARATOR_MARK ) - { - // shall not appear without FIELD_BEGIN_MARK - continue; - } - if ( text.getBytes()[0] == FIELD_END_MARK ) - { - // shall not appear without FIELD_BEGIN_MARK - continue; - } - - if ( characterRun.isSpecialCharacter() || characterRun.isObj() - || characterRun.isOle2() ) - { - continue; - } - - BlockProperies blockProperies = this.blocksProperies.peek(); - Element inline = createInline(); - if ( characterRun.isBold() != blockProperies.pBold ) - { - WordToFoUtils.setBold( inline, characterRun.isBold() ); - } - if ( characterRun.isItalic() != blockProperies.pItalic ) - { - WordToFoUtils.setItalic( inline, characterRun.isItalic() ); - } - if ( !WordToFoUtils.equals( characterRun.getFontName(), - blockProperies.pFontName ) ) - { - WordToFoUtils - .setFontFamily( inline, characterRun.getFontName() ); - } - if ( characterRun.getFontSize() / 
2 != blockProperies.pFontSize ) - { - WordToFoUtils.setFontSize( inline, - characterRun.getFontSize() / 2 ); - } - WordToFoUtils.setCharactersProperties( characterRun, inline ); - block.appendChild( inline ); - - if ( text.endsWith( "\r" ) - || (text.charAt( text.length() - 1 ) == BEL_MARK && currentTableLevel != 0) ) - text = text.substring( 0, text.length() - 1 ); - - Text textNode = createText( text ); - inline.appendChild( textNode ); - - haveAnyText |= text.trim().length() != 0; - } - - return haveAnyText; + return foDocumentFacade.getDocument(); } - public void processDocument( HWPFDocument hwpfDocument ) + @Override + protected void outputCharacters( Element block, CharacterRun characterRun, + String text ) { - final Range range = hwpfDocument.getRange(); - - for ( int s = 0; s < range.numSections(); s++ ) + BlockProperies blockProperies = this.blocksProperies.peek(); + Element inline = foDocumentFacade.createInline(); + if ( characterRun.isBold() != blockProperies.pBold ) { - processSection( hwpfDocument, range.getSection( s ), s ); + WordToFoUtils.setBold( inline, characterRun.isBold() ); } - } - - protected void processField( HWPFDocument hwpfDocument, - Element currentBlock, Paragraph paragraph, int currentTableLevel, - int beginMark, int separatorMark, int endMark ) - { - - Pattern hyperlinkPattern = Pattern - .compile( "[ \\t\\r\\n]*HYPERLINK \"(.*)\"[ \\t\\r\\n]*" ); - Pattern pagerefPattern = Pattern - .compile( "[ \\t\\r\\n]*PAGEREF ([^ ]*)[ \\t\\r\\n]*\\\\h[ \\t\\r\\n]*" ); - - if ( separatorMark - beginMark > 1 ) + if ( characterRun.isItalic() != blockProperies.pItalic ) { - CharacterRun firstAfterBegin = paragraph - .getCharacterRun( beginMark + 1 ); - - final Matcher hyperlinkMatcher = hyperlinkPattern - .matcher( firstAfterBegin.text() ); - if ( hyperlinkMatcher.matches() ) - { - String hyperlink = hyperlinkMatcher.group( 1 ); - processHyperlink( hwpfDocument, currentBlock, paragraph, - currentTableLevel, hyperlink, separatorMark + 1, - endMark 
); - return; - } - - final Matcher pagerefMatcher = pagerefPattern - .matcher( firstAfterBegin.text() ); - if ( pagerefMatcher.matches() ) - { - String pageref = pagerefMatcher.group( 1 ); - processPageref( hwpfDocument, currentBlock, paragraph, - currentTableLevel, pageref, separatorMark + 1, endMark ); - return; - } + WordToFoUtils.setItalic( inline, characterRun.isItalic() ); } - - StringBuilder debug = new StringBuilder( "Unsupported field type: \n" ); - for ( int i = beginMark; i <= endMark; i++ ) + if ( characterRun.getFontName() != null + && !AbstractWordUtils.equals( characterRun.getFontName(), + blockProperies.pFontName ) ) { - debug.append( "\t" ); - debug.append( paragraph.getCharacterRun( i ) ); - debug.append( "\n" ); + WordToFoUtils.setFontFamily( inline, characterRun.getFontName() ); } - logger.log( POILogger.WARN, debug ); - - // just output field value - if ( separatorMark + 1 < endMark ) - processCharacters( hwpfDocument, currentTableLevel, paragraph, - currentBlock, separatorMark + 1, endMark ); + if ( characterRun.getFontSize() / 2 != blockProperies.pFontSize ) + { + WordToFoUtils.setFontSize( inline, characterRun.getFontSize() / 2 ); + } + WordToFoUtils.setCharactersProperties( characterRun, inline ); + block.appendChild( inline ); - return; + Text textNode = foDocumentFacade.createText( text ); + inline.appendChild( textNode ); } - protected void processHyperlink( HWPFDocument hwpfDocument, - Element currentBlock, Paragraph paragraph, int currentTableLevel, + protected void processHyperlink( HWPFDocumentCore hwpfDocument, + Element currentBlock, Paragraph paragraph, + List characterRuns, int currentTableLevel, String hyperlink, int beginTextInclusive, int endTextExclusive ) { - Element basicLink = createBasicLinkExternal( hyperlink ); + Element basicLink = foDocumentFacade + .createBasicLinkExternal( hyperlink ); currentBlock.appendChild( basicLink ); if ( beginTextInclusive < endTextExclusive ) processCharacters( hwpfDocument, 
currentTableLevel, paragraph, - basicLink, beginTextInclusive, endTextExclusive ); + basicLink, characterRuns, beginTextInclusive, + endTextExclusive ); } /** @@ -422,27 +325,30 @@ public class WordToFoExtractor extends AbstractToFoExtractor Picture picture ) { // no default implementation -- skip - currentBlock.appendChild( document.createComment( "Image link to '" - + picture.suggestFullFileName() + "' can be here" ) ); + currentBlock.appendChild( foDocumentFacade.getDocument().createComment( + "Image link to '" + picture.suggestFullFileName() + + "' can be here" ) ); } - protected void processPageref( HWPFDocument hwpfDocument, - Element currentBlock, Paragraph paragraph, int currentTableLevel, + protected void processPageref( HWPFDocumentCore hwpfDocument, + Element currentBlock, Paragraph paragraph, + List characterRuns, int currentTableLevel, String pageref, int beginTextInclusive, int endTextExclusive ) { - Element basicLink = createBasicLinkInternal( pageref ); + Element basicLink = foDocumentFacade.createBasicLinkInternal( pageref ); currentBlock.appendChild( basicLink ); if ( beginTextInclusive < endTextExclusive ) processCharacters( hwpfDocument, currentTableLevel, paragraph, - basicLink, beginTextInclusive, endTextExclusive ); + basicLink, characterRuns, beginTextInclusive, + endTextExclusive ); } - protected void processParagraph( HWPFDocument hwpfDocument, + protected void processParagraph( HWPFDocumentCore hwpfDocument, Element parentFopElement, int currentTableLevel, Paragraph paragraph, String bulletText ) { - final Element block = createBlock(); + final Element block = foDocumentFacade.createBlock(); parentFopElement.appendChild( block ); WordToFoUtils.setParagraphProperties( paragraph, block ); @@ -480,21 +386,23 @@ public class WordToFoExtractor extends AbstractToFoExtractor if ( WordToFoUtils.isNotEmpty( bulletText ) ) { - Element inline = createInline(); + Element inline = foDocumentFacade.createInline(); block.appendChild( inline ); - Text 
textNode = createText( bulletText ); + Text textNode = foDocumentFacade.createText( bulletText ); inline.appendChild( textNode ); haveAnyText |= bulletText.trim().length() != 0; } + List characterRuns = WordToFoUtils + .findCharacterRuns( paragraph ); haveAnyText = processCharacters( hwpfDocument, currentTableLevel, - paragraph, block, 0, charRuns ); + paragraph, block, characterRuns, 0, characterRuns.size() ); if ( !haveAnyText ) { - Element leader = createLeader(); + Element leader = foDocumentFacade.createLeader(); block.appendChild( leader ); } } @@ -506,20 +414,21 @@ public class WordToFoExtractor extends AbstractToFoExtractor return; } - protected void processSection( HWPFDocument hwpfDocument, Section section, - int sectionCounter ) + protected void processSection( HWPFDocumentCore wordDocument, + Section section, int sectionCounter ) { String regularPage = createPageMaster( WordToFoUtils.getSectionProperties( section ), "page", sectionCounter ); - Element pageSequence = addPageSequence( regularPage ); - Element flow = addFlowToPageSequence( pageSequence, "xsl-region-body" ); + Element pageSequence = foDocumentFacade.addPageSequence( regularPage ); + Element flow = foDocumentFacade.addFlowToPageSequence( pageSequence, + "xsl-region-body" ); - processSectionParagraphes( hwpfDocument, flow, section, 0 ); + processSectionParagraphes( wordDocument, flow, section, 0 ); } - protected void processSectionParagraphes( HWPFDocument hwpfDocument, + protected void processSectionParagraphes( HWPFDocument wordDocument, Element flow, Range range, int currentTableLevel ) { final Map allTables = new HashMap(); @@ -530,7 +439,7 @@ public class WordToFoExtractor extends AbstractToFoExtractor allTables.put( Integer.valueOf( next.getStartOffset() ), next ); } - final ListTables listTables = hwpfDocument.getListTables(); + final ListTables listTables = wordDocument.getListTables(); int currentListInfo = 0; final int paragraphs = range.numParagraphs(); @@ -543,7 +452,7 @@ public 
class WordToFoExtractor extends AbstractToFoExtractor { Table table = allTables.get( Integer.valueOf( paragraph .getStartOffset() ) ); - processTable( hwpfDocument, flow, table, currentTableLevel + 1 ); + processTable( wordDocument, flow, table, currentTableLevel + 1 ); continue; } @@ -568,7 +477,7 @@ public class WordToFoExtractor extends AbstractToFoExtractor String label = WordToFoUtils.getBulletText( listTables, paragraph, listFormatOverride.getLsid() ); - processParagraph( hwpfDocument, flow, currentTableLevel, + processParagraph( wordDocument, flow, currentTableLevel, paragraph, label ); } else @@ -580,24 +489,24 @@ public class WordToFoExtractor extends AbstractToFoExtractor + currentListInfo + ", but listTables not defined in file" ); - processParagraph( hwpfDocument, flow, currentTableLevel, + processParagraph( wordDocument, flow, currentTableLevel, paragraph, WordToFoUtils.EMPTY ); } } else { - processParagraph( hwpfDocument, flow, currentTableLevel, + processParagraph( wordDocument, flow, currentTableLevel, paragraph, WordToFoUtils.EMPTY ); } } } - protected void processTable( HWPFDocument hwpfDocument, Element flow, + protected void processTable( HWPFDocumentCore wordDocument, Element flow, Table table, int thisTableLevel ) { - Element tableHeader = createTableHeader(); - Element tableBody = createTableBody(); + Element tableHeader = foDocumentFacade.createTableHeader(); + Element tableBody = foDocumentFacade.createTableBody(); final int tableRows = table.numRows(); @@ -611,7 +520,7 @@ public class WordToFoExtractor extends AbstractToFoExtractor { TableRow tableRow = table.getRow( r ); - Element tableRowElement = createTableRow(); + Element tableRowElement = foDocumentFacade.createTableRow(); WordToFoUtils.setTableRowProperties( tableRow, tableRowElement ); final int rowCells = tableRow.numCells(); @@ -626,7 +535,7 @@ public class WordToFoExtractor extends AbstractToFoExtractor && !tableCell.isFirstVerticallyMerged() ) continue; - Element 
tableCellElement = createTableCell(); + Element tableCellElement = foDocumentFacade.createTableCell(); WordToFoUtils.setTableCellProperties( tableRow, tableCell, tableCellElement, r == 0, r == tableRows - 1, c == 0, c == rowCells - 1 ); @@ -649,9 +558,9 @@ public class WordToFoExtractor extends AbstractToFoExtractor { if ( c == rowCells - 1 && c != maxColumns - 1 ) { - tableCellElement - .setAttribute( "number-columns-spanned", "" - + (maxColumns - c) ); + tableCellElement.setAttribute( + "number-columns-spanned", "" + + ( maxColumns - c ) ); } } @@ -673,12 +582,13 @@ public class WordToFoExtractor extends AbstractToFoExtractor + count ); } - processSectionParagraphes( hwpfDocument, tableCellElement, + processSectionParagraphes( wordDocument, tableCellElement, tableCell, thisTableLevel ); if ( !tableCellElement.hasChildNodes() ) { - tableCellElement.appendChild( createBlock() ); + tableCellElement.appendChild( foDocumentFacade + .createBlock() ); } tableRowElement.appendChild( tableCellElement ); @@ -694,7 +604,7 @@ public class WordToFoExtractor extends AbstractToFoExtractor } } - final Element tableElement = createTable(); + final Element tableElement = foDocumentFacade.createTable(); if ( tableHeader.hasChildNodes() ) { tableElement.appendChild( tableHeader ); @@ -714,51 +624,4 @@ public class WordToFoExtractor extends AbstractToFoExtractor } } - protected int tryField( HWPFDocument hwpfDocument, Paragraph paragraph, - int currentTableLevel, int beginMark, Element currentBlock ) - { - int separatorMark = -1; - int endMark = -1; - for ( int c = beginMark + 1; c < paragraph.numCharacterRuns(); c++ ) - { - CharacterRun characterRun = paragraph.getCharacterRun( c ); - - String text = characterRun.text(); - if ( text.getBytes().length == 0 ) - continue; - - if ( text.getBytes()[0] == FIELD_SEPARATOR_MARK ) - { - if ( separatorMark != -1 ) - { - // double; - return beginMark; - } - - separatorMark = c; - continue; - } - - if ( text.getBytes()[0] == FIELD_END_MARK ) - 
{ - if ( endMark != -1 ) - { - // double; - return beginMark; - } - - endMark = c; - break; - } - - } - - if ( separatorMark == -1 || endMark == -1 ) - return beginMark; - - processField( hwpfDocument, currentBlock, paragraph, currentTableLevel, - beginMark, separatorMark, endMark ); - - return endMark; - } } diff --git a/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordToFoUtils.java b/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordToFoUtils.java index 5acd711138..1b3447f006 100644 --- a/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordToFoUtils.java +++ b/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordToFoUtils.java @@ -1,489 +1,323 @@ +/* ==================================================================== + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+==================================================================== */ package org.apache.poi.hwpf.extractor; -import java.lang.reflect.Constructor; -import java.lang.reflect.Field; - -import org.apache.poi.hwpf.model.ListLevel; -import org.apache.poi.hwpf.model.ListTables; import org.apache.poi.hwpf.usermodel.BorderCode; import org.apache.poi.hwpf.usermodel.CharacterProperties; import org.apache.poi.hwpf.usermodel.CharacterRun; import org.apache.poi.hwpf.usermodel.Paragraph; import org.apache.poi.hwpf.usermodel.Picture; -import org.apache.poi.hwpf.usermodel.Range; -import org.apache.poi.hwpf.usermodel.Section; -import org.apache.poi.hwpf.usermodel.SectionProperties; import org.apache.poi.hwpf.usermodel.TableCell; -import org.apache.poi.hwpf.usermodel.TableIterator; import org.apache.poi.hwpf.usermodel.TableRow; import org.w3c.dom.Element; -public class WordToFoUtils { - static final String EMPTY = ""; - - public static final float TWIPS_PER_INCH = 1440.0f; - - public static final int TWIPS_PER_PT = 20; - - static boolean equals(String str1, String str2) { - return str1 == null ? 
str2 == null : str1.equals(str2); - } - - public static String getBorderType(BorderCode borderCode) { - if (borderCode == null) - throw new IllegalArgumentException("borderCode is null"); - - switch (borderCode.getBorderType()) { - case 1: - case 2: - return "solid"; - case 3: - return "double"; - case 5: - return "solid"; - case 6: - return "dotted"; - case 7: - case 8: - return "dashed"; - case 9: - return "dotted"; - case 10: - case 11: - case 12: - case 13: - case 14: - case 15: - case 16: - case 17: - case 18: - case 19: - return "double"; - case 20: - return "solid"; - case 21: - return "double"; - case 22: - return "dashed"; - case 23: - return "dashed"; - case 24: - return "ridge"; - case 25: - return "grooved"; - default: - return "solid"; - } - } - - public static String getBorderWidth(BorderCode borderCode) { - int lineWidth = borderCode.getLineWidth(); - int pt = lineWidth / 8; - int pte = lineWidth - pt * 8; - - StringBuilder stringBuilder = new StringBuilder(); - stringBuilder.append(pt); - stringBuilder.append("."); - stringBuilder.append(1000 / 8 * pte); - stringBuilder.append("pt"); - return stringBuilder.toString(); - } - - public static String getBulletText(ListTables listTables, - Paragraph paragraph, int listId) { - final ListLevel listLevel = listTables.getLevel(listId, - paragraph.getIlvl()); - - if (listLevel.getNumberText() == null) - return EMPTY; - - StringBuffer bulletBuffer = new StringBuffer(); - char[] xst = listLevel.getNumberText().toCharArray(); - for (char element : xst) { - if (element < 9) { - ListLevel numLevel = listTables.getLevel(listId, element); - - int num = numLevel.getStartAt(); - bulletBuffer.append(NumberFormatter.getNumber(num, - listLevel.getNumberFormat())); - - if (numLevel == listLevel) { - numLevel.setStartAt(numLevel.getStartAt() + 1); - } - - } else { - bulletBuffer.append(element); - } - } - - byte follow = getIxchFollow(listLevel); - switch (follow) { - case 0: - bulletBuffer.append("\t"); - break; - case 1: 
- bulletBuffer.append(" "); - break; - default: - break; - } - - return bulletBuffer.toString(); - } - - public static String getColor(int ico) { - switch (ico) { - case 1: - return "black"; - case 2: - return "blue"; - case 3: - return "cyan"; - case 4: - return "green"; - case 5: - return "magenta"; - case 6: - return "red"; - case 7: - return "yellow"; - case 8: - return "white"; - case 9: - return "darkblue"; - case 10: - return "darkcyan"; - case 11: - return "darkgreen"; - case 12: - return "darkmagenta"; - case 13: - return "darkred"; - case 14: - return "darkyellow"; - case 15: - return "darkgray"; - case 16: - return "lightgray"; - default: - return "black"; - } +public class WordToFoUtils extends AbstractWordUtils +{ + public static void setBold( final Element element, final boolean bold ) + { + element.setAttribute( "font-weight", bold ? "bold" : "normal" ); } - public static byte getIxchFollow(ListLevel listLevel) { - try { - Field field = ListLevel.class.getDeclaredField("_ixchFollow"); - field.setAccessible(true); - return ((Byte) field.get(listLevel)).byteValue(); - } catch (Exception exc) { - throw new Error(exc); - } - } - - public static String getJustification(int js) { - switch (js) { - case 0: - return "start"; - case 1: - return "center"; - case 2: - return "end"; - case 3: - case 4: - return "justify"; - case 5: - return "center"; - case 6: - return "left"; - case 7: - return "start"; - case 8: - return "end"; - case 9: - return "justify"; + public static void setBorder( Element element, BorderCode borderCode, + String where ) + { + if ( element == null ) + throw new IllegalArgumentException( "element is null" ); + + if ( borderCode == null || borderCode.getBorderType() == 0 ) + return; + + if ( isEmpty( where ) ) + { + element.setAttribute( "border-style", getBorderType( borderCode ) ); + element.setAttribute( "border-color", + getColor( borderCode.getColor() ) ); + element.setAttribute( "border-width", getBorderWidth( borderCode ) ); + } + 
else + { + element.setAttribute( "border-" + where + "-style", + getBorderType( borderCode ) ); + element.setAttribute( "border-" + where + "-color", + getColor( borderCode.getColor() ) ); + element.setAttribute( "border-" + where + "-width", + getBorderWidth( borderCode ) ); } - return ""; - } - - public static String getListItemNumberLabel(int number, int format) { - - if (format != 0) - System.err.println("NYI: toListItemNumberLabel(): " + format); - - return String.valueOf(number); - } - - public static SectionProperties getSectionProperties(Section section) { - try { - Field field = Section.class.getDeclaredField("_props"); - field.setAccessible(true); - return (SectionProperties) field.get(section); - } catch (Exception exc) { - throw new Error(exc); - } - } - - static boolean isEmpty(String str) { - return str == null || str.length() == 0; - } - - static boolean isNotEmpty(String str) { - return !isEmpty(str); - } - - public static TableIterator newTableIterator(Range range, int level) { - try { - Constructor constructor = TableIterator.class - .getDeclaredConstructor(Range.class, int.class); - constructor.setAccessible(true); - return constructor.newInstance(range, Integer.valueOf(level)); - } catch (Exception exc) { - throw new Error(exc); - } - } - - public static void setBold(final Element element, final boolean bold) { - element.setAttribute("font-weight", bold ? 
"bold" : "normal"); - } - - public static void setBorder(Element element, BorderCode borderCode, - String where) { - if (element == null) - throw new IllegalArgumentException("element is null"); - - if (borderCode == null) - return; - - if (isEmpty(where)) { - element.setAttribute("border-style", getBorderType(borderCode)); - element.setAttribute("border-color", - getColor(borderCode.getColor())); - element.setAttribute("border-width", getBorderWidth(borderCode)); - } else { - element.setAttribute("border-" + where + "-style", - getBorderType(borderCode)); - element.setAttribute("border-" + where + "-color", - getColor(borderCode.getColor())); - element.setAttribute("border-" + where + "-width", - getBorderWidth(borderCode)); - } } - public static void setCharactersProperties(final CharacterRun characterRun, - final Element inline) { + public static void setCharactersProperties( + final CharacterRun characterRun, final Element inline ) + { final CharacterProperties clonedProperties = characterRun .cloneProperties(); StringBuilder textDecorations = new StringBuilder(); - setBorder(inline, clonedProperties.getBrc(), EMPTY); + setBorder( inline, clonedProperties.getBrc(), EMPTY ); - if (characterRun.isCapitalized()) { - inline.setAttribute("text-transform", "uppercase"); + if ( characterRun.isCapitalized() ) + { + inline.setAttribute( "text-transform", "uppercase" ); } - if (characterRun.isHighlighted()) { - inline.setAttribute("background-color", - getColor(clonedProperties.getIcoHighlight())); + if ( characterRun.isHighlighted() ) + { + inline.setAttribute( "background-color", + getColor( clonedProperties.getIcoHighlight() ) ); } - if (characterRun.isStrikeThrough()) { - if (textDecorations.length() > 0) - textDecorations.append(" "); - textDecorations.append("line-through"); + if ( characterRun.isStrikeThrough() ) + { + if ( textDecorations.length() > 0 ) + textDecorations.append( " " ); + textDecorations.append( "line-through" ); } - if (characterRun.isShadowed()) 
{ - inline.setAttribute("text-shadow", characterRun.getFontSize() / 24 - + "pt"); + if ( characterRun.isShadowed() ) + { + inline.setAttribute( "text-shadow", characterRun.getFontSize() / 24 + + "pt" ); } - if (characterRun.isSmallCaps()) { - inline.setAttribute("font-variant", "small-caps"); + if ( characterRun.isSmallCaps() ) + { + inline.setAttribute( "font-variant", "small-caps" ); } - if (characterRun.getSubSuperScriptIndex() == 1) { - inline.setAttribute("baseline-shift", "super"); - inline.setAttribute("font-size", "smaller"); + if ( characterRun.getSubSuperScriptIndex() == 1 ) + { + inline.setAttribute( "baseline-shift", "super" ); + inline.setAttribute( "font-size", "smaller" ); } - if (characterRun.getSubSuperScriptIndex() == 2) { - inline.setAttribute("baseline-shift", "sub"); - inline.setAttribute("font-size", "smaller"); + if ( characterRun.getSubSuperScriptIndex() == 2 ) + { + inline.setAttribute( "baseline-shift", "sub" ); + inline.setAttribute( "font-size", "smaller" ); } - if (characterRun.getUnderlineCode() > 0) { - if (textDecorations.length() > 0) - textDecorations.append(" "); - textDecorations.append("underline"); + if ( characterRun.getUnderlineCode() > 0 ) + { + if ( textDecorations.length() > 0 ) + textDecorations.append( " " ); + textDecorations.append( "underline" ); } - if (characterRun.isVanished()) { - inline.setAttribute("visibility", "hidden"); + if ( characterRun.isVanished() ) + { + inline.setAttribute( "visibility", "hidden" ); } - if (textDecorations.length() > 0) { - inline.setAttribute("text-decoration", textDecorations.toString()); + if ( textDecorations.length() > 0 ) + { + inline.setAttribute( "text-decoration", textDecorations.toString() ); } } - public static void setFontFamily(final Element element, - final String fontFamily) { - element.setAttribute("font-family", fontFamily); + public static void setFontFamily( final Element element, + final String fontFamily ) + { + if ( isEmpty( fontFamily ) ) + return; + + 
element.setAttribute( "font-family", fontFamily ); } - public static void setFontSize(final Element element, final int fontSize) { - element.setAttribute("font-size", String.valueOf(fontSize)); + public static void setFontSize( final Element element, final int fontSize ) + { + element.setAttribute( "font-size", String.valueOf( fontSize ) ); } - public static void setIndent(Paragraph paragraph, Element block) { - if (paragraph.getFirstLineIndent() != 0) { - block.setAttribute( - "text-indent", - String.valueOf(paragraph.getFirstLineIndent() - / TWIPS_PER_PT) - + "pt"); - } - if (paragraph.getIndentFromLeft() != 0) { - block.setAttribute( - "start-indent", - String.valueOf(paragraph.getIndentFromLeft() / TWIPS_PER_PT) - + "pt"); - } - if (paragraph.getIndentFromRight() != 0) { - block.setAttribute( - "end-indent", - String.valueOf(paragraph.getIndentFromRight() - / TWIPS_PER_PT) - + "pt"); - } - if (paragraph.getSpacingBefore() != 0) { - block.setAttribute("space-before", - String.valueOf(paragraph.getSpacingBefore() / TWIPS_PER_PT) - + "pt"); - } - if (paragraph.getSpacingAfter() != 0) { - block.setAttribute("space-after", - String.valueOf(paragraph.getSpacingAfter() / TWIPS_PER_PT) - + "pt"); - } + public static void setIndent( Paragraph paragraph, Element block ) + { + if ( paragraph.getFirstLineIndent() != 0 ) + { + block.setAttribute( + "text-indent", + String.valueOf( paragraph.getFirstLineIndent() + / TWIPS_PER_PT ) + + "pt" ); + } + if ( paragraph.getIndentFromLeft() != 0 ) + { + block.setAttribute( + "start-indent", + String.valueOf( paragraph.getIndentFromLeft() + / TWIPS_PER_PT ) + + "pt" ); + } + if ( paragraph.getIndentFromRight() != 0 ) + { + block.setAttribute( + "end-indent", + String.valueOf( paragraph.getIndentFromRight() + / TWIPS_PER_PT ) + + "pt" ); + } + if ( paragraph.getSpacingBefore() != 0 ) + { + block.setAttribute( + "space-before", + String.valueOf( paragraph.getSpacingBefore() / TWIPS_PER_PT ) + + "pt" ); + } + if ( 
paragraph.getSpacingAfter() != 0 ) + { + block.setAttribute( "space-after", + String.valueOf( paragraph.getSpacingAfter() / TWIPS_PER_PT ) + + "pt" ); + } } - public static void setItalic(final Element element, final boolean italic) { - element.setAttribute("font-style", italic ? "italic" : "normal"); + public static void setItalic( final Element element, final boolean italic ) + { + element.setAttribute( "font-style", italic ? "italic" : "normal" ); } - public static void setJustification(Paragraph paragraph, - final Element element) { - String justification = getJustification(paragraph.getJustification()); - if (isNotEmpty(justification)) - element.setAttribute("text-align", justification); + public static void setJustification( Paragraph paragraph, + final Element element ) + { + String justification = getJustification( paragraph.getJustification() ); + if ( isNotEmpty( justification ) ) + element.setAttribute( "text-align", justification ); } - public static void setParagraphProperties(Paragraph paragraph, Element block) { - setIndent(paragraph, block); - setJustification(paragraph, block); + public static void setParagraphProperties( Paragraph paragraph, + Element block ) + { + setIndent( paragraph, block ); + setJustification( paragraph, block ); - setBorder(block, paragraph.getBottomBorder(), "bottom"); - setBorder(block, paragraph.getLeftBorder(), "left"); - setBorder(block, paragraph.getRightBorder(), "right"); - setBorder(block, paragraph.getTopBorder(), "top"); + setBorder( block, paragraph.getBottomBorder(), "bottom" ); + setBorder( block, paragraph.getLeftBorder(), "left" ); + setBorder( block, paragraph.getRightBorder(), "right" ); + setBorder( block, paragraph.getTopBorder(), "top" ); - if (paragraph.pageBreakBefore()) { - block.setAttribute("break-before", "page"); - } + if ( paragraph.pageBreakBefore() ) + { + block.setAttribute( "break-before", "page" ); + } - block.setAttribute("hyphenate", - String.valueOf(paragraph.isAutoHyphenated())); + 
block.setAttribute( "hyphenate", + String.valueOf( paragraph.isAutoHyphenated() ) ); - if (paragraph.keepOnPage()) { - block.setAttribute("keep-together.within-page", "always"); - } + if ( paragraph.keepOnPage() ) + { + block.setAttribute( "keep-together.within-page", "always" ); + } - if (paragraph.keepWithNext()) { - block.setAttribute("keep-with-next.within-page", "always"); - } + if ( paragraph.keepWithNext() ) + { + block.setAttribute( "keep-with-next.within-page", "always" ); + } - block.setAttribute("linefeed-treatment", "preserve"); - block.setAttribute("white-space-collapse", "false"); + block.setAttribute( "linefeed-treatment", "preserve" ); + block.setAttribute( "white-space-collapse", "false" ); } - public static void setPictureProperties(Picture picture, - Element graphicElement) { + public static void setPictureProperties( Picture picture, + Element graphicElement ) + { final int aspectRatioX = picture.getAspectRatioX(); final int aspectRatioY = picture.getAspectRatioY(); - if (aspectRatioX > 0) { - graphicElement.setAttribute("content-width", ((picture.getDxaGoal() - * aspectRatioX / 100) / WordToFoUtils.TWIPS_PER_PT) - + "pt"); - } else - graphicElement.setAttribute("content-width", - (picture.getDxaGoal() / WordToFoUtils.TWIPS_PER_PT) + "pt"); + if ( aspectRatioX > 0 ) + { + graphicElement + .setAttribute( "content-width", ( ( picture.getDxaGoal() + * aspectRatioX / 100 ) / TWIPS_PER_PT ) + + "pt" ); + } + else + graphicElement.setAttribute( "content-width", + ( picture.getDxaGoal() / TWIPS_PER_PT ) + "pt" ); - if (aspectRatioY > 0) + if ( aspectRatioY > 0 ) graphicElement - .setAttribute("content-height", ((picture.getDyaGoal() - * aspectRatioY / 100) / WordToFoUtils.TWIPS_PER_PT) - + "pt"); + .setAttribute( "content-height", ( ( picture.getDyaGoal() + * aspectRatioY / 100 ) / TWIPS_PER_PT ) + + "pt" ); else - graphicElement.setAttribute("content-height", - (picture.getDyaGoal() / WordToFoUtils.TWIPS_PER_PT) + "pt"); + graphicElement.setAttribute( 
"content-height", + ( picture.getDyaGoal() / TWIPS_PER_PT ) + "pt" ); - if (aspectRatioX <= 0 || aspectRatioY <= 0) { - graphicElement.setAttribute("scaling", "uniform"); - } else { - graphicElement.setAttribute("scaling", "non-uniform"); + if ( aspectRatioX <= 0 || aspectRatioY <= 0 ) + { + graphicElement.setAttribute( "scaling", "uniform" ); + } + else + { + graphicElement.setAttribute( "scaling", "non-uniform" ); } - graphicElement.setAttribute("vertical-align", "text-bottom"); + graphicElement.setAttribute( "vertical-align", "text-bottom" ); - if (picture.getDyaCropTop() != 0 || picture.getDxaCropRight() != 0 + if ( picture.getDyaCropTop() != 0 || picture.getDxaCropRight() != 0 || picture.getDyaCropBottom() != 0 - || picture.getDxaCropLeft() != 0) { - int rectTop = picture.getDyaCropTop() / WordToFoUtils.TWIPS_PER_PT; - int rectRight = picture.getDxaCropRight() - / WordToFoUtils.TWIPS_PER_PT; - int rectBottom = picture.getDyaCropBottom() - / WordToFoUtils.TWIPS_PER_PT; - int rectLeft = picture.getDxaCropLeft() - / WordToFoUtils.TWIPS_PER_PT; - graphicElement.setAttribute("clip", "rect(" + rectTop + "pt, " + || picture.getDxaCropLeft() != 0 ) + { + int rectTop = picture.getDyaCropTop() / TWIPS_PER_PT; + int rectRight = picture.getDxaCropRight() / TWIPS_PER_PT; + int rectBottom = picture.getDyaCropBottom() / TWIPS_PER_PT; + int rectLeft = picture.getDxaCropLeft() / TWIPS_PER_PT; + graphicElement.setAttribute( "clip", "rect(" + rectTop + "pt, " + rectRight + "pt, " + rectBottom + "pt, " + rectLeft - + "pt)"); - graphicElement.setAttribute("oveerflow", "hidden"); + + "pt)" ); + graphicElement.setAttribute( "oveerflow", "hidden" ); } } - public static void setTableCellProperties(TableRow tableRow, - TableCell tableCell, Element element, boolean toppest, - boolean bottomest, boolean leftest, boolean rightest) { - element.setAttribute("width", (tableCell.getWidth() / TWIPS_PER_INCH) - + "in"); - element.setAttribute("padding-start", - (tableRow.getGapHalf() / 
TWIPS_PER_INCH) + "in"); - element.setAttribute("padding-end", - (tableRow.getGapHalf() / TWIPS_PER_INCH) + "in"); - - BorderCode top = tableCell.getBrcTop() != null ? tableCell.getBrcTop() - : toppest ? tableRow.getTopBorder() : tableRow - .getHorizontalBorder(); - BorderCode bottom = tableCell.getBrcBottom() != null ? tableCell - .getBrcBottom() : bottomest ? tableRow.getBottomBorder() - : tableRow.getHorizontalBorder(); - - BorderCode left = tableCell.getBrcLeft() != null ? tableCell - .getBrcLeft() : leftest ? tableRow.getLeftBorder() : tableRow - .getVerticalBorder(); - BorderCode right = tableCell.getBrcRight() != null ? tableCell - .getBrcRight() : rightest ? tableRow.getRightBorder() - : tableRow.getVerticalBorder(); - - setBorder(element, bottom, "bottom"); - setBorder(element, left, "left"); - setBorder(element, right, "right"); - setBorder(element, top, "top"); + public static void setTableCellProperties( TableRow tableRow, + TableCell tableCell, Element element, boolean toppest, + boolean bottomest, boolean leftest, boolean rightest ) + { + element.setAttribute( "width", ( tableCell.getWidth() / TWIPS_PER_INCH ) + + "in" ); + element.setAttribute( "padding-start", + ( tableRow.getGapHalf() / TWIPS_PER_INCH ) + "in" ); + element.setAttribute( "padding-end", + ( tableRow.getGapHalf() / TWIPS_PER_INCH ) + "in" ); + + BorderCode top = tableCell.getBrcTop() != null + && tableCell.getBrcTop().getBorderType() != 0 ? tableCell + .getBrcTop() : toppest ? tableRow.getTopBorder() : tableRow + .getHorizontalBorder(); + BorderCode bottom = tableCell.getBrcBottom() != null + && tableCell.getBrcBottom().getBorderType() != 0 ? tableCell + .getBrcBottom() : bottomest ? tableRow.getBottomBorder() + : tableRow.getHorizontalBorder(); + + BorderCode left = tableCell.getBrcLeft() != null + && tableCell.getBrcLeft().getBorderType() != 0 ? tableCell + .getBrcLeft() : leftest ? 
tableRow.getLeftBorder() : tableRow + .getVerticalBorder(); + BorderCode right = tableCell.getBrcRight() != null + && tableCell.getBrcRight().getBorderType() != 0 ? tableCell + .getBrcRight() : rightest ? tableRow.getRightBorder() + : tableRow.getVerticalBorder(); + + setBorder( element, bottom, "bottom" ); + setBorder( element, left, "left" ); + setBorder( element, right, "right" ); + setBorder( element, top, "top" ); } - public static void setTableRowProperties(TableRow tableRow, - Element tableRowElement) { - if (tableRow.getRowHeight() > 0) { - tableRowElement.setAttribute("height", - (tableRow.getRowHeight() / TWIPS_PER_INCH) + "in"); - } - if (!tableRow.cantSplit()) { - tableRowElement.setAttribute("keep-together", "always"); - } + public static void setTableRowProperties( TableRow tableRow, + Element tableRowElement ) + { + if ( tableRow.getRowHeight() > 0 ) + { + tableRowElement.setAttribute( "height", + ( tableRow.getRowHeight() / TWIPS_PER_INCH ) + "in" ); + } + if ( !tableRow.cantSplit() ) + { + tableRowElement.setAttribute( "keep-together", "always" ); + } } } diff --git a/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordToHtmlExtractor.java b/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordToHtmlExtractor.java new file mode 100644 index 0000000000..6f27e443c8 --- /dev/null +++ b/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordToHtmlExtractor.java @@ -0,0 +1,475 @@ +/* ==================================================================== + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. 
You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +==================================================================== */ +package org.apache.poi.hwpf.extractor; + +import java.io.File; +import java.io.FileWriter; +import java.util.List; +import java.util.Stack; + +import javax.xml.parsers.DocumentBuilderFactory; +import javax.xml.transform.OutputKeys; +import javax.xml.transform.Transformer; +import javax.xml.transform.TransformerFactory; +import javax.xml.transform.dom.DOMSource; +import javax.xml.transform.stream.StreamResult; + +import org.apache.poi.hwpf.HWPFDocument; +import org.apache.poi.hwpf.HWPFDocumentCore; +import org.apache.poi.hwpf.usermodel.CharacterRun; +import org.apache.poi.hwpf.usermodel.Paragraph; +import org.apache.poi.hwpf.usermodel.Picture; +import org.apache.poi.hwpf.usermodel.Section; +import org.apache.poi.hwpf.usermodel.SectionProperties; +import org.apache.poi.hwpf.usermodel.Table; +import org.apache.poi.hwpf.usermodel.TableCell; +import org.apache.poi.hwpf.usermodel.TableRow; +import org.apache.poi.util.POILogFactory; +import org.apache.poi.util.POILogger; +import org.w3c.dom.Document; +import org.w3c.dom.Element; +import org.w3c.dom.Text; + +import static org.apache.poi.hwpf.extractor.AbstractWordUtils.TWIPS_PER_INCH; + +/** + * @author Sergey Vladimirov (vlsergey {at} gmail {dot} com) + */ +public class WordToHtmlExtractor extends AbstractWordExtractor +{ + + /** + * Holds properties values, applied to current p element. Those + * properties shall not be doubled in children span elements. 
+ */ + private static class BlockProperies + { + final String pFontName; + final int pFontSize; + + public BlockProperies( String pFontName, int pFontSize ) + { + this.pFontName = pFontName; + this.pFontSize = pFontSize; + } + } + + private static final POILogger logger = POILogFactory + .getLogger( WordToHtmlExtractor.class ); + + private static String getSectionStyle( Section section ) + { + SectionProperties sep = WordToHtmlUtils.getSectionProperties( section ); + + float leftMargin = sep.getDxaLeft() / TWIPS_PER_INCH; + float rightMargin = sep.getDxaRight() / TWIPS_PER_INCH; + float topMargin = sep.getDyaTop() / TWIPS_PER_INCH; + float bottomMargin = sep.getDyaBottom() / TWIPS_PER_INCH; + + String style = "margin: " + topMargin + "in " + rightMargin + "in " + + bottomMargin + "in " + leftMargin + "in; "; + + if ( sep.getCcolM1() > 0 ) + { + style += "column-count: " + ( sep.getCcolM1() + 1 ) + "; "; + if ( sep.getFEvenlySpaced() ) + { + style += "column-gap: " + + ( sep.getDxaColumns() / TWIPS_PER_INCH ) + "in; "; + } + else + { + style += "column-gap: 0.25in; "; + } + } + return style; + } + + /** + * Java main() interface to interact with WordToHtmlExtractor + * + *

<p> + * Usage: WordToHtmlExtractor infile outfile + * </p>
+ * Where infile is an input .doc file ( Word 95-2007) which will be rendered + * as HTML into outfile + */ + public static void main( String[] args ) + { + if ( args.length < 2 ) + { + System.err + .println( "Usage: WordToHtmlExtractor " ); + return; + } + + System.out.println( "Converting " + args[0] ); + System.out.println( "Saving output to " + args[1] ); + try + { + Document doc = WordToHtmlExtractor.process( new File( args[0] ) ); + + FileWriter out = new FileWriter( args[1] ); + DOMSource domSource = new DOMSource( doc ); + StreamResult streamResult = new StreamResult( out ); + + TransformerFactory tf = TransformerFactory.newInstance(); + Transformer serializer = tf.newTransformer(); + // TODO set encoding from a command argument + serializer.setOutputProperty( OutputKeys.ENCODING, "UTF-8" ); + serializer.setOutputProperty( OutputKeys.INDENT, "yes" ); + serializer.setOutputProperty( OutputKeys.METHOD, "html" ); + serializer.transform( domSource, streamResult ); + out.close(); + } + catch ( Exception e ) + { + e.printStackTrace(); + } + } + + static Document process( File docFile ) throws Exception + { + final HWPFDocumentCore wordDocument = WordToHtmlUtils.loadDoc( docFile ); + WordToHtmlExtractor wordToHtmlExtractor = new WordToHtmlExtractor( + DocumentBuilderFactory.newInstance().newDocumentBuilder() + .newDocument() ); + wordToHtmlExtractor.processDocument( wordDocument ); + return wordToHtmlExtractor.getDocument(); + } + + private final Stack blocksProperies = new Stack(); + + private final HtmlDocumentFacade htmlDocumentFacade; + + /** + * Creates new instance of {@link WordToHtmlExtractor}. Can be used for + * output several {@link HWPFDocument}s into single HTML document. 
+ * + * @param document + * XML DOM Document used as HTML document + */ + public WordToHtmlExtractor( Document document ) + { + this.htmlDocumentFacade = new HtmlDocumentFacade( document ); + } + + public Document getDocument() + { + return htmlDocumentFacade.getDocument(); + } + + @Override + protected void outputCharacters( Element pElement, + CharacterRun characterRun, String text ) + { + Element span = htmlDocumentFacade.document.createElement( "span" ); + pElement.appendChild( span ); + + StringBuilder style = new StringBuilder(); + BlockProperies blockProperies = this.blocksProperies.peek(); + if ( characterRun.getFontName() != null + && !WordToHtmlUtils.equals( characterRun.getFontName(), + blockProperies.pFontName ) ) + { + style.append( "font-family: " + characterRun.getFontName() + "; " ); + } + if ( characterRun.getFontSize() / 2 != blockProperies.pFontSize ) + { + style.append( "font-size: " + characterRun.getFontSize() / 2 + "; " ); + } + + WordToHtmlUtils.addCharactersProperties( characterRun, style ); + if ( style.length() != 0 ) + span.setAttribute( "style", style.toString() ); + + Text textNode = htmlDocumentFacade.createText( text ); + span.appendChild( textNode ); + } + + protected void processHyperlink( HWPFDocumentCore wordDocument, + Element currentBlock, Paragraph paragraph, + List characterRuns, int currentTableLevel, + String hyperlink, int beginTextInclusive, int endTextExclusive ) + { + Element basicLink = htmlDocumentFacade.createHyperlink( hyperlink ); + currentBlock.appendChild( basicLink ); + + if ( beginTextInclusive < endTextExclusive ) + processCharacters( wordDocument, currentTableLevel, paragraph, + basicLink, characterRuns, beginTextInclusive, + endTextExclusive ); + } + + /** + * This method shall store image bytes in external file and convert it if + * necessary. Images shall be stored using PNG format. Other formats may be + * not supported by user browser. + *

+ * Please note the + * {@link WordToHtmlUtils#setPictureProperties(Picture, Element)} method. + * + * @param currentBlock + * currently processed HTML element, like p. Shall be + * used as parent of newly created img + * @param inlined + * if image is inlined + * @param picture + * HWPF object, contained picture data and properties + */ + protected void processImage( Element currentBlock, boolean inlined, + Picture picture ) + { + // no default implementation -- skip + currentBlock.appendChild( htmlDocumentFacade.document + .createComment( "Image link to '" + + picture.suggestFullFileName() + "' can be here" ) ); + } + + protected void processPageref( HWPFDocumentCore hwpfDocument, + Element currentBlock, Paragraph paragraph, + List characterRuns, int currentTableLevel, + String pageref, int beginTextInclusive, int endTextExclusive ) + { + Element basicLink = htmlDocumentFacade.createHyperlink( "#" + pageref ); + currentBlock.appendChild( basicLink ); + + if ( beginTextInclusive < endTextExclusive ) + processCharacters( hwpfDocument, currentTableLevel, paragraph, + basicLink, characterRuns, beginTextInclusive, + endTextExclusive ); + } + + protected void processParagraph( HWPFDocumentCore hwpfDocument, + Element parentFopElement, int currentTableLevel, + Paragraph paragraph, String bulletText ) + { + final Element pElement = htmlDocumentFacade.createParagraph(); + parentFopElement.appendChild( pElement ); + + StringBuilder style = new StringBuilder(); + WordToHtmlUtils.addParagraphProperties( paragraph, style ); + + final int charRuns = paragraph.numCharacterRuns(); + + if ( charRuns == 0 ) + { + return; + } + + { + final String pFontName; + final int pFontSize; + final CharacterRun characterRun = paragraph.getCharacterRun( 0 ); + if ( characterRun != null ) + { + pFontSize = characterRun.getFontSize() / 2; + pFontName = characterRun.getFontName(); + WordToHtmlUtils.addFontFamily( pFontName, style ); + WordToHtmlUtils.addFontSize( pFontSize, style ); + } + else + 
{ + pFontSize = -1; + pFontName = WordToHtmlUtils.EMPTY; + } + blocksProperies.push( new BlockProperies( pFontName, pFontSize ) ); + } + try + { + if ( WordToHtmlUtils.isNotEmpty( bulletText ) ) + { + Text textNode = htmlDocumentFacade.createText( bulletText ); + pElement.appendChild( textNode ); + } + + List characterRuns = WordToHtmlUtils + .findCharacterRuns( paragraph ); + processCharacters( hwpfDocument, currentTableLevel, paragraph, + pElement, characterRuns, 0, characterRuns.size() ); + } + finally + { + blocksProperies.pop(); + } + + if ( style.length() > 0 ) + pElement.setAttribute( "style", style.toString() ); + + return; + } + + protected void processSection( HWPFDocumentCore wordDocument, + Section section, int sectionCounter ) + { + Element div = htmlDocumentFacade.document.createElement( "div" ); + div.setAttribute( "style", getSectionStyle( section ) ); + htmlDocumentFacade.body.appendChild( div ); + + processSectionParagraphes( wordDocument, div, section, 0 ); + } + + @Override + protected void processSingleSection( HWPFDocumentCore wordDocument, + Section section ) + { + htmlDocumentFacade.body.setAttribute( "style", + getSectionStyle( section ) ); + + processSectionParagraphes( wordDocument, htmlDocumentFacade.body, + section, 0 ); + } + + protected void processTable( HWPFDocumentCore hwpfDocument, Element flow, + Table table, int thisTableLevel ) + { + Element tableHeader = htmlDocumentFacade.createTableHeader(); + Element tableBody = htmlDocumentFacade.createTableBody(); + + final int tableRows = table.numRows(); + + int maxColumns = Integer.MIN_VALUE; + for ( int r = 0; r < tableRows; r++ ) + { + maxColumns = Math.max( maxColumns, table.getRow( r ).numCells() ); + } + + for ( int r = 0; r < tableRows; r++ ) + { + TableRow tableRow = table.getRow( r ); + + Element tableRowElement = htmlDocumentFacade.createTableRow(); + StringBuilder tableRowStyle = new StringBuilder(); + WordToHtmlUtils.addTableRowProperties( tableRow, tableRowStyle ); + + 
final int rowCells = tableRow.numCells(); + for ( int c = 0; c < rowCells; c++ ) + { + TableCell tableCell = tableRow.getCell( c ); + + if ( tableCell.isMerged() && !tableCell.isFirstMerged() ) + continue; + + if ( tableCell.isVerticallyMerged() + && !tableCell.isFirstVerticallyMerged() ) + continue; + + Element tableCellElement; + if ( tableRow.isTableHeader() ) + { + tableCellElement = htmlDocumentFacade + .createTableHeaderCell(); + } + else + { + tableCellElement = htmlDocumentFacade.createTableCell(); + } + StringBuilder tableCellStyle = new StringBuilder(); + WordToHtmlUtils.addTableCellProperties( tableRow, tableCell, + r == 0, r == tableRows - 1, c == 0, c == rowCells - 1, + tableCellStyle ); + + if ( tableCell.isFirstMerged() ) + { + int count = 0; + for ( int c1 = c; c1 < rowCells; c1++ ) + { + TableCell nextCell = tableRow.getCell( c1 ); + if ( nextCell.isMerged() ) + count++; + if ( !nextCell.isMerged() ) + break; + } + tableCellElement.setAttribute( "colspan", "" + count ); + } + else + { + if ( c == rowCells - 1 && c != maxColumns - 1 ) + { + tableCellElement.setAttribute( "colspan", "" + + ( maxColumns - c ) ); + } + } + + if ( tableCell.isFirstVerticallyMerged() ) + { + int count = 0; + for ( int r1 = r; r1 < tableRows; r1++ ) + { + TableRow nextRow = table.getRow( r1 ); + if ( nextRow.numCells() < c ) + break; + TableCell nextCell = nextRow.getCell( c ); + if ( nextCell.isVerticallyMerged() ) + count++; + if ( !nextCell.isVerticallyMerged() ) + break; + } + tableCellElement.setAttribute( "rowspan", "" + count ); + } + + processSectionParagraphes( hwpfDocument, tableCellElement, + tableCell, thisTableLevel ); + + if ( !tableCellElement.hasChildNodes() ) + { + tableCellElement.appendChild( htmlDocumentFacade + .createParagraph() ); + } + if ( tableCellStyle.length() > 0 ) + tableCellElement.setAttribute( "style", + tableCellStyle.toString() ); + + tableRowElement.appendChild( tableCellElement ); + } + + if ( tableRowStyle.length() > 0 ) + 
tableRowElement + .setAttribute( "style", tableRowStyle.toString() ); + + if ( tableRow.isTableHeader() ) + { + tableHeader.appendChild( tableRowElement ); + } + else + { + tableBody.appendChild( tableRowElement ); + } + + } + + final Element tableElement = htmlDocumentFacade.createTable(); + if ( tableHeader.hasChildNodes() ) + { + tableElement.appendChild( tableHeader ); + } + if ( tableBody.hasChildNodes() ) + { + tableElement.appendChild( tableBody ); + flow.appendChild( tableElement ); + } + else + { + logger.log( + POILogger.WARN, + "Table without body starting on offset " + + table.getStartOffset() + " -- " + + table.getEndOffset() ); + } + } + +} diff --git a/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordToHtmlUtils.java b/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordToHtmlUtils.java new file mode 100644 index 0000000000..4417f62017 --- /dev/null +++ b/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordToHtmlUtils.java @@ -0,0 +1,292 @@ +/* ==================================================================== + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+==================================================================== */ +package org.apache.poi.hwpf.extractor; + +import org.apache.poi.hwpf.usermodel.BorderCode; +import org.apache.poi.hwpf.usermodel.CharacterProperties; +import org.apache.poi.hwpf.usermodel.CharacterRun; +import org.apache.poi.hwpf.usermodel.Paragraph; +import org.apache.poi.hwpf.usermodel.Picture; +import org.apache.poi.hwpf.usermodel.TableCell; +import org.apache.poi.hwpf.usermodel.TableRow; +import org.w3c.dom.Element; + +public class WordToHtmlUtils extends AbstractWordUtils +{ + public static void addBold( final boolean bold, StringBuilder style ) + { + style.append( "font-weight: " + ( bold ? "bold" : "normal" ) + ";" ); + } + + public static void addBorder( BorderCode borderCode, String where, + StringBuilder style ) + { + if ( borderCode == null || borderCode.getBorderType() == 0 ) + return; + + if ( isEmpty( where ) ) + { + style.append( "border-style: " + getBorderType( borderCode ) + "; " ); + style.append( "border-color: " + getColor( borderCode.getColor() ) + + "; " ); + style.append( "border-width: " + getBorderWidth( borderCode ) + + "; " ); + } + else + { + style.append( "border-" + where + "-style: " + + getBorderType( borderCode ) + "; " ); + style.append( "border-" + where + "-color: " + + getColor( borderCode.getColor() ) + "; " ); + style.append( "border-" + where + "-width: " + + getBorderWidth( borderCode ) + "; " ); + } + } + + public static void addCharactersProperties( + final CharacterRun characterRun, StringBuilder style ) + { + final CharacterProperties clonedProperties = characterRun + .cloneProperties(); + + if ( characterRun.isBold() ) + { + style.append( "font-weight: bold; " ); + } + if ( characterRun.isItalic() ) + { + style.append( "font-style: italic; " ); + } + + addBorder( clonedProperties.getBrc(), EMPTY, style ); + + if ( characterRun.isCapitalized() ) + { + style.append( "text-transform: uppercase; " ); + } + if ( characterRun.isHighlighted() ) + { + 
style.append( "background-color: " + + getColor( clonedProperties.getIcoHighlight() ) + "; " ); + } + if ( characterRun.isStrikeThrough() ) + { + style.append( "text-decoration: line-through; " ); + } + if ( characterRun.isShadowed() ) + { + style.append( "text-shadow: " + characterRun.getFontSize() / 24 + + "pt; " ); + } + if ( characterRun.isSmallCaps() ) + { + style.append( "font-variant: small-caps; " ); + } + if ( characterRun.getSubSuperScriptIndex() == 1 ) + { + style.append( "baseline-shift: super; " ); + style.append( "font-size: smaller; " ); + } + if ( characterRun.getSubSuperScriptIndex() == 2 ) + { + style.append( "baseline-shift: sub; " ); + style.append( "font-size: smaller; " ); + } + if ( characterRun.getUnderlineCode() > 0 ) + { + style.append( "text-decoration: underline; " ); + } + if ( characterRun.isVanished() ) + { + style.append( "visibility: hidden; " ); + } + } + + public static void addFontFamily( final String fontFamily, + StringBuilder style ) + { + if ( isEmpty( fontFamily ) ) + return; + + style.append( "font-family: " + fontFamily ); + } + + public static void addFontSize( final int fontSize, StringBuilder style ) + { + style.append( "font-size: " + fontSize ); + } + + public static void addIndent( Paragraph paragraph, StringBuilder style ) + { + addIndent( style, "text-indent", paragraph.getFirstLineIndent() ); + addIndent( style, "start-indent", paragraph.getIndentFromLeft() ); + addIndent( style, "end-indent", paragraph.getIndentFromRight() ); + addIndent( style, "space-before", paragraph.getSpacingBefore() ); + addIndent( style, "space-after", paragraph.getSpacingAfter() ); + } + + private static void addIndent( StringBuilder style, final String cssName, + final int twipsValue ) + { + if ( twipsValue == 0 ) + return; + + style.append( cssName + ": " + ( twipsValue / TWIPS_PER_PT ) + "pt; " ); + } + + public static void addJustification( Paragraph paragraph, + final StringBuilder style ) + { + String justification = 
getJustification( paragraph.getJustification() ); + if ( isNotEmpty( justification ) ) + style.append( "text-align: " + justification + "; " ); + } + + public static void addParagraphProperties( Paragraph paragraph, + StringBuilder style ) + { + addIndent( paragraph, style ); + addJustification( paragraph, style ); + + addBorder( paragraph.getBottomBorder(), "bottom", style ); + addBorder( paragraph.getLeftBorder(), "left", style ); + addBorder( paragraph.getRightBorder(), "right", style ); + addBorder( paragraph.getTopBorder(), "top", style ); + + if ( paragraph.pageBreakBefore() ) + { + style.append( "break-before: page; " ); + } + + style.append( "hyphenate: " + paragraph.isAutoHyphenated() + "; " ); + + if ( paragraph.keepOnPage() ) + { + style.append( "keep-together.within-page: always; " ); + } + + if ( paragraph.keepWithNext() ) + { + style.append( "keep-with-next.within-page: always; " ); + } + + style.append( "linefeed-treatment: preserve; " ); + style.append( "white-space-collapse: false; " ); + } + + public static void addTableCellProperties( TableRow tableRow, + TableCell tableCell, boolean toppest, boolean bottomest, + boolean leftest, boolean rightest, StringBuilder style ) + { + style.append( "width: " + ( tableCell.getWidth() / TWIPS_PER_INCH ) + + "in; " ); + style.append( "padding-start: " + + ( tableRow.getGapHalf() / TWIPS_PER_INCH ) + "in; " ); + style.append( "padding-end: " + + ( tableRow.getGapHalf() / TWIPS_PER_INCH ) + "in; " ); + + BorderCode top = tableCell.getBrcTop() != null + && tableCell.getBrcTop().getBorderType() != 0 ? tableCell + .getBrcTop() : toppest ? tableRow.getTopBorder() : tableRow + .getHorizontalBorder(); + BorderCode bottom = tableCell.getBrcBottom() != null + && tableCell.getBrcBottom().getBorderType() != 0 ? tableCell + .getBrcBottom() : bottomest ? tableRow.getBottomBorder() + : tableRow.getHorizontalBorder(); + + BorderCode left = tableCell.getBrcLeft() != null + && tableCell.getBrcLeft().getBorderType() != 0 ? 
tableCell + .getBrcLeft() : leftest ? tableRow.getLeftBorder() : tableRow + .getVerticalBorder(); + BorderCode right = tableCell.getBrcRight() != null + && tableCell.getBrcRight().getBorderType() != 0 ? tableCell + .getBrcRight() : rightest ? tableRow.getRightBorder() + : tableRow.getVerticalBorder(); + + addBorder( bottom, "bottom", style ); + addBorder( left, "left", style ); + addBorder( right, "right", style ); + addBorder( top, "top", style ); + } + + public static void addTableRowProperties( TableRow tableRow, + StringBuilder style ) + { + if ( tableRow.getRowHeight() > 0 ) + { + style.append( "height: " + + ( tableRow.getRowHeight() / TWIPS_PER_INCH ) + "in; " ); + } + if ( !tableRow.cantSplit() ) + { + style.append( "keep-together: always; " ); + } + } + + public static void setPictureProperties( Picture picture, + Element graphicElement ) + { + final int aspectRatioX = picture.getAspectRatioX(); + final int aspectRatioY = picture.getAspectRatioY(); + + if ( aspectRatioX > 0 ) + { + graphicElement + .setAttribute( "content-width", ( ( picture.getDxaGoal() + * aspectRatioX / 100 ) / TWIPS_PER_PT ) + + "pt" ); + } + else + graphicElement.setAttribute( "content-width", + ( picture.getDxaGoal() / TWIPS_PER_PT ) + "pt" ); + + if ( aspectRatioY > 0 ) + graphicElement + .setAttribute( "content-height", ( ( picture.getDyaGoal() + * aspectRatioY / 100 ) / TWIPS_PER_PT ) + + "pt" ); + else + graphicElement.setAttribute( "content-height", + ( picture.getDyaGoal() / TWIPS_PER_PT ) + "pt" ); + + if ( aspectRatioX <= 0 || aspectRatioY <= 0 ) + { + graphicElement.setAttribute( "scaling", "uniform" ); + } + else + { + graphicElement.setAttribute( "scaling", "non-uniform" ); + } + + graphicElement.setAttribute( "vertical-align", "text-bottom" ); + + if ( picture.getDyaCropTop() != 0 || picture.getDxaCropRight() != 0 + || picture.getDyaCropBottom() != 0 + || picture.getDxaCropLeft() != 0 ) + { + int rectTop = picture.getDyaCropTop() / TWIPS_PER_PT; + int rectRight = 
picture.getDxaCropRight() / TWIPS_PER_PT; + int rectBottom = picture.getDyaCropBottom() / TWIPS_PER_PT; + int rectLeft = picture.getDxaCropLeft() / TWIPS_PER_PT; + graphicElement.setAttribute( "clip", "rect(" + rectTop + "pt, " + + rectRight + "pt, " + rectBottom + "pt, " + rectLeft + + "pt)" ); + graphicElement.setAttribute( "oveerflow", "hidden" ); + } + } + +} diff --git a/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordToExtractorSuite.java b/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordToExtractorSuite.java new file mode 100644 index 0000000000..62cfb999bc --- /dev/null +++ b/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordToExtractorSuite.java @@ -0,0 +1,114 @@ +/* ==================================================================== + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+==================================================================== */ +package org.apache.poi.hwpf.extractor; + +import java.io.File; +import java.io.FilenameFilter; +import java.io.StringWriter; +import java.util.Arrays; +import java.util.List; + +import javax.xml.parsers.DocumentBuilderFactory; +import javax.xml.transform.OutputKeys; +import javax.xml.transform.Transformer; +import javax.xml.transform.TransformerFactory; +import javax.xml.transform.dom.DOMSource; +import javax.xml.transform.stream.StreamResult; + +import junit.framework.Test; +import junit.framework.TestCase; +import junit.framework.TestSuite; +import org.apache.poi.POIDataSamples; +import org.apache.poi.hwpf.HWPFDocumentCore; + +public class TestWordToExtractorSuite +{ + /** + * YK: a quick hack to exclude failing documents from the suite. + */ + private static List failingFiles = Arrays.asList(); + + public static Test suite() + { + TestSuite suite = new TestSuite(); + + File directory = POIDataSamples.getDocumentInstance().getFile( + "../document" ); + for ( final File child : directory.listFiles( new FilenameFilter() + { + public boolean accept( File dir, String name ) + { + return name.endsWith( ".doc" ) && !failingFiles.contains( name ); + } + } ) ) + { + final String name = child.getName(); + + suite.addTest( new TestCase( name + " [FO]" ) + { + public void runTest() throws Exception + { + test( child, false ); + } + } ); + suite.addTest( new TestCase( name + " [HTML]" ) + { + public void runTest() throws Exception + { + test( child, true ); + } + } ); + + } + + return suite; + } + + protected static void test( File child, boolean html ) throws Exception + { + HWPFDocumentCore hwpfDocument; + try + { + hwpfDocument = AbstractWordUtils.loadDoc( child ); + } + catch ( Exception exc ) + { + // unable to parse file -- not WordToFoExtractor fault + return; + } + + WordToFoExtractor wordToFoExtractor = new WordToFoExtractor( + DocumentBuilderFactory.newInstance().newDocumentBuilder() + 
.newDocument() ); + wordToFoExtractor.processDocument( hwpfDocument ); + + StringWriter stringWriter = new StringWriter(); + + Transformer transformer = TransformerFactory.newInstance() + .newTransformer(); + transformer.setOutputProperty( OutputKeys.ENCODING, "utf-8" ); + transformer.setOutputProperty( OutputKeys.INDENT, "yes" ); + transformer.transform( + new DOMSource( wordToFoExtractor.getDocument() ), + new StreamResult( stringWriter ) ); + + if ( html ) + transformer.setOutputProperty( OutputKeys.METHOD, "html" ); + + // no exceptions + } +} diff --git a/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordToFoExtractorSuite.java b/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordToFoExtractorSuite.java deleted file mode 100644 index 4844cf69ba..0000000000 --- a/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordToFoExtractorSuite.java +++ /dev/null @@ -1,92 +0,0 @@ -package org.apache.poi.hwpf.extractor; - -import java.io.File; -import java.io.FileInputStream; -import java.io.FilenameFilter; -import java.io.StringWriter; -import java.util.Arrays; -import java.util.Collections; -import java.util.List; -import java.util.Set; - -import javax.xml.parsers.DocumentBuilderFactory; -import javax.xml.transform.OutputKeys; -import javax.xml.transform.Transformer; -import javax.xml.transform.TransformerFactory; -import javax.xml.transform.dom.DOMSource; -import javax.xml.transform.stream.StreamResult; - -import org.apache.poi.EncryptedDocumentException; - -import org.apache.poi.hwpf.OldWordFileFormatException; - -import junit.framework.Test; -import junit.framework.TestCase; -import junit.framework.TestSuite; -import org.apache.poi.POIDataSamples; -import org.apache.poi.hwpf.HWPFDocument; - -public class TestWordToFoExtractorSuite -{ - /** - * YK: a quick hack to exclude failing documents from the suite. 
- * - * WordToFoExtractor stumbles on Bug33519.doc with a NPE - */ - private static List failingFiles = Arrays.asList("Bug33519.doc"); - - public static Test suite() { - TestSuite suite = new TestSuite(); - - File directory = POIDataSamples.getDocumentInstance().getFile( - "../document"); - for (final File child : directory.listFiles(new FilenameFilter() { - public boolean accept(File dir, String name) { - return name.endsWith(".doc") && !failingFiles.contains(name); - } - })) { - final String name = child.getName(); - suite.addTest(new TestCase(name) { - public void runTest() throws Exception { - test(child); - } - }); - } - - return suite; - } - - protected static void test( File child ) throws Exception - { - HWPFDocument hwpfDocument; - FileInputStream fileInputStream = new FileInputStream( child ); - try - { - hwpfDocument = new HWPFDocument( fileInputStream ); - } - catch ( Exception exc ) - { - // unable to parse file -- not WordToFoExtractor fault - return; - } - finally - { - fileInputStream.close(); - } - - WordToFoExtractor wordToFoExtractor = new WordToFoExtractor( - DocumentBuilderFactory.newInstance().newDocumentBuilder() - .newDocument() ); - wordToFoExtractor.processDocument( hwpfDocument ); - - StringWriter stringWriter = new StringWriter(); - - Transformer transformer = TransformerFactory.newInstance() - .newTransformer(); - transformer.setOutputProperty( OutputKeys.INDENT, "yes" ); - transformer.transform( - new DOMSource( wordToFoExtractor.getDocument() ), - new StreamResult( stringWriter ) ); - // no exceptions - } -} diff --git a/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordToHtmlExtractor.java b/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordToHtmlExtractor.java new file mode 100644 index 0000000000..f758e6fe24 --- /dev/null +++ b/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordToHtmlExtractor.java @@ -0,0 +1,95 @@ +/* ==================================================================== + 
Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +==================================================================== */ +package org.apache.poi.hwpf.extractor; + +import java.io.StringWriter; + +import javax.xml.parsers.DocumentBuilderFactory; +import javax.xml.transform.OutputKeys; +import javax.xml.transform.Transformer; +import javax.xml.transform.TransformerFactory; +import javax.xml.transform.dom.DOMSource; +import javax.xml.transform.stream.StreamResult; + +import junit.framework.TestCase; +import org.apache.poi.POIDataSamples; +import org.apache.poi.hwpf.HWPFDocument; + +/** + * Test cases for {@link WordToFoExtractor} + * + * @author Sergey Vladimirov (vlsergey {at} gmail {dot} com) + */ +public class TestWordToHtmlExtractor extends TestCase +{ + private static String getHtmlText( final String sampleFileName ) + throws Exception + { + HWPFDocument hwpfDocument = new HWPFDocument( POIDataSamples + .getDocumentInstance().openResourceAsStream( sampleFileName ) ); + + WordToHtmlExtractor wordToHtmlExtractor = new WordToHtmlExtractor( + DocumentBuilderFactory.newInstance().newDocumentBuilder() + .newDocument() ); + wordToHtmlExtractor.processDocument( hwpfDocument ); + + StringWriter stringWriter = new StringWriter(); + + Transformer transformer = 
TransformerFactory.newInstance() + .newTransformer(); + transformer.setOutputProperty( OutputKeys.INDENT, "yes" ); + transformer.setOutputProperty( OutputKeys.ENCODING, "utf-8" ); + transformer.setOutputProperty( OutputKeys.METHOD, "html" ); + transformer.transform( + new DOMSource( wordToHtmlExtractor.getDocument() ), + new StreamResult( stringWriter ) ); + + String result = stringWriter.toString(); + return result; + } + + public void testBug46610_2() throws Exception + { + String result = getHtmlText( "Bug46610_2.doc" ); + assertTrue( result + .contains( "012345678911234567892123456789312345678941234567890123456789112345678921234567893123456789412345678" ) ); + } + + public void testEquation() throws Exception + { + String result = getHtmlText( "equation.doc" ); + + assertTrue( result + .contains( "" ) ); + } + + public void testHyperlink() throws Exception + { + String result = getHtmlText( "hyperlink.doc" ); + + assertTrue( result.contains( "" ) ); + assertTrue( result.contains( "Hyperlink text" ) ); + } + + public void testPageref() throws Exception + { + String result = getHtmlText( "pageref.doc" ); + + assertTrue( result.contains( "" ) ); + assertTrue( result.contains( "1" ) ); + } +}