From ead63d1e9cf12436fd534d99a7b8786c0b63e7b1 Mon Sep 17 00:00:00 2001 From: Sergey Vladimirov Date: Thu, 11 Aug 2011 20:50:42 +0000 Subject: [PATCH] simplify API to Word file's part processing, like including page headers / footers into plain text and HTML git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1156823 13f79535-47bb-0310-9956-ffa450edef68 --- .../hwpf/converter/AbstractWordConverter.java | 28 ++++++-- .../hwpf/converter/WordToHtmlConverter.java | 27 ++++---- .../hwpf/converter/WordToTextConverter.java | 24 ++++--- .../poi/hwpf/extractor/WordExtractor.java | 64 +++++++++---------- 4 files changed, 83 insertions(+), 60 deletions(-) diff --git a/src/scratchpad/src/org/apache/poi/hwpf/converter/AbstractWordConverter.java b/src/scratchpad/src/org/apache/poi/hwpf/converter/AbstractWordConverter.java index cc104b84aa..6f35f1d6df 100644 --- a/src/scratchpad/src/org/apache/poi/hwpf/converter/AbstractWordConverter.java +++ b/src/scratchpad/src/org/apache/poi/hwpf/converter/AbstractWordConverter.java @@ -132,6 +132,16 @@ public abstract class AbstractWordConverter private PicturesManager picturesManager; + /** + * Special actions that need to be called after processing complete, like + * updating stylesheets or building document notes list. Usually they are + * called once, but it's okay to call them several times. 
+ */ + protected void afterProcess() + { + // by default no such actions needed + } + protected Triplet getCharacterRunTriplet( CharacterRun characterRun ) { Triplet original = new Triplet(); @@ -594,7 +604,17 @@ public abstract class AbstractWordConverter processDocumentInformation( summaryInformation ); } - processDocumentPart( wordDocument, wordDocument.getRange() ); + final Range docRange = wordDocument.getRange(); + + if ( docRange.numSections() == 1 ) + { + processSingleSection( wordDocument, docRange.getSection( 0 ) ); + afterProcess(); + return; + } + + processDocumentPart( wordDocument, docRange ); + afterProcess(); } protected abstract void processDocumentInformation( @@ -603,12 +623,6 @@ public abstract class AbstractWordConverter protected void processDocumentPart( HWPFDocumentCore wordDocument, final Range range ) { - if ( range.numSections() == 1 ) - { - processSingleSection( wordDocument, range.getSection( 0 ) ); - return; - } - for ( int s = 0; s < range.numSections(); s++ ) { processSection( wordDocument, range.getSection( s ), s ); diff --git a/src/scratchpad/src/org/apache/poi/hwpf/converter/WordToHtmlConverter.java b/src/scratchpad/src/org/apache/poi/hwpf/converter/WordToHtmlConverter.java index e18efbe2af..59e20e1a07 100644 --- a/src/scratchpad/src/org/apache/poi/hwpf/converter/WordToHtmlConverter.java +++ b/src/scratchpad/src/org/apache/poi/hwpf/converter/WordToHtmlConverter.java @@ -180,6 +180,15 @@ public class WordToHtmlConverter extends AbstractWordConverter this.htmlDocumentFacade = new HtmlDocumentFacade( document ); } + @Override + protected void afterProcess() + { + if ( notes != null ) + htmlDocumentFacade.getBody().appendChild( notes ); + + htmlDocumentFacade.updateStylesheet(); + } + public Document getDocument() { return htmlDocumentFacade.getDocument(); @@ -241,17 +250,6 @@ public class WordToHtmlConverter extends AbstractWordConverter processCharacters( wordDocument, currentTableLevel, range, parent ); } - @Override - public void 
processDocument( HWPFDocumentCore wordDocument ) - { - super.processDocument( wordDocument ); - - if ( notes != null ) - htmlDocumentFacade.getBody().appendChild( notes ); - - htmlDocumentFacade.updateStylesheet(); - } - @Override protected void processDocumentInformation( SummaryInformation summaryInformation ) @@ -270,6 +268,13 @@ public class WordToHtmlConverter extends AbstractWordConverter .addDescription( summaryInformation.getComments() ); } + @Override + public void processDocumentPart( HWPFDocumentCore wordDocument, Range range ) + { + super.processDocumentPart( wordDocument, range ); + afterProcess(); + } + @Override protected void processDrawnObject( HWPFDocument doc, CharacterRun characterRun, OfficeDrawing officeDrawing, diff --git a/src/scratchpad/src/org/apache/poi/hwpf/converter/WordToTextConverter.java b/src/scratchpad/src/org/apache/poi/hwpf/converter/WordToTextConverter.java index 7e60f37f56..f3f921f5b7 100644 --- a/src/scratchpad/src/org/apache/poi/hwpf/converter/WordToTextConverter.java +++ b/src/scratchpad/src/org/apache/poi/hwpf/converter/WordToTextConverter.java @@ -166,6 +166,13 @@ public class WordToTextConverter extends AbstractWordConverter this.textDocumentFacade = new TextDocumentFacade( document ); } + @Override + protected void afterProcess() + { + if ( notes != null ) + textDocumentFacade.getBody().appendChild( notes ); + } + public Document getDocument() { return textDocumentFacade.getDocument(); @@ -208,15 +215,6 @@ public class WordToTextConverter extends AbstractWordConverter processCharacters( wordDocument, currentTableLevel, range, currentBlock ); } - @Override - public void processDocument( HWPFDocumentCore wordDocument ) - { - super.processDocument( wordDocument ); - - if ( notes != null ) - textDocumentFacade.getBody().appendChild( notes ); - } - @Override protected void processDocumentInformation( SummaryInformation summaryInformation ) @@ -241,6 +239,14 @@ public class WordToTextConverter extends AbstractWordConverter } } 
+ @Override + public void processDocumentPart( HWPFDocumentCore wordDocument, + Range range ) + { + super.processDocumentPart( wordDocument, range ); + afterProcess(); + } + @Override protected void processDrawnObject( HWPFDocument doc, CharacterRun characterRun, OfficeDrawing officeDrawing, diff --git a/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java b/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java index 8438df4f04..dc06fb9262 100644 --- a/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java +++ b/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java @@ -20,14 +20,10 @@ package org.apache.poi.hwpf.extractor; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; -import java.io.StringWriter; -import java.util.ArrayList; -import java.util.Arrays; - -import org.apache.poi.hwpf.converter.WordToTextConverter; import org.apache.poi.POIOLE2TextExtractor; import org.apache.poi.hwpf.HWPFDocument; +import org.apache.poi.hwpf.converter.WordToTextConverter; import org.apache.poi.hwpf.usermodel.HeaderStories; import org.apache.poi.hwpf.usermodel.Paragraph; import org.apache.poi.hwpf.usermodel.Range; @@ -72,6 +68,7 @@ public final class WordExtractor extends POIOLE2TextExtractor * @deprecated Use {@link #WordExtractor(DirectoryNode)} instead */ @Deprecated + @SuppressWarnings( "unused" ) public WordExtractor( DirectoryNode dir, POIFSFileSystem fs ) throws IOException { @@ -290,34 +287,35 @@ public final class WordExtractor extends POIOLE2TextExtractor { try { - final StringWriter stringWriter = new StringWriter(); - @SuppressWarnings( "unused" ) - WordToTextConverter wordToTextConverter = new WordToTextConverter() - { - { - HeaderStories hs = new HeaderStories( doc ); - - if ( hs.getFirstHeaderSubrange() != null ) - processDocumentPart( doc, hs.getFirstHeaderSubrange() ); - if ( hs.getEvenHeaderSubrange() != null ) - processDocumentPart( doc, hs.getEvenHeaderSubrange() ); - 
if ( hs.getOddHeaderSubrange() != null ) - processDocumentPart( doc, hs.getOddHeaderSubrange() ); - - processDocument( doc ); - processDocumentPart( doc, doc.getMainTextboxRange() ); - - if ( hs.getFirstFooterSubrange() != null ) - processDocumentPart( doc, hs.getFirstFooterSubrange() ); - if ( hs.getEvenFooterSubrange() != null ) - processDocumentPart( doc, hs.getEvenFooterSubrange() ); - if ( hs.getOddFooterSubrange() != null ) - processDocumentPart( doc, hs.getOddFooterSubrange() ); - - stringWriter.append( getText() ); - } - }; - return stringWriter.toString(); + WordToTextConverter wordToTextConverter = new WordToTextConverter(); + + HeaderStories hs = new HeaderStories( doc ); + + if ( hs.getFirstHeaderSubrange() != null ) + wordToTextConverter.processDocumentPart( doc, + hs.getFirstHeaderSubrange() ); + if ( hs.getEvenHeaderSubrange() != null ) + wordToTextConverter.processDocumentPart( doc, + hs.getEvenHeaderSubrange() ); + if ( hs.getOddHeaderSubrange() != null ) + wordToTextConverter.processDocumentPart( doc, + hs.getOddHeaderSubrange() ); + + wordToTextConverter.processDocument( doc ); + wordToTextConverter.processDocumentPart( doc, + doc.getMainTextboxRange() ); + + if ( hs.getFirstFooterSubrange() != null ) + wordToTextConverter.processDocumentPart( doc, + hs.getFirstFooterSubrange() ); + if ( hs.getEvenFooterSubrange() != null ) + wordToTextConverter.processDocumentPart( doc, + hs.getEvenFooterSubrange() ); + if ( hs.getOddFooterSubrange() != null ) + wordToTextConverter.processDocumentPart( doc, + hs.getOddFooterSubrange() ); + + return wordToTextConverter.getText(); } catch ( Exception exc ) { -- 2.39.5