Fix for extraction of paragraphs and sections from headers/footers with XWPFWordExtractor

author Yegor Kozlov <yegor@apache.org>

Mon, 31 Aug 2009 17:02:06 +0000 (17:02 +0000)

committer Yegor Kozlov <yegor@apache.org>

Mon, 31 Aug 2009 17:02:06 +0000 (17:02 +0000)
author Yegor Kozlov <yegor@apache.org>
Mon, 31 Aug 2009 17:02:06 +0000 (17:02 +0000)
committer Yegor Kozlov <yegor@apache.org>
Mon, 31 Aug 2009 17:02:06 +0000 (17:02 +0000)
diff --git a/src/documentation/content/xdocs/status.xml b/src/documentation/content/xdocs/status.xml

index a27f2971d27b39879e09826419b066d55d87a29a..0191867715cd3133db9ba520767e3ac3dc193150 100644 (file)
--- a/src/documentation/content/xdocs/status.xml
+++ b/src/documentation/content/xdocs/status.xml
@@ -33,7 +33,8 @@
  
      <changes>
          <release version="3.5-beta7" date="2009-??-??">
-           <action dev="POI-DEVELOPERS" type="fix">47773 - Support for extraction of header / footer images in HWPF</action>
+           <action dev="POI-DEVELOPERS" type="fix">47773 - Fix for extraction paragraphs and sections from headers/footers with XWPFWordExtractor</action>
+           <action dev="POI-DEVELOPERS" type="fix">47727 - Support for extraction of header / footer images in HWPF</action>
             <action dev="POI-DEVELOPERS" type="fix">moved all test data to a top-level directory</action>
             <action dev="POI-DEVELOPERS" type="add">47721 - Added implementation for INDIRECT()</action>
             <action dev="POI-DEVELOPERS" type="add">45583 - Avoid exception when reading ClipboardData packet in OLE property sets</action>
diff --git a/src/ooxml/java/org/apache/poi/xwpf/extractor/XWPFWordExtractor.java b/src/ooxml/java/org/apache/poi/xwpf/extractor/XWPFWordExtractor.java

index 63059653f7e421098246e6b5703172921e86bca4..2c604b60e4cb196208503b531d254b6d2d6abdfd 100644 (file)
--- a/src/ooxml/java/org/apache/poi/xwpf/extractor/XWPFWordExtractor.java
+++ b/src/ooxml/java/org/apache/poi/xwpf/extractor/XWPFWordExtractor.java
@@ -21,6 +21,7 @@ import java.util.Iterator;
  
  import org.apache.poi.POIXMLDocument;
  import org.apache.poi.POIXMLTextExtractor;
+import org.apache.poi.POIXMLException;
  import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
  import org.apache.poi.openxml4j.opc.OPCPackage;
  import org.apache.poi.xwpf.model.XWPFCommentsDecorator;
@@ -31,6 +32,7 @@ import org.apache.poi.xwpf.usermodel.XWPFDocument;
  import org.apache.poi.xwpf.usermodel.XWPFParagraph;
  import org.apache.poi.xwpf.usermodel.XWPFTable;
  import org.apache.xmlbeans.XmlException;
+import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSectPr;
  
  /**
   * Helper class to extract text from an OOXML Word file
@@ -72,45 +74,77 @@ public class XWPFWordExtractor extends POIXMLTextExtractor {
         public String getText() {
                 StringBuffer text = new StringBuffer();
                 XWPFHeaderFooterPolicy hfPolicy = document.getHeaderFooterPolicy();
-               
+
                 // Start out with all headers
-               // TODO - put them in where they're needed
-               if(hfPolicy.getFirstPageHeader() != null) {
-                       text.append( hfPolicy.getFirstPageHeader().getText() );
-               }
-               if(hfPolicy.getEvenPageHeader() != null) {
-                       text.append( hfPolicy.getEvenPageHeader().getText() );
-               }
-               if(hfPolicy.getDefaultHeader() != null) {
-                       text.append( hfPolicy.getDefaultHeader().getText() );
-               }
+                extractHeaders(text, hfPolicy);
                 
                 // First up, all our paragraph based text
                 Iterator<XWPFParagraph> i = document.getParagraphsIterator();
                 while(i.hasNext()) {
-                       XWPFParagraphDecorator decorator = new XWPFCommentsDecorator(
-                                       new XWPFHyperlinkDecorator(i.next(), null, fetchHyperlinks));
-                       text.append(decorator.getText()+"\n");
-               }
+                        XWPFParagraph paragraph = i.next();
+
+
+                        try {
+                                CTSectPr ctSectPr = null;
+                                if (paragraph.getCTP().getPPr()!=null) {
+                                        ctSectPr = paragraph.getCTP().getPPr().getSectPr();
+                                }
+
+                                XWPFHeaderFooterPolicy headerFooterPolicy = null;
+
+                                if (ctSectPr!=null) {
+                                        headerFooterPolicy = new XWPFHeaderFooterPolicy(document, ctSectPr);
+
+                                        extractHeaders(text, headerFooterPolicy);
+                                }
+
+                                XWPFParagraphDecorator decorator = new XWPFCommentsDecorator(
+                                                new XWPFHyperlinkDecorator(paragraph, null, fetchHyperlinks));
+                                text.append(decorator.getText()).append('\n');
+
+                                if (ctSectPr!=null) {
+                                        extractFooters(text, headerFooterPolicy);
+                                }
+                        } catch (IOException e) {
+                                throw new POIXMLException(e);
+                        } catch (XmlException e) {
+                                throw new POIXMLException(e);
+                        }
+                }
  
                 // Then our table based text
                 Iterator<XWPFTable> j = document.getTablesIterator();
                 while(j.hasNext()) {
-                       text.append(j.next().getText()+"\n");
+                        text.append(j.next().getText()).append('\n');
                 }
                 
                 // Finish up with all the footers
-               // TODO - put them in where they're needed
-               if(hfPolicy.getFirstPageFooter() != null) {
-                       text.append( hfPolicy.getFirstPageFooter().getText() );
-               }
-               if(hfPolicy.getEvenPageFooter() != null) {
-                       text.append( hfPolicy.getEvenPageFooter().getText() );
-               }
-               if(hfPolicy.getDefaultFooter() != null) {
-                       text.append( hfPolicy.getDefaultFooter().getText() );
-               }
+                extractFooters(text, hfPolicy);
                 
                 return text.toString();
         }
+
+        private void extractFooters(StringBuffer text, XWPFHeaderFooterPolicy hfPolicy) {
+                if(hfPolicy.getFirstPageFooter() != null) {
+                        text.append( hfPolicy.getFirstPageFooter().getText() );
+                }
+                if(hfPolicy.getEvenPageFooter() != null) {
+                        text.append( hfPolicy.getEvenPageFooter().getText() );
+                }
+                if(hfPolicy.getDefaultFooter() != null) {
+                        text.append( hfPolicy.getDefaultFooter().getText() );
+                }
+        }
+
+        private void extractHeaders(StringBuffer text, XWPFHeaderFooterPolicy hfPolicy) {
+                if(hfPolicy.getFirstPageHeader() != null) {
+                        text.append( hfPolicy.getFirstPageHeader().getText() );
+                }
+                if(hfPolicy.getEvenPageHeader() != null) {
+                        text.append( hfPolicy.getEvenPageHeader().getText() );
+                }
+                if(hfPolicy.getDefaultHeader() != null) {
+                        text.append( hfPolicy.getDefaultHeader().getText() );
+                }
+        }
  }
diff --git a/src/ooxml/java/org/apache/poi/xwpf/model/XWPFHeaderFooterPolicy.java b/src/ooxml/java/org/apache/poi/xwpf/model/XWPFHeaderFooterPolicy.java

index b6bc5f15eb1441c0576945243881da75033d0242..a0ffb87bf9268e5f2aac9b497f42828ba2b39110 100644 (file)
--- a/src/ooxml/java/org/apache/poi/xwpf/model/XWPFHeaderFooterPolicy.java
+++ b/src/ooxml/java/org/apache/poi/xwpf/model/XWPFHeaderFooterPolicy.java
@@ -83,19 +83,26 @@ public class XWPFHeaderFooterPolicy {
         private XWPFHeader defaultHeader;
         private XWPFFooter defaultFooter;
         
-       
+        /**
+         * Figures out the policy for the given document,
+         *  and creates any header and footer objects
+         *  as required.
+         */
+        public XWPFHeaderFooterPolicy(XWPFDocument doc) throws IOException, XmlException {
+                this(doc, doc.getDocument().getBody().getSectPr());
+        }
+
         /**
          * Figures out the policy for the given document,
          *  and creates any header and footer objects
          *  as required.
          */
-       public XWPFHeaderFooterPolicy(XWPFDocument doc) throws IOException, XmlException {
+       public XWPFHeaderFooterPolicy(XWPFDocument doc, CTSectPr sectPr) throws IOException, XmlException {
                 // Grab what headers and footers have been defined
                 // For now, we don't care about different ranges, as it
                 //  doesn't seem that .docx properly supports that
                 //  feature of the file format yet
                 this.doc = doc;
-               CTSectPr sectPr = doc.getDocument().getBody().getSectPr();
                 for(int i=0; i<sectPr.sizeOfHeaderReferenceArray(); i++) {
                         // Get the header
                         CTHdrFtrRef ref = sectPr.getHeaderReferenceArray(i);
diff --git a/src/ooxml/testcases/org/apache/poi/xwpf/extractor/TestXWPFWordExtractor.java b/src/ooxml/testcases/org/apache/poi/xwpf/extractor/TestXWPFWordExtractor.java

index 29b2a1643b0e97d7e0221a66dcfaab5b195e2e84..a3a4c944f7bed4d8bcf9490ca083e98fd8e99e75 100644 (file)
--- a/src/ooxml/testcases/org/apache/poi/xwpf/extractor/TestXWPFWordExtractor.java
+++ b/src/ooxml/testcases/org/apache/poi/xwpf/extractor/TestXWPFWordExtractor.java
@@ -198,4 +198,13 @@ public class TestXWPFWordExtractor extends TestCase {
          assertTrue(extractor.getText().contains("extremely well"));
      }
  
+    public void testParagraphHeader() {
+        XWPFDocument doc = XWPFTestDataSamples.openSampleDocument("Headers.docx");
+        XWPFWordExtractor extractor = new XWPFWordExtractor(doc);
+
+        assertTrue(extractor.getText().contains("Section 1"));
+        assertTrue(extractor.getText().contains("Section 2"));
+        assertTrue(extractor.getText().contains("Section 3"));
+    }
+
  }
diff --git a/test-data/document/Headers.docx b/test-data/document/Headers.docx

new file mode 100755 (executable)

index 0000000..ab1ec02

Binary files /dev/null and b/test-data/document/Headers.docx differ
author	Yegor Kozlov <yegor@apache.org>
	Mon, 31 Aug 2009 17:02:06 +0000 (17:02 +0000)
committer	Yegor Kozlov <yegor@apache.org>
	Mon, 31 Aug 2009 17:02:06 +0000 (17:02 +0000)
src/documentation/content/xdocs/status.xml		patch \| blob \| history
src/ooxml/java/org/apache/poi/xwpf/extractor/XWPFWordExtractor.java		patch \| blob \| history
src/ooxml/java/org/apache/poi/xwpf/model/XWPFHeaderFooterPolicy.java		patch \| blob \| history
src/ooxml/testcases/org/apache/poi/xwpf/extractor/TestXWPFWordExtractor.java		patch \| blob \| history
test-data/document/Headers.docx	[new file with mode: 0755]	patch \| blob