|
|
@@ -21,6 +21,7 @@ import java.util.Iterator; |
|
|
|
|
|
|
|
import org.apache.poi.POIXMLDocument; |
|
|
|
import org.apache.poi.POIXMLTextExtractor; |
|
|
|
import org.apache.poi.POIXMLException; |
|
|
|
import org.apache.poi.openxml4j.exceptions.OpenXML4JException; |
|
|
|
import org.apache.poi.openxml4j.opc.OPCPackage; |
|
|
|
import org.apache.poi.xwpf.model.XWPFCommentsDecorator; |
|
|
@@ -31,6 +32,7 @@ import org.apache.poi.xwpf.usermodel.XWPFDocument; |
|
|
|
import org.apache.poi.xwpf.usermodel.XWPFParagraph; |
|
|
|
import org.apache.poi.xwpf.usermodel.XWPFTable; |
|
|
|
import org.apache.xmlbeans.XmlException; |
|
|
|
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSectPr; |
|
|
|
|
|
|
|
/** |
|
|
|
* Helper class to extract text from an OOXML Word file |
|
|
@@ -72,45 +74,77 @@ public class XWPFWordExtractor extends POIXMLTextExtractor { |
|
|
|
public String getText() { |
|
|
|
StringBuffer text = new StringBuffer(); |
|
|
|
XWPFHeaderFooterPolicy hfPolicy = document.getHeaderFooterPolicy(); |
|
|
|
|
|
|
|
|
|
|
|
// Start out with all headers |
|
|
|
// TODO - put them in where they're needed |
|
|
|
if(hfPolicy.getFirstPageHeader() != null) { |
|
|
|
text.append( hfPolicy.getFirstPageHeader().getText() ); |
|
|
|
} |
|
|
|
if(hfPolicy.getEvenPageHeader() != null) { |
|
|
|
text.append( hfPolicy.getEvenPageHeader().getText() ); |
|
|
|
} |
|
|
|
if(hfPolicy.getDefaultHeader() != null) { |
|
|
|
text.append( hfPolicy.getDefaultHeader().getText() ); |
|
|
|
} |
|
|
|
extractHeaders(text, hfPolicy); |
|
|
|
|
|
|
|
// First up, all our paragraph based text |
|
|
|
Iterator<XWPFParagraph> i = document.getParagraphsIterator(); |
|
|
|
while(i.hasNext()) { |
|
|
|
XWPFParagraphDecorator decorator = new XWPFCommentsDecorator( |
|
|
|
new XWPFHyperlinkDecorator(i.next(), null, fetchHyperlinks)); |
|
|
|
text.append(decorator.getText()+"\n"); |
|
|
|
} |
|
|
|
XWPFParagraph paragraph = i.next(); |
|
|
|
|
|
|
|
|
|
|
|
try { |
|
|
|
CTSectPr ctSectPr = null; |
|
|
|
if (paragraph.getCTP().getPPr()!=null) { |
|
|
|
ctSectPr = paragraph.getCTP().getPPr().getSectPr(); |
|
|
|
} |
|
|
|
|
|
|
|
XWPFHeaderFooterPolicy headerFooterPolicy = null; |
|
|
|
|
|
|
|
if (ctSectPr!=null) { |
|
|
|
headerFooterPolicy = new XWPFHeaderFooterPolicy(document, ctSectPr); |
|
|
|
|
|
|
|
extractHeaders(text, headerFooterPolicy); |
|
|
|
} |
|
|
|
|
|
|
|
XWPFParagraphDecorator decorator = new XWPFCommentsDecorator( |
|
|
|
new XWPFHyperlinkDecorator(paragraph, null, fetchHyperlinks)); |
|
|
|
text.append(decorator.getText()).append('\n'); |
|
|
|
|
|
|
|
if (ctSectPr!=null) { |
|
|
|
extractFooters(text, headerFooterPolicy); |
|
|
|
} |
|
|
|
} catch (IOException e) { |
|
|
|
throw new POIXMLException(e); |
|
|
|
} catch (XmlException e) { |
|
|
|
throw new POIXMLException(e); |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
// Then our table based text |
|
|
|
Iterator<XWPFTable> j = document.getTablesIterator(); |
|
|
|
while(j.hasNext()) { |
|
|
|
text.append(j.next().getText()+"\n"); |
|
|
|
text.append(j.next().getText()).append('\n'); |
|
|
|
} |
|
|
|
|
|
|
|
// Finish up with all the footers |
|
|
|
// TODO - put them in where they're needed |
|
|
|
if(hfPolicy.getFirstPageFooter() != null) { |
|
|
|
text.append( hfPolicy.getFirstPageFooter().getText() ); |
|
|
|
} |
|
|
|
if(hfPolicy.getEvenPageFooter() != null) { |
|
|
|
text.append( hfPolicy.getEvenPageFooter().getText() ); |
|
|
|
} |
|
|
|
if(hfPolicy.getDefaultFooter() != null) { |
|
|
|
text.append( hfPolicy.getDefaultFooter().getText() ); |
|
|
|
} |
|
|
|
extractFooters(text, hfPolicy); |
|
|
|
|
|
|
|
return text.toString(); |
|
|
|
} |
|
|
|
|
|
|
|
private void extractFooters(StringBuffer text, XWPFHeaderFooterPolicy hfPolicy) { |
|
|
|
if(hfPolicy.getFirstPageFooter() != null) { |
|
|
|
text.append( hfPolicy.getFirstPageFooter().getText() ); |
|
|
|
} |
|
|
|
if(hfPolicy.getEvenPageFooter() != null) { |
|
|
|
text.append( hfPolicy.getEvenPageFooter().getText() ); |
|
|
|
} |
|
|
|
if(hfPolicy.getDefaultFooter() != null) { |
|
|
|
text.append( hfPolicy.getDefaultFooter().getText() ); |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
private void extractHeaders(StringBuffer text, XWPFHeaderFooterPolicy hfPolicy) { |
|
|
|
if(hfPolicy.getFirstPageHeader() != null) { |
|
|
|
text.append( hfPolicy.getFirstPageHeader().getText() ); |
|
|
|
} |
|
|
|
if(hfPolicy.getEvenPageHeader() != null) { |
|
|
|
text.append( hfPolicy.getEvenPageHeader().getText() ); |
|
|
|
} |
|
|
|
if(hfPolicy.getDefaultHeader() != null) { |
|
|
|
text.append( hfPolicy.getDefaultHeader().getText() ); |
|
|
|
} |
|
|
|
} |
|
|
|
} |