123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223 |
- /* ====================================================================
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
- ==================================================================== */
- package org.apache.poi.xwpf.extractor;
-
- import java.io.IOException;
- import java.util.List;
-
- import org.apache.poi.ooxml.extractor.POIXMLTextExtractor;
- import org.apache.poi.openxml4j.opc.OPCPackage;
- import org.apache.poi.xwpf.model.XWPFCommentsDecorator;
- import org.apache.poi.xwpf.model.XWPFHeaderFooterPolicy;
- import org.apache.poi.xwpf.usermodel.IBodyElement;
- import org.apache.poi.xwpf.usermodel.ICell;
- import org.apache.poi.xwpf.usermodel.IRunElement;
- import org.apache.poi.xwpf.usermodel.XWPFDocument;
- import org.apache.poi.xwpf.usermodel.XWPFHyperlink;
- import org.apache.poi.xwpf.usermodel.XWPFHyperlinkRun;
- import org.apache.poi.xwpf.usermodel.XWPFParagraph;
- import org.apache.poi.xwpf.usermodel.XWPFRelation;
- import org.apache.poi.xwpf.usermodel.XWPFRun;
- import org.apache.poi.xwpf.usermodel.XWPFSDT;
- import org.apache.poi.xwpf.usermodel.XWPFSDTCell;
- import org.apache.poi.xwpf.usermodel.XWPFTable;
- import org.apache.poi.xwpf.usermodel.XWPFTableCell;
- import org.apache.poi.xwpf.usermodel.XWPFTableRow;
- import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSectPr;
-
- /**
- * Helper class to extract text from an OOXML Word file
- */
- public class XWPFWordExtractor implements POIXMLTextExtractor {
- public static final XWPFRelation[] SUPPORTED_TYPES = {
- XWPFRelation.DOCUMENT, XWPFRelation.TEMPLATE,
- XWPFRelation.MACRO_DOCUMENT,
- XWPFRelation.MACRO_TEMPLATE_DOCUMENT
- };
-
- private final XWPFDocument document;
- private boolean fetchHyperlinks;
- private boolean concatenatePhoneticRuns = true;
- private boolean doCloseFilesystem = true;
-
- public XWPFWordExtractor(OPCPackage container) throws IOException {
- this(new XWPFDocument(container));
- }
-
- public XWPFWordExtractor(XWPFDocument document) {
- this.document = document;
- }
-
- /**
- * Should we also fetch the hyperlinks, when fetching
- * the text content? Default is to only output the
- * hyperlink label, and not the contents
- */
- public void setFetchHyperlinks(boolean fetch) {
- fetchHyperlinks = fetch;
- }
-
- /**
- * Should we concatenate phonetic runs in extraction. Default is <code>true</code>
- * @param concatenatePhoneticRuns If phonetic runs should be concatenated
- */
- public void setConcatenatePhoneticRuns(boolean concatenatePhoneticRuns) {
- this.concatenatePhoneticRuns = concatenatePhoneticRuns;
- }
-
- public String getText() {
- StringBuilder text = new StringBuilder(64);
- XWPFHeaderFooterPolicy hfPolicy = document.getHeaderFooterPolicy();
-
- // Start out with all headers
- extractHeaders(text, hfPolicy);
-
- // Process all body elements
- for (IBodyElement e : document.getBodyElements()) {
- appendBodyElementText(text, e);
- text.append('\n');
- }
-
- // Finish up with all the footers
- extractFooters(text, hfPolicy);
-
- return text.toString();
- }
-
- public void appendBodyElementText(StringBuilder text, IBodyElement e) {
- if (e instanceof XWPFParagraph) {
- appendParagraphText(text, (XWPFParagraph) e);
- } else if (e instanceof XWPFTable) {
- appendTableText(text, (XWPFTable) e);
- } else if (e instanceof XWPFSDT) {
- text.append(((XWPFSDT) e).getContent().getText());
- }
- }
-
- public void appendParagraphText(StringBuilder text, XWPFParagraph paragraph) {
- CTSectPr ctSectPr = null;
- if (paragraph.getCTP().getPPr() != null) {
- ctSectPr = paragraph.getCTP().getPPr().getSectPr();
- }
-
- XWPFHeaderFooterPolicy headerFooterPolicy = null;
-
- if (ctSectPr != null) {
- headerFooterPolicy = new XWPFHeaderFooterPolicy(document, ctSectPr);
- extractHeaders(text, headerFooterPolicy);
- }
-
- for (IRunElement run : paragraph.getIRuns()) {
- if (run instanceof XWPFSDT) {
- text.append(((XWPFSDT) run).getContent().getText());
- } else if (! concatenatePhoneticRuns && run instanceof XWPFRun) {
- text.append(((XWPFRun)run).text());
- } else {
- text.append(run);
- }
- if (run instanceof XWPFHyperlinkRun && fetchHyperlinks) {
- XWPFHyperlink link = ((XWPFHyperlinkRun) run).getHyperlink(document);
- if (link != null)
- text.append(" <").append(link.getURL()).append(">");
- }
- }
-
- // Add comments
- XWPFCommentsDecorator decorator = new XWPFCommentsDecorator(paragraph, null);
- String commentText = decorator.getCommentText();
- if (commentText.length() > 0) {
- text.append(commentText).append('\n');
- }
-
- // Do endnotes and footnotes
- String footnameText = paragraph.getFootnoteText();
- if (footnameText != null && footnameText.length() > 0) {
- text.append(footnameText).append('\n');
- }
-
- if (ctSectPr != null) {
- extractFooters(text, headerFooterPolicy);
- }
- }
-
- private void appendTableText(StringBuilder text, XWPFTable table) {
- //this works recursively to pull embedded tables from tables
- for (XWPFTableRow row : table.getRows()) {
- List<ICell> cells = row.getTableICells();
- for (int i = 0; i < cells.size(); i++) {
- ICell cell = cells.get(i);
- if (cell instanceof XWPFTableCell) {
- text.append(((XWPFTableCell) cell).getTextRecursively());
- } else if (cell instanceof XWPFSDTCell) {
- text.append(((XWPFSDTCell) cell).getContent().getText());
- }
- if (i < cells.size() - 1) {
- text.append("\t");
- }
- }
- text.append('\n');
- }
- }
-
- private void extractFooters(StringBuilder text, XWPFHeaderFooterPolicy hfPolicy) {
- if (hfPolicy == null) return;
-
- if (hfPolicy.getFirstPageFooter() != null) {
- text.append(hfPolicy.getFirstPageFooter().getText());
- }
- if (hfPolicy.getEvenPageFooter() != null) {
- text.append(hfPolicy.getEvenPageFooter().getText());
- }
- if (hfPolicy.getDefaultFooter() != null) {
- text.append(hfPolicy.getDefaultFooter().getText());
- }
- }
-
- private void extractHeaders(StringBuilder text, XWPFHeaderFooterPolicy hfPolicy) {
- if (hfPolicy == null) return;
-
- if (hfPolicy.getFirstPageHeader() != null) {
- text.append(hfPolicy.getFirstPageHeader().getText());
- }
- if (hfPolicy.getEvenPageHeader() != null) {
- text.append(hfPolicy.getEvenPageHeader().getText());
- }
- if (hfPolicy.getDefaultHeader() != null) {
- text.append(hfPolicy.getDefaultHeader().getText());
- }
- }
-
- @Override
- public XWPFDocument getDocument() {
- return document;
- }
-
- @Override
- public void setCloseFilesystem(boolean doCloseFilesystem) {
- this.doCloseFilesystem = doCloseFilesystem;
- }
-
- @Override
- public boolean isCloseFilesystem() {
- return doCloseFilesystem;
- }
-
- @Override
- public XWPFDocument getFilesystem() {
- return document;
- }
- }
|