Browse Source

Bug 55966: Include content control text in word extraction also if it is part of a paragraph

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1875802 13f79535-47bb-0310-9956-ffa450edef68
tags/before_ooxml_3rd_edition
Dominik Stadler 4 years ago
parent
commit
da2afc19e2

+ 5
- 4
src/ooxml/java/org/apache/poi/xwpf/extractor/XWPFWordExtractor.java View File

@@ -90,7 +90,7 @@ public class XWPFWordExtractor extends POIXMLTextExtractor {

/**
* Should we concatenate phonetic runs in extraction. Default is <code>true</code>
* @param concatenatePhoneticRuns
* @param concatenatePhoneticRuns If phonetic runs should be concatenated
*/
public void setConcatenatePhoneticRuns(boolean concatenatePhoneticRuns) {
this.concatenatePhoneticRuns = concatenatePhoneticRuns;
@@ -138,9 +138,10 @@ public class XWPFWordExtractor extends POIXMLTextExtractor {
extractHeaders(text, headerFooterPolicy);
}


for (IRunElement run : paragraph.getRuns()) {
if (! concatenatePhoneticRuns && run instanceof XWPFRun) {
for (IRunElement run : paragraph.getIRuns()) {
if (run instanceof XWPFSDT) {
text.append(((XWPFSDT) run).getContent().getText());
} else if (! concatenatePhoneticRuns && run instanceof XWPFRun) {
text.append(((XWPFRun)run).text());
} else {
text.append(run);

+ 27
- 10
src/ooxml/testcases/org/apache/poi/xwpf/extractor/TestXWPFWordExtractor.java View File

@@ -17,6 +17,16 @@

package org.apache.poi.xwpf.extractor;

import org.apache.poi.util.StringUtil;
import org.apache.poi.xwpf.XWPFTestDataSamples;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.junit.Test;

import java.io.IOException;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import static org.apache.poi.POITestCase.assertContains;
import static org.apache.poi.POITestCase.assertEndsWith;
import static org.apache.poi.POITestCase.assertNotContained;
@@ -25,16 +35,6 @@ import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;

import java.io.IOException;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.poi.util.StringUtil;
import org.apache.poi.xwpf.XWPFTestDataSamples;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.junit.Test;

/**
* Tests for XWPFWordExtractor
*/
@@ -460,4 +460,21 @@ public class TestXWPFWordExtractor {
assertContains(txt, "footer 1");
}
}

/**
 * Regression test for bug 55966: opens the sample document {@code 55966.docx}
 * and checks that the text returned by {@link XWPFWordExtractor#getText()}
 * matches the expected text exactly, including the text of content controls
 * that sit inside a paragraph as well as one that forms an entire paragraph.
 *
 * @throws IOException if the sample document cannot be opened or closed
 */
@Test
public void bug55966() throws IOException {
    // Both resources are managed by try-with-resources so the extractor is
    // closed even when getText() or the assertion throws. Resources are closed
    // in reverse declaration order: the extractor first, then the document.
    try (XWPFDocument doc = XWPFTestDataSamples.openSampleDocument("55966.docx");
         XWPFWordExtractor extractedDoc = new XWPFWordExtractor(doc)) {
        String expected = "Content control within a paragraph is here text content from within a paragraph second control with a new\n" +
                "line\n" +
                "\n" +
                "Content control that is the entire paragraph\n";

        String actual = extractedDoc.getText();

        assertEquals(expected, actual);
    }
}
}

BIN
test-data/document/55966.docx View File


Loading…
Cancel
Save