<release version="3.7-SNAPSHOT" date="2010-??-??">
+ <action dev="POI-DEVELOPERS" type="fix">49189 - Detect w:tab and w:cr entries in XWPF paragraphs, even when the XSD is silly and maps them to CTEmpty</action>
<action dev="POI-DEVELOPERS" type="fix">49273 - Correct handling for Font Character Sets with indices greater than 127</action>
<action dev="POI-DEVELOPERS" type="add">49334 - Track the ValueRangeRecords of charts in HSSFChart, to allow the basic axis operations</action>
<action dev="POI-DEVELOPERS" type="add">49242 - Track the LinkDataRecords of charts in HSSFChart</action>
import org.apache.xmlbeans.XmlObject;
import org.apache.poi.util.Internal;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTBorder;
+import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTEmpty;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTFtnEdnRef;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTInd;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTJc;
if (o instanceof CTPTab) {
+ if (o instanceof CTEmpty) {
+ // Some inline text elements get returned not as
+ // themselves, but as CTEmpty, owing to some odd
+ // definitions around line 5642 of the XSDs
+ String tagName = o.getDomNode().getNodeName();
+ if ("w:tab".equals(tagName)) {
+ text.append("\t");
+ }
+ if ("w:cr".equals(tagName)) {
+ text.append("\n");
+ }
+ }
//got a reference to a footnote
if (o instanceof CTFtnEdnRef) {
CTFtnEdnRef ftn = (CTFtnEdnRef) o;
assertTrue(extractor.getText().contains("(120 "));
+ /**
+ * Test that we handle things like tabs and
+ * carriage returns properly in the text that
+ * we're extracting (bug #49189)
+ */
+ public void testDocTabs() {
+ XWPFDocument doc = XWPFTestDataSamples.openSampleDocument("WithTabs.docx");
+ XWPFWordExtractor extractor = new XWPFWordExtractor(doc);
+ // Check bits
+ assertTrue(extractor.getText().contains("a"));
+ assertTrue(extractor.getText().contains("\t"));
+ assertTrue(extractor.getText().contains("b"));
+ // Now check the first paragraph in total
+ assertTrue(extractor.getText().contains("a\tb\n"));
+ }