source.dussan.org Git - poi.git/commitdiff
[bug-63575] support capitalized text in XWPFWordExtractor
author PJ Fanning <fanningpj@apache.org>
Sun, 28 Aug 2022 12:19:08 +0000 (12:19 +0000)
committer PJ Fanning <fanningpj@apache.org>
Sun, 28 Aug 2022 12:19:08 +0000 (12:19 +0000)
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1903729 13f79535-47bb-0310-9956-ffa450edef68

poi-ooxml/src/main/java/org/apache/poi/xwpf/usermodel/XWPFRun.java
poi-ooxml/src/test/java/org/apache/poi/xwpf/extractor/TestXWPFWordExtractor.java
test-data/document/capitalized.docx [new file with mode: 0644]

index 13decacd782b6360ec6a4d4356474794b14101d3..5947125249a604c11d732cbae921c48fe8e23f88 100644 (file)
@@ -27,6 +27,7 @@ import java.math.RoundingMode;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.List;
+import java.util.Locale;
 
 import javax.xml.namespace.QName;
 
@@ -35,10 +36,7 @@ import org.apache.poi.ooxml.POIXMLException;
 import org.apache.poi.ooxml.util.DocumentHelper;
 import org.apache.poi.ooxml.util.POIXMLUnits;
 import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
-import org.apache.poi.util.HexDump;
-import org.apache.poi.util.Internal;
-import org.apache.poi.util.Removal;
-import org.apache.poi.util.Units;
+import org.apache.poi.util.*;
 import org.apache.poi.wp.usermodel.CharacterRun;
 import org.apache.xmlbeans.*;
 import org.apache.xmlbeans.impl.values.XmlAnyTypeImpl;
@@ -1381,7 +1379,13 @@ public class XWPFRun implements ISDTContents, IRunElement, CharacterRun {
             //  come up as instances of CTText, but we don't want them
             //  in the normal text output
             if (!("instrText".equals(node.getLocalName()) && XWPFDocument.NS_OOXML_WP_MAIN.equals(node.getNamespaceURI()))) {
-                text.append(((CTText) o).getStringValue());
+                String textValue = ((CTText) o).getStringValue();
+                if (textValue != null) {
+                    if (isCapitalized() || isSmallCaps()) {
+                        textValue = textValue.toUpperCase(LocaleUtil.getUserLocale());
+                    }
+                    text.append(textValue);
+                }
             }
         }
 
@@ -1391,7 +1395,9 @@ public class XWPFRun implements ISDTContents, IRunElement, CharacterRun {
             if (ctfldChar.getFldCharType() == STFldCharType.BEGIN) {
                 if (ctfldChar.getFfData() != null) {
                     for (CTFFCheckBox checkBox : ctfldChar.getFfData().getCheckBoxList()) {
-                        text.append((checkBox.getDefault() != null && POIXMLUnits.parseOnOff(checkBox.getDefault().xgetVal())) ? "|X|" : "|_|");
+                        String textValue = checkBox.getDefault() != null && POIXMLUnits.parseOnOff(checkBox.getDefault().xgetVal()) ?
+                                "|X|" : "|_|";
+                        text.append(textValue);
                     }
                 }
             }
index 6e5716549b93ade9c05a0fcbc1a3f8878ad03c2f..12c6bc97b65a627274cb46b48ac3a1a2a3747d5f 100644 (file)
@@ -478,4 +478,13 @@ class TestXWPFWordExtractor {
             assertEquals(expected, actual);
         }
     }
+
+    @Test
+    void testCapitalizedFlag() throws IOException { // regression test for bug 63575 (capitalized text in the extractor)
+        try (XWPFDocument doc = XWPFTestDataSamples.openSampleDocument("capitalized.docx");
+             XWPFWordExtractor extractor = new XWPFWordExtractor(doc)) { // both resources are closed when the block exits
+            String txt = extractor.getText();
+            assertEquals( "The following word is: CAPITALIZED.", txt.trim()); // word must come out upper-cased; outer whitespace is ignored
+        }
+    }
 }
diff --git a/test-data/document/capitalized.docx b/test-data/document/capitalized.docx
new file mode 100644 (file)
index 0000000..9658e94
Binary files /dev/null and b/test-data/document/capitalized.docx differ