]> source.dussan.org Git - poi.git/commitdiff
51519 -- allow users to ignore or include the <rPh> (phonetic run) element in the...
authorTim Allison <tallison@apache.org>
Wed, 8 Mar 2017 13:41:07 +0000 (13:41 +0000)
committerTim Allison <tallison@apache.org>
Wed, 8 Mar 2017 13:41:07 +0000 (13:41 +0000)
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1785965 13f79535-47bb-0310-9956-ffa450edef68

src/ooxml/java/org/apache/poi/xssf/eventusermodel/ReadOnlySharedStringsTable.java
src/ooxml/testcases/org/apache/poi/xssf/eventusermodel/TestReadOnlySharedStringsTable.java
src/ooxml/testcases/org/apache/poi/xssf/extractor/TestXSSFExcelExtractor.java
test-data/spreadsheet/51519.xlsx [new file with mode: 0644]

index 4de27401dbab44806d12dcc88bdcdcb8acd442a8..47865a86eb8ba1f9c0ea706b4f5da02a19c9f03c 100644 (file)
@@ -18,13 +18,14 @@ package org.apache.poi.xssf.eventusermodel;
 
 import static org.apache.poi.xssf.usermodel.XSSFRelation.NS_SPREADSHEETML;
 
+import javax.xml.parsers.ParserConfigurationException;
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.PushbackInputStream;
 import java.util.ArrayList;
+import java.util.HashMap;
 import java.util.List;
-
-import javax.xml.parsers.ParserConfigurationException;
+import java.util.Map;
 
 import org.apache.poi.openxml4j.opc.OPCPackage;
 import org.apache.poi.openxml4j.opc.PackagePart;
@@ -95,6 +96,12 @@ public class ReadOnlySharedStringsTable extends DefaultHandler {
      */
     private List<String> strings;
 
+    /**
+     * Map of phonetic strings (if they exist) indexed
+     * with the integer matching the index in strings
+     */
+    private Map<Integer, String> phoneticStrings;
+
     /**
      * @param pkg The {@link OPCPackage} to use as basis for the shared-strings table.
      * @throws IOException If reading the data from the package fails.
@@ -177,6 +184,22 @@ public class ReadOnlySharedStringsTable extends DefaultHandler {
         return strings.get(idx);
     }
 
+    /**
+     * Return the phonetic string at a given index.
+     * Returns <code>null</code> if no phonetic string
+     * exists at that index.
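+     * @param idx index of the entry in the shared strings list
+     * @return the phonetic string recorded for that entry, or <code>null</code> if none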
+     */
+    public String getPhoneticStringAt(int idx) {
+        //avoid an NPE.  If the parser hasn't
+        //yet hit <sst/> phoneticStrings could be null
+        if (phoneticStrings == null) {
+            return null;
+        }
+        return phoneticStrings.get(idx);
+    }
+
     public List<String> getItems() {
         return strings;
     }
@@ -184,14 +207,16 @@ public class ReadOnlySharedStringsTable extends DefaultHandler {
     //// ContentHandler methods ////
 
     private StringBuffer characters;
+    private StringBuffer rphCharacters;
     private boolean tIsOpen;
+    private boolean inRPh;
 
     public void startElement(String uri, String localName, String name,
                              Attributes attributes) throws SAXException {
         if (uri != null && ! uri.equals(NS_SPREADSHEETML)) {
             return;
         }
-        
+
         if ("sst".equals(localName)) {
             String count = attributes.getValue("count");
             if(count != null) this.count = Integer.parseInt(count);
@@ -199,12 +224,15 @@ public class ReadOnlySharedStringsTable extends DefaultHandler {
             if(uniqueCount != null) this.uniqueCount = Integer.parseInt(uniqueCount);
 
             this.strings = new ArrayList<String>(this.uniqueCount);
-
+            this.phoneticStrings = new HashMap<Integer, String>();
             characters = new StringBuffer();
+            rphCharacters = new StringBuffer();
         } else if ("si".equals(localName)) {
             characters.setLength(0);
         } else if ("t".equals(localName)) {
             tIsOpen = true;
+        } else if ("rPh".equals(localName)) {
+            inRPh = true;
         }
     }
 
@@ -213,11 +241,17 @@ public class ReadOnlySharedStringsTable extends DefaultHandler {
         if (uri != null && ! uri.equals(NS_SPREADSHEETML)) {
             return;
         }
-        
+
         if ("si".equals(localName)) {
             strings.add(characters.toString());
+            if (rphCharacters.length() > 0) {
+                phoneticStrings.put(strings.size()-1, rphCharacters.toString());
+                rphCharacters.setLength(0);
+            }
         } else if ("t".equals(localName)) {
-           tIsOpen = false;
+            tIsOpen = false;
+        } else if ("rPh".equals(localName)) {
+            inRPh = false;
         }
     }
 
@@ -226,8 +260,12 @@ public class ReadOnlySharedStringsTable extends DefaultHandler {
      */
     public void characters(char[] ch, int start, int length)
             throws SAXException {
-        if (tIsOpen)
-            characters.append(ch, start, length);
+        if (tIsOpen) {
+            if (inRPh) {
+                rphCharacters.append(ch, start, length);
+            } else {
+                characters.append(ch, start, length);
+            }
+        }
     }
-
 }
index 98b51b13134a45b5399a93017bfedce7b400ee31..060c5a80dd5197fd1d21d4879a6b131da2dce689 100644 (file)
 
 package org.apache.poi.xssf.eventusermodel;
 
-import junit.framework.TestCase;
+import java.io.IOException;
+import java.util.List;
+import java.util.regex.Pattern;
 
+import junit.framework.TestCase;
 import org.apache.poi.POIDataSamples;
 import org.apache.poi.openxml4j.opc.OPCPackage;
 import org.apache.poi.openxml4j.opc.PackagePart;
@@ -29,10 +32,6 @@ import org.apache.poi.xssf.usermodel.XSSFWorkbook;
 import org.openxmlformats.schemas.spreadsheetml.x2006.main.CTRst;
 import org.xml.sax.SAXException;
 
-import java.io.IOException;
-import java.util.List;
-import java.util.regex.Pattern;
-
 /**
  * Tests for {@link org.apache.poi.xssf.eventusermodel.XSSFReader}
  */
@@ -59,7 +58,22 @@ public final class TestReadOnlySharedStringsTable extends TestCase {
         }
 
        }
-    
+
+       public void testPhoneticRuns() throws Exception {
+        OPCPackage pkg = OPCPackage.open(_ssTests.openResourceAsStream("51519.xlsx"));
+        List<PackagePart> parts = pkg.getPartsByName(Pattern.compile("/xl/sharedStrings.xml"));
+        assertEquals(1, parts.size());
+
+        ReadOnlySharedStringsTable rtbl = new ReadOnlySharedStringsTable(parts.get(0));
+        List<String> strings = rtbl.getItems();
+        assertEquals(49, strings.size());
+
+        assertEquals("\u30B3\u30E1\u30F3\u30C8", rtbl.getEntryAt(0));
+        assertNull(rtbl.getPhoneticStringAt(0));
+        assertEquals("\u65E5\u672C\u30AA\u30E9\u30AF\u30EB", rtbl.getEntryAt(3));
+        assertEquals("\u30CB\u30DB\u30F3", rtbl.getPhoneticStringAt(3));
+    }
+
     public void testEmptySSTOnPackageObtainedViaWorkbook() throws Exception {
         XSSFWorkbook wb = new XSSFWorkbook(_ssTests.openResourceAsStream("noSharedStringTable.xlsx"));
         OPCPackage pkg = wb.getPackage();
index 89a4e4047e1760f1d308aa376115305ff2461b1c..d82ac62054168a88167b3179ebd087f4881b22d9 100644 (file)
@@ -22,7 +22,6 @@ import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 
 import junit.framework.TestCase;
-
 import org.apache.poi.POITextExtractor;
 import org.apache.poi.hssf.HSSFTestDataSamples;
 import org.apache.poi.hssf.extractor.ExcelExtractor;
@@ -226,4 +225,18 @@ public class TestXSSFExcelExtractor extends TestCase {
                extractor.close();
            }
        }
+
+       public void testPhoneticRuns() throws Exception {
+               XSSFExcelExtractor extractor = getExtractor("51519.xlsx");
+               try {
+                       String text = extractor.getText();
+                       assertTrue(text.contains("\u8C4A\u7530"));
+                       //this shows up only as a phonetic run and should not appear
+                       //in the extracted text
+                       assertFalse(text.contains("\u30CB\u30DB\u30F3"));
+               } finally {
+                       extractor.close();
+               }
+
+       }
 }
diff --git a/test-data/spreadsheet/51519.xlsx b/test-data/spreadsheet/51519.xlsx
new file mode 100644 (file)
index 0000000..ba56dc6
Binary files /dev/null and b/test-data/spreadsheet/51519.xlsx differ