diff options
author | Dominik Stadler <centic@apache.org> | 2019-01-27 09:57:39 +0000 |
---|---|---|
committer | Dominik Stadler <centic@apache.org> | 2019-01-27 09:57:39 +0000 |
commit | c376d662f92577aa6226e454bb3eb5f113ec817d (patch) | |
tree | 3903acfe8eb5f863bb4fc1dac1218a7e2b177989 /src/testcases | |
parent | e59f9c6461f4e6ee34129970b94c51c4098bd710 (diff) | |
download | poi-c376d662f92577aa6226e454bb3eb5f113ec817d.tar.gz poi-c376d662f92577aa6226e454bb3eb5f113ec817d.zip |
Bug 60405: Add initial support for cetab functions so some macros can be
parsed
Add some function-definitions for Excel 4 Macros and missing functions
found in regression tests
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1852277 13f79535-47bb-0310-9956-ffa450edef68
Diffstat (limited to 'src/testcases')
-rw-r--r-- | src/testcases/org/apache/poi/hssf/extractor/TestExcelExtractor.java | 96 | ||||
-rw-r--r-- | src/testcases/org/apache/poi/ss/formula/function/ExcelCetabFunctionExtractor.java | 378 |
2 files changed, 432 insertions, 42 deletions
diff --git a/src/testcases/org/apache/poi/hssf/extractor/TestExcelExtractor.java b/src/testcases/org/apache/poi/hssf/extractor/TestExcelExtractor.java index fd9c49dc12..ce4f2b7be8 100644 --- a/src/testcases/org/apache/poi/hssf/extractor/TestExcelExtractor.java +++ b/src/testcases/org/apache/poi/hssf/extractor/TestExcelExtractor.java @@ -34,7 +34,6 @@ import org.apache.poi.hssf.usermodel.HSSFWorkbook; import org.apache.poi.poifs.filesystem.DirectoryNode; import org.apache.poi.poifs.filesystem.POIFSFileSystem; import org.apache.poi.util.LocaleUtil; -import org.junit.After; import org.junit.Test; /** @@ -130,50 +129,42 @@ public final class TestExcelExtractor { public void testEventExtractor() throws Exception { // First up, a simple file with string // based formulas in it - EventBasedExcelExtractor extractor1 = null; - try { - extractor1 = new EventBasedExcelExtractor( - new POIFSFileSystem( - HSSFTestDataSamples.openSampleFileStream("SimpleWithFormula.xls") - ) - ); - extractor1.setIncludeSheetNames(true); - - String text = extractor1.getText(); - assertEquals("Sheet1\nreplaceme\nreplaceme\nreplacemereplaceme\nSheet2\nSheet3\n", text); - - extractor1.setIncludeSheetNames(false); - extractor1.setFormulasNotResults(true); - - text = extractor1.getText(); - assertEquals("replaceme\nreplaceme\nCONCATENATE(A1,A2)\n", text); - } finally { - if (extractor1 != null) extractor1.close(); + try (EventBasedExcelExtractor extractor1 = new EventBasedExcelExtractor( + new POIFSFileSystem( + HSSFTestDataSamples.openSampleFileStream("SimpleWithFormula.xls") + ) + )) { + extractor1.setIncludeSheetNames(true); + + String text = extractor1.getText(); + assertEquals("Sheet1\nreplaceme\nreplaceme\nreplacemereplaceme\nSheet2\nSheet3\n", text); + + extractor1.setIncludeSheetNames(false); + extractor1.setFormulasNotResults(true); + + text = extractor1.getText(); + assertEquals("replaceme\nreplaceme\nCONCATENATE(A1,A2)\n", text); } // Now, a slightly longer file with numeric formulas - EventBasedExcelExtractor extractor2 = null; - try { - extractor2 = new EventBasedExcelExtractor( - new POIFSFileSystem( - HSSFTestDataSamples.openSampleFileStream("sumifformula.xls") - ) - ); - - extractor2.setIncludeSheetNames(false); - extractor2.setFormulasNotResults(true); - - String text = extractor2.getText(); - assertEquals( - "1000\t1\tSUMIF(A1:A5,\">4000\",B1:B5)\n" + - "2000\t2\n" + - "3000\t3\n" + - "4000\t4\n" + - "5000\t5\n", - text - ); - } finally { - if (extractor2 != null) extractor2.close(); + try (EventBasedExcelExtractor extractor2 = new EventBasedExcelExtractor( + new POIFSFileSystem( + HSSFTestDataSamples.openSampleFileStream("sumifformula.xls") + ) + )) { + + extractor2.setIncludeSheetNames(false); + extractor2.setFormulasNotResults(true); + + String text = extractor2.getText(); + assertEquals( + "1000\t1\tSUMIF(A1:A5,\">4000\",B1:B5)\n" + + "2000\t2\n" + + "3000\t3\n" + + "4000\t4\n" + + "5000\t5\n", + text + ); } } @@ -372,4 +363,25 @@ public final class TestExcelExtractor { assertContains(txt, "NONBUSINESS"); } } + + @Test + public void test60405a() throws IOException { + //bug 61045. File is govdocs1 626534 + try (ExcelExtractor extractor = createExtractor("60405.xls")) { + String txt = extractor.getText(); + assertContains(txt, "Macro1"); + assertContains(txt, "Macro2"); + } + } + + @Test + public void test60405b() throws IOException { + //bug 61045. File is govdocs1 626534 + try (ExcelExtractor extractor = createExtractor("60405.xls")) { + extractor.setFormulasNotResults(true); + String txt = extractor.getText(); + assertContains(txt, "Macro1"); + assertContains(txt, "Macro2"); + } + } } diff --git a/src/testcases/org/apache/poi/ss/formula/function/ExcelCetabFunctionExtractor.java b/src/testcases/org/apache/poi/ss/formula/function/ExcelCetabFunctionExtractor.java new file mode 100644 index 0000000000..db87e07377 --- /dev/null +++ b/src/testcases/org/apache/poi/ss/formula/function/ExcelCetabFunctionExtractor.java @@ -0,0 +1,378 @@ +/* ==================================================================== + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +==================================================================== */ + +package org.apache.poi.ss.formula.function; + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileInputStream; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.io.OutputStream; +import java.io.PrintStream; +import java.nio.charset.StandardCharsets; +import java.util.Arrays; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Map; +import java.util.Set; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +/** + * This class is not used during normal POI run-time but is used at development time to generate + * the file 'functionMetadataCetab.txt'. There are more than 300 built-in functions in Excel in + * the Cetab and the intention of this class is to make it easier to maintain the metadata, by extracting + * it from a reliable source. + */ +public final class ExcelCetabFunctionExtractor { + + private static final String SOURCE_DOC_FILE_NAME = "functionMetadataCetab-PDF.txt"; + + /** + * For simplicity, the output file is strictly simple ASCII. + * This method detects any unexpected characters. + */ + /* package */ static boolean isSimpleAscii(char c) { + + if (c>=0x21 && c<=0x7E) { + // everything from '!' to '~' (includes letters, digits, punctuation + return true; + } + // some specific whitespace chars below 0x21: + switch(c) { + case ' ': + case '\t': + case '\r': + case '\n': + return true; + } + return false; + } + + + private static final class FunctionData { + // special characters from the ooo document + private static final int CHAR_ELLIPSIS_8230 = 8230; + private static final int CHAR_NDASH_8211 = 8211; + + private final int _index; + private final boolean _hasFootnote; + private final String _name; + private final int _minParams; + private final int _maxParams; + private final String _returnClass; + private final String _paramClasses; + private final boolean _isVolatile; + + public FunctionData(int funcIx, boolean hasFootnote, String funcName, int minParams, int maxParams, + String returnClass, String paramClasses, boolean isVolatile) { + _index = funcIx; + _hasFootnote = hasFootnote; + _name = funcName; + _minParams = minParams; + _maxParams = maxParams; + _returnClass = convertSpecialChars(returnClass); + _paramClasses = convertSpecialChars(paramClasses); + _isVolatile = isVolatile; + } + private static String convertSpecialChars(String ss) { + StringBuilder sb = new StringBuilder(ss.length() + 4); + for(int i=0; i<ss.length(); i++) { + char c = ss.charAt(i); + if (isSimpleAscii(c)) { + sb.append(c); + continue; + } + switch (c) { + case CHAR_NDASH_8211: + sb.append('-'); + continue; + case CHAR_ELLIPSIS_8230: + sb.append("..."); + continue; + } + throw new RuntimeException("bad char (" + ((int)c) + ") in string '" + ss + "'"); + } + return sb.toString(); + } + public int getIndex() { + return _index; + } + public String getName() { + return _name; + } + public boolean hasFootnote() { + return _hasFootnote; + } + public String formatAsDataLine() { + return _index + "\t" + _name + "\t" + _minParams + "\t" + + _maxParams + "\t" + _returnClass + "\t" + _paramClasses + + "\t" + checkMark(_isVolatile) + "\t" + checkMark(_hasFootnote); + } + private static String checkMark(boolean b) { + return b ? "x" : ""; + } + } + + private static final class FunctionDataCollector { + + private final Map<Integer, FunctionData> _allFunctionsByIndex; + private final Map<String, FunctionData> _allFunctionsByName; + private final Set<Integer> _groupFunctionIndexes; + private final Set<String> _groupFunctionNames; + private final PrintStream _ps; + + public FunctionDataCollector(PrintStream ps) { + _ps = ps; + _allFunctionsByIndex = new HashMap<>(); + _allFunctionsByName = new HashMap<>(); + _groupFunctionIndexes = new HashSet<>(); + _groupFunctionNames = new HashSet<>(); + } + + public void addFunction(int funcIx, boolean hasFootnote, String funcName, int minParams, int maxParams, + String returnClass, String paramClasses, String volatileFlagStr) { + boolean isVolatile = volatileFlagStr.length() > 0; + + Integer funcIxKey = Integer.valueOf(funcIx); + if(!_groupFunctionIndexes.add(funcIxKey)) { + throw new RuntimeException("Duplicate function index (" + funcIx + ")"); + } + if(!_groupFunctionNames.add(funcName)) { + throw new RuntimeException("Duplicate function name '" + funcName + "'"); + } + + checkRedefinedFunction(hasFootnote, funcName, funcIxKey); + FunctionData fd = new FunctionData(funcIx, hasFootnote, funcName, + minParams, maxParams, returnClass, paramClasses, isVolatile); + + _allFunctionsByIndex.put(funcIxKey, fd); + _allFunctionsByName.put(funcName, fd); + } + + /** + * Some extra validation here. + * Any function which changes definition will have a footnote in the source document + */ + private void checkRedefinedFunction(boolean hasNote, String funcName, Integer funcIxKey) { + FunctionData fdPrev; + // check by index + fdPrev = _allFunctionsByIndex.get(funcIxKey); + if(fdPrev != null) { + if(!fdPrev.hasFootnote() || !hasNote) { + throw new RuntimeException("changing function [" + + funcIxKey + "] definition without foot-note"); + } + _allFunctionsByName.remove(fdPrev.getName()); + } + // check by name + fdPrev = _allFunctionsByName.get(funcName); + if(fdPrev != null) { + if(!fdPrev.hasFootnote() || !hasNote) { + throw new RuntimeException("changing function '" + + funcName + "' definition without foot-note"); + } + _allFunctionsByIndex.remove(Integer.valueOf(fdPrev.getIndex())); + } + } + + public void endTableGroup(String headingText) { + Integer[] keys = new Integer[_groupFunctionIndexes.size()]; + _groupFunctionIndexes.toArray(keys); + _groupFunctionIndexes.clear(); + _groupFunctionNames.clear(); + Arrays.sort(keys); + + _ps.println("# " + headingText); + for (Integer key : keys) { + FunctionData fd = _allFunctionsByIndex.get(key); + _ps.println(fd.formatAsDataLine()); + } + } + } + + private static final Pattern ID_MATCH = Pattern.compile("0x([\\dA-F]+)"); + private static final Pattern NAME_MATCH = Pattern.compile("([0-9A-Z.]+)"); + private static final Pattern ID_NAME_MATCH = Pattern.compile("0x([\\dA-F]+)\\s+([0-9A-Z.]+)"); + + private static final Set<String> IGNORED_LINES = new HashSet<>(); + static { + IGNORED_LINES.add("[MS-XLS] — v20141018"); + IGNORED_LINES.add("Excel Binary File Format (.xls) Structure"); + IGNORED_LINES.add("Copyright © 2014 Microsoft Corporation."); + IGNORED_LINES.add("Release: October 30, 2014Value Meaning"); + IGNORED_LINES.add("Release: October 30, 2014Value"); + IGNORED_LINES.add("Meaning"); + } + + private static void extractFunctionData(FunctionDataCollector fdc, InputStream is) throws IOException { + try (BufferedReader reader = new BufferedReader(new InputStreamReader(is, StandardCharsets.UTF_8))) { + + String id = null; + String name = null; + while (true) { + String line = reader.readLine(); + if(line == null) { + break; + } + + if(IGNORED_LINES.contains(line) || line.matches("\\d+ / \\d+")) { + continue; + } + + Matcher idMatcher = ID_MATCH.matcher(line); + boolean foundID = idMatcher.matches(); + Matcher nameMatcher = NAME_MATCH.matcher(line); + boolean foundName = nameMatcher.matches(); + Matcher idAndNameMatcher = ID_NAME_MATCH.matcher(line); + boolean foundIDandName = idAndNameMatcher.matches(); + if(foundID && foundName || + foundName && foundIDandName || + foundID && foundIDandName) { + throw new IllegalStateException("Should not find id and name: " + + foundID + "/" + foundName + "/" + foundIDandName + + ", line: " + line); + } + + if(foundID && id != null) { + throw new IllegalStateException("Found ID, but already had one: " + id + ", line: " + line); + } + if(foundName && name != null) { + throw new IllegalStateException("Found name, but already had one: " + name + ", line: " + line); + } + if(foundIDandName && (name != null || id != null)) { + throw new IllegalStateException("Found name and id, but already had one: id: " + id + ", name: " + name + ", line: " + line); + } + + if(foundID) { + id = idMatcher.group(1); + } else if (foundName) { + name = nameMatcher.group(1); + } else if (foundIDandName) { + id = idAndNameMatcher.group(1); + name = idAndNameMatcher.group(2); + // manual override for one function name which contains lowercase characters + } else if(line.equals("VBAActivate")) { + name = line; + } else if (id == null || name == null) { + throw new IllegalStateException("Found params, but had empty id or name, id: " + id + + ", name: " + name + ", line: " + line); + } else { + System.out.println("Found function " + id + " " + name + " " + line); + fdc.addFunction(Integer.parseInt(id, 16), false, name, 0, 0, + "", "", ""); + + id = null; + name = null; + } + } + } + + fdc.endTableGroup(""); + } + /** + * To be sure that no tricky unicode chars make it through to the output file. + */ + private static final class SimpleAsciiOutputStream extends OutputStream { + + private final OutputStream _os; + + public SimpleAsciiOutputStream(OutputStream os) { + _os = os; + } + + @Override + public void write(int b) throws IOException { + checkByte(b); + _os.write(b); + } + + private static void checkByte(int b) { + if (!isSimpleAscii((char)b)) { + throw new RuntimeException("Encountered char (" + b + ") which was not simple ascii as expected"); + } + } + + @Override + public void write(byte[] b, int off, int len) throws IOException { + for (int i = 0; i < len; i++) { + checkByte(b[i + off]); + + } + _os.write(b, off, len); + } + } + + private static void processFile(InputStream input, File outFile) throws IOException { + try (OutputStream os = new SimpleAsciiOutputStream(new FileOutputStream(outFile)); + PrintStream ps = new PrintStream(os, true, "UTF-8")) { + + outputLicenseHeader(ps); + Class<?> genClass = ExcelCetabFunctionExtractor.class; + ps.println("# Created by (" + genClass.getName() + ")"); + // identify the source file + ps.println("# from source file '" + SOURCE_DOC_FILE_NAME + "'"); + ps.println("#"); + ps.println("#Columns: (index, name, minParams, maxParams, returnClass, paramClasses, isVolatile, hasFootnote )"); + ps.println(); + extractFunctionData(new FunctionDataCollector(ps), input); + ps.close(); + + String canonicalOutputFileName = outFile.getCanonicalPath(); + System.out.println("Successfully output to '" + canonicalOutputFileName + "'"); + } + } + + private static void outputLicenseHeader(PrintStream ps) { + String[] lines= { + "Licensed to the Apache Software Foundation (ASF) under one or more", + "contributor license agreements. See the NOTICE file distributed with", + "this work for additional information regarding copyright ownership.", + "The ASF licenses this file to You under the Apache License, Version 2.0", + "(the \"License\"); you may not use this file except in compliance with", + "the License. You may obtain a copy of the License at", + "", + " http://www.apache.org/licenses/LICENSE-2.0", + "", + "Unless required by applicable law or agreed to in writing, software", + "distributed under the License is distributed on an \"AS IS\" BASIS,", + "WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.", + "See the License for the specific language governing permissions and", + "limitations under the License.", + }; + for (String line : lines) { + ps.print("# "); + ps.println(line); + } + ps.println(); + } + + public static void main(String[] args) throws IOException { + if(!new File(SOURCE_DOC_FILE_NAME).exists()) { + throw new IllegalStateException("Did not find file " + SOURCE_DOC_FILE_NAME + " in the resources"); + } + + try (InputStream stream = new FileInputStream(SOURCE_DOC_FILE_NAME)) { + File outFile = new File("functionMetadataCetab.txt"); + + processFile(stream, outFile); + } + } +} |