/* ====================================================================
   Licensed to the Apache Software Foundation (ASF) under one or more
   contributor license agreements.  See the NOTICE file distributed with
   this work for additional information regarding copyright ownership.
   The ASF licenses this file to You under the Apache License, Version 2.0
   (the "License"); you may not use this file except in compliance with
   the License.  You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
==================================================================== */
-
- package org.apache.poi.hssf.extractor;
-
- import static org.apache.poi.POITestCase.assertContains;
- import static org.junit.Assert.assertEquals;
- import static org.junit.Assert.assertNotNull;
- import static org.junit.Assert.assertTrue;
- import static org.junit.Assert.fail;
-
- import java.io.ByteArrayOutputStream;
- import java.io.File;
- import java.io.FileInputStream;
- import java.io.FileNotFoundException;
- import java.io.IOException;
- import java.io.InputStream;
- import java.io.PrintStream;
- import java.security.Permission;
-
- import org.apache.poi.EmptyFileException;
- import org.apache.poi.EncryptedDocumentException;
- import org.apache.poi.POIDataSamples;
- import org.apache.poi.hssf.HSSFTestDataSamples;
- import org.apache.poi.poifs.filesystem.OfficeXmlFileException;
- import org.apache.poi.poifs.filesystem.POIFSFileSystem;
- import org.apache.poi.util.RecordFormatException;
- import org.junit.Test;
-
- /**
- * Unit tests for the Excel 5/95 and Excel 4 (and older) text
- * extractor
- */
- public final class TestOldExcelExtractor {
- private static OldExcelExtractor createExtractor(String sampleFileName) throws IOException {
- File file = HSSFTestDataSamples.getSampleFile(sampleFileName);
- return new OldExcelExtractor(file);
- }
-
- @Test
- public void testSimpleExcel3() throws IOException {
- try (OldExcelExtractor extractor = createExtractor("testEXCEL_3.xls")) {
-
- // Check we can call getText without error
- String text = extractor.getText();
-
- // Check we find a few words we expect in there
- assertContains(text, "Season beginning August");
- assertContains(text, "USDA");
-
- // Check we find a few numbers we expect in there
- assertContains(text, "347");
- assertContains(text, "228");
-
- // Check we find a few string-literal dates in there
- assertContains(text, "1981/82");
-
- // Check the type
- assertEquals(3, extractor.getBiffVersion());
- assertEquals(0x10, extractor.getFileType());
-
- }
- }
-
-
- @Test
- public void testSimpleExcel3NoReading() throws IOException {
- try (OldExcelExtractor extractor = createExtractor("testEXCEL_3.xls")) {
- assertNotNull(extractor);
- }
- }
-
- @Test
- public void testSimpleExcel4() throws IOException {
- try (OldExcelExtractor extractor = createExtractor("testEXCEL_4.xls")) {
-
- // Check we can call getText without error
- String text = extractor.getText();
-
- // Check we find a few words we expect in there
- assertContains(text, "Size");
- assertContains(text, "Returns");
-
- // Check we find a few numbers we expect in there
- assertContains(text, "11");
- assertContains(text, "784");
-
- // Check the type
- assertEquals(4, extractor.getBiffVersion());
- assertEquals(0x10, extractor.getFileType());
-
- }
- }
-
- @Test
- public void testSimpleExcel5() throws IOException {
- for (String ver : new String[] {"5", "95"}) {
- try (OldExcelExtractor extractor = createExtractor("testEXCEL_"+ver+".xls")) {
-
- // Check we can call getText without error
- String text = extractor.getText();
-
- // Check we find a few words we expect in there
- assertContains(text, "Sample Excel");
- assertContains(text, "Written and saved");
-
- // Check we find a few numbers we expect in there
- assertContains(text, "15");
- assertContains(text, "169");
-
- // Check we got the sheet names (new formats only)
- assertContains(text, "Sheet: Feuil3");
-
- // Check the type
- assertEquals(5, extractor.getBiffVersion());
- assertEquals(0x05, extractor.getFileType());
-
- }
- }
- }
-
- @Test
- public void testStrings() throws IOException {
- try (OldExcelExtractor extractor = createExtractor("testEXCEL_4.xls")) {
- String text = extractor.getText();
-
- // Simple strings
- assertContains(text, "Table 10 -- Examination Coverage:");
- assertContains(text, "Recommended and Average Recommended Additional Tax After");
- assertContains(text, "Individual income tax returns, total");
-
- // More complicated strings
- assertContains(text, "$100,000 or more");
- assertContains(text, "S corporation returns, Form 1120S [10,15]");
- assertContains(text, "individual income tax return \u201Cshort forms.\u201D");
-
- // Formula based strings
- // TODO Find some then test
- }
- }
-
- @Test
- public void testFormattedNumbersExcel4() throws IOException {
- try (OldExcelExtractor extractor = createExtractor("testEXCEL_4.xls")) {
- String text = extractor.getText();
-
- // Simple numbers
- assertContains(text, "151");
- assertContains(text, "784");
-
- // Numbers which come from formulas
- assertContains(text, "0.398"); // TODO Rounding
- assertContains(text, "624");
-
- // Formatted numbers
- // TODO
- // assertContains(text, "55,624");
- // assertContains(text, "11,743,477");
- }
- }
-
- @Test
- public void testFormattedNumbersExcel5() throws IOException {
- for (String ver : new String[] {"5", "95"}) {
- try (OldExcelExtractor extractor = createExtractor("testEXCEL_"+ver+".xls")) {
- String text = extractor.getText();
-
- // Simple numbers
- assertContains(text, "1");
-
- // Numbers which come from formulas
- assertContains(text, "13");
- assertContains(text, "169");
-
- // Formatted numbers
- // TODO
- // assertContains(text, "100.00%");
- // assertContains(text, "155.00%");
- // assertContains(text, "1,125");
- // assertContains(text, "189,945");
- // assertContains(text, "1,234,500");
- // assertContains(text, "$169.00");
- // assertContains(text, "$1,253.82");
- }
- }
- }
-
- @Test
- public void testFromFile() throws IOException {
- for (String ver : new String[] {"4", "5", "95"}) {
- String filename = "testEXCEL_"+ver+".xls";
- File f = HSSFTestDataSamples.getSampleFile(filename);
-
- try (OldExcelExtractor extractor = new OldExcelExtractor(f)) {
- String text = extractor.getText();
- assertNotNull(text);
- assertTrue(text.length() > 100);
- }
- }
- }
-
- @Test
- public void testFromInputStream() throws IOException {
- for (String ver : new String[] {"4", "5", "95"}) {
- String filename = "testEXCEL_"+ver+".xls";
- File f = HSSFTestDataSamples.getSampleFile(filename);
-
- try (InputStream stream = new FileInputStream(f);
- OldExcelExtractor extractor = new OldExcelExtractor(stream)) {
- String text = extractor.getText();
- assertNotNull(text);
- assertTrue(text.length() > 100);
- }
- }
- }
-
- @Test(expected=OfficeXmlFileException.class)
- public void testOpenInvalidFile1() throws IOException {
- // a file that exists, but is a different format
- createExtractor("WithVariousData.xlsx").close();
- }
-
-
- @Test(expected=RecordFormatException.class)
- public void testOpenInvalidFile2() throws IOException {
- // a completely different type of file
- createExtractor("48936-strings.txt").close();
- }
-
- @Test(expected=FileNotFoundException.class)
- public void testOpenInvalidFile3() throws IOException {
- // a POIFS file which is not a Workbook
- try (InputStream is = POIDataSamples.getDocumentInstance().openResourceAsStream("47304.doc")) {
- new OldExcelExtractor(is).close();
- }
- }
-
- @Test(expected=EmptyFileException.class)
- public void testOpenNonExistingFile() throws IOException {
- // a file that exists, but is a different format
- new OldExcelExtractor(new File("notexistingfile.xls")).close();
- }
-
- @Test
- public void testInputStream() throws IOException {
- File file = HSSFTestDataSamples.getSampleFile("testEXCEL_3.xls");
- try (InputStream stream = new FileInputStream(file);
- OldExcelExtractor extractor = new OldExcelExtractor(stream);) {
- String text = extractor.getText();
- assertNotNull(text);
- }
- }
-
- @Test
- public void testInputStreamNPOIHeader() throws IOException {
- //TODO: the worksheet names are currently mangled. They're treated
- //as if UTF-16, but they're just ascii. Need to fix this.
- //Is it possible that the leading 0 byte in the worksheet name is a signal
- //that these worksheet names should be interpreted as ascii/1252?
- File file = HSSFTestDataSamples.getSampleFile("FormulaRefs.xls");
- try (InputStream stream = new FileInputStream(file);
- OldExcelExtractor extractor = new OldExcelExtractor(stream)) {
- String text = extractor.getText();
- assertNotNull(text);
- }
- }
-
- @Test
- public void testPOIFSFileSystem() throws IOException {
- File file = HSSFTestDataSamples.getSampleFile("FormulaRefs.xls");
- try (POIFSFileSystem fs = new POIFSFileSystem(file);
- OldExcelExtractor extractor = new OldExcelExtractor(fs)){
- String text = extractor.getText();
- assertNotNull(text);
- }
- }
-
- @Test
- public void testDirectoryNode() throws IOException {
- File file = HSSFTestDataSamples.getSampleFile("FormulaRefs.xls");
- try (POIFSFileSystem fs = new POIFSFileSystem(file);
- OldExcelExtractor extractor = new OldExcelExtractor(fs.getRoot())) {
- String text = extractor.getText();
- assertNotNull(text);
- }
- }
-
- @Test(expected = FileNotFoundException.class)
- public void testDirectoryNodeInvalidFile() throws IOException {
- File file = POIDataSamples.getDocumentInstance().getFile("test.doc");
- try (POIFSFileSystem fs = new POIFSFileSystem(file);
- OldExcelExtractor extractor = new OldExcelExtractor(fs.getRoot())) {
- fail("Should throw exception here");
- }
- }
-
- @Test(expected = ExitException.class)
- public void testMainUsage() throws IOException {
- PrintStream save = System.err;
- SecurityManager sm = System.getSecurityManager();
- System.setSecurityManager(new NoExitSecurityManager());
- try {
- try (ByteArrayOutputStream out = new ByteArrayOutputStream()) {
- PrintStream str = new PrintStream(out, false, "UTF-8");
- System.setErr(str);
- // calls System.exit()
- OldExcelExtractor.main(new String[]{});
- }
- } finally {
- System.setSecurityManager(sm);
- System.setErr(save);
- }
- }
-
- @Test
- public void testMain() throws IOException {
- File file = HSSFTestDataSamples.getSampleFile("testEXCEL_3.xls");
- PrintStream save = System.out;
- try {
- ByteArrayOutputStream out = new ByteArrayOutputStream();
- PrintStream str = new PrintStream(out, false, "UTF-8");
- System.setOut(str);
- OldExcelExtractor.main(new String[] {file.getAbsolutePath()});
- String string = out.toString("UTF-8");
- assertTrue("Had: " + string, string.contains("Table C-13--Lemons"));
- } finally {
- System.setOut(save);
- }
- }
-
- @Test(expected = EncryptedDocumentException.class)
- public void testEncryptionException() throws IOException {
- //test file derives from Common Crawl
- File file = HSSFTestDataSamples.getSampleFile("60284.xls");
-
- try (OldExcelExtractor ex = new OldExcelExtractor(file)) {
- assertEquals(5, ex.getBiffVersion());
- assertEquals(5, ex.getFileType());
- ex.getText();
- }
- }
-
- @Test
- public void testSheetWithNoName() throws IOException {
- File file = HSSFTestDataSamples.getSampleFile("64130.xls");
-
- try (OldExcelExtractor ex = new OldExcelExtractor(file)) {
- assertEquals(5, ex.getBiffVersion());
- assertEquals(5, ex.getFileType());
- assertContains(ex.getText(), "Dawn");
- }
- }
-
- private static class NoExitSecurityManager extends SecurityManager {
- @Override
- public void checkPermission(Permission perm) {
- // allow anything.
- }
- @Override
- public void checkPermission(Permission perm, Object context) {
- // allow anything.
- }
- @Override
- public void checkExit(int status) {
- super.checkExit(status);
- throw new ExitException(status);
- }
- }
-
- private static class ExitException extends SecurityException {
- public final int status;
- public ExitException(int status) {
- super("There is no escape!");
- this.status = status;
- }
- }
- }
|