/* ====================================================================
   Licensed to the Apache Software Foundation (ASF) under one or more
   contributor license agreements. See the NOTICE file distributed with
   this work for additional information regarding copyright ownership.
   The ASF licenses this file to You under the Apache License, Version 2.0
   (the "License"); you may not use this file except in compliance with
   the License. You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
==================================================================== */
-
- package org.apache.poi.hssf.extractor;
-
- import static org.apache.poi.POITestCase.assertContains;
- import static org.apache.poi.POITestCase.assertStartsWith;
- import static org.junit.Assert.assertEquals;
- import static org.junit.Assert.assertNotNull;
- import static org.junit.Assert.assertTrue;
-
- import java.io.File;
- import java.io.IOException;
- import java.util.Locale;
-
- import org.apache.poi.POIDataSamples;
- import org.apache.poi.hssf.HSSFTestDataSamples;
- import org.apache.poi.hssf.record.crypto.Biff8EncryptionKey;
- import org.apache.poi.hssf.usermodel.HSSFWorkbook;
- import org.apache.poi.poifs.filesystem.DirectoryNode;
- import org.apache.poi.poifs.filesystem.POIFSFileSystem;
- import org.apache.poi.util.LocaleUtil;
- import org.junit.After;
- import org.junit.Test;
-
- /**
- *
- */
- public final class TestExcelExtractor {
- // to not affect other tests running in the same JVM
- @After
- public void resetPassword() {
- Biff8EncryptionKey.setCurrentUserPassword(null);
- }
-
- private static ExcelExtractor createExtractor(String sampleFileName) throws IOException {
- File file = HSSFTestDataSamples.getSampleFile(sampleFileName);
- POIFSFileSystem fs = new POIFSFileSystem(file);
- ExcelExtractor extractor = new ExcelExtractor(fs);
- extractor.setFilesystem(fs);
- return extractor;
- }
-
- @Test
- public void testSimple() throws IOException {
- ExcelExtractor extractor = createExtractor("Simple.xls");
-
- try {
- assertEquals("Sheet1\nreplaceMe\nSheet2\nSheet3\n", extractor.getText());
-
- // Now turn off sheet names
- extractor.setIncludeSheetNames(false);
- assertEquals("replaceMe\n", extractor.getText());
- } finally {
- extractor.close();
- }
- }
-
- @Test
- public void testNumericFormula() throws IOException {
-
- ExcelExtractor extractor = createExtractor("sumifformula.xls");
-
- assertEquals(
- "Sheet1\n" +
- "1000\t1\t5\n" +
- "2000\t2\n" +
- "3000\t3\n" +
- "4000\t4\n" +
- "5000\t5\n" +
- "Sheet2\nSheet3\n",
- extractor.getText()
- );
-
- extractor.setFormulasNotResults(true);
-
- assertEquals(
- "Sheet1\n" +
- "1000\t1\tSUMIF(A1:A5,\">4000\",B1:B5)\n" +
- "2000\t2\n" +
- "3000\t3\n" +
- "4000\t4\n" +
- "5000\t5\n" +
- "Sheet2\nSheet3\n",
- extractor.getText()
- );
-
- extractor.close();
- }
-
- @Test
- public void testwithContinueRecords() throws IOException {
-
- ExcelExtractor extractor = createExtractor("StringContinueRecords.xls");
-
- // Has masses of text
- // Until we fixed bug #41064, this would've
- // failed by now
- assertTrue(extractor.getText().length() > 40960);
-
- extractor.close();
- }
-
- @Test
- public void testStringConcat() throws IOException {
-
- ExcelExtractor extractor = createExtractor("SimpleWithFormula.xls");
-
- // Comes out as NaN if treated as a number
- // And as XYZ if treated as a string
- assertEquals("Sheet1\nreplaceme\nreplaceme\nreplacemereplaceme\nSheet2\nSheet3\n", extractor.getText());
-
- extractor.setFormulasNotResults(true);
-
- assertEquals("Sheet1\nreplaceme\nreplaceme\nCONCATENATE(A1,A2)\nSheet2\nSheet3\n", extractor.getText());
-
- extractor.close();
- }
-
- @Test
- public void testStringFormula() throws IOException {
-
- ExcelExtractor extractor = createExtractor("StringFormulas.xls");
-
- // Comes out as NaN if treated as a number
- // And as XYZ if treated as a string
- assertEquals("Sheet1\nXYZ\nSheet2\nSheet3\n", extractor.getText());
-
- extractor.setFormulasNotResults(true);
-
- assertEquals("Sheet1\nUPPER(\"xyz\")\nSheet2\nSheet3\n", extractor.getText());
-
- extractor.close();
- }
-
-
- @Test
- public void testEventExtractor() throws Exception {
- // First up, a simple file with string
- // based formulas in it
- EventBasedExcelExtractor extractor1 = null;
- try {
- extractor1 = new EventBasedExcelExtractor(
- new POIFSFileSystem(
- HSSFTestDataSamples.openSampleFileStream("SimpleWithFormula.xls")
- )
- );
- extractor1.setIncludeSheetNames(true);
-
- String text = extractor1.getText();
- assertEquals("Sheet1\nreplaceme\nreplaceme\nreplacemereplaceme\nSheet2\nSheet3\n", text);
-
- extractor1.setIncludeSheetNames(false);
- extractor1.setFormulasNotResults(true);
-
- text = extractor1.getText();
- assertEquals("replaceme\nreplaceme\nCONCATENATE(A1,A2)\n", text);
- } finally {
- if (extractor1 != null) extractor1.close();
- }
-
- // Now, a slightly longer file with numeric formulas
- EventBasedExcelExtractor extractor2 = null;
- try {
- extractor2 = new EventBasedExcelExtractor(
- new POIFSFileSystem(
- HSSFTestDataSamples.openSampleFileStream("sumifformula.xls")
- )
- );
-
- extractor2.setIncludeSheetNames(false);
- extractor2.setFormulasNotResults(true);
-
- String text = extractor2.getText();
- assertEquals(
- "1000\t1\tSUMIF(A1:A5,\">4000\",B1:B5)\n" +
- "2000\t2\n" +
- "3000\t3\n" +
- "4000\t4\n" +
- "5000\t5\n",
- text
- );
- } finally {
- if (extractor2 != null) extractor2.close();
- }
- }
-
- @Test
- public void testWithComments() throws IOException {
- ExcelExtractor extractor = createExtractor("SimpleWithComments.xls");
- extractor.setIncludeSheetNames(false);
-
- // Check without comments
- assertEquals(
- "1\tone\n" +
- "2\ttwo\n" +
- "3\tthree\n",
- extractor.getText()
- );
-
- // Now with
- extractor.setIncludeCellComments(true);
- assertEquals(
- "1\tone Comment by Yegor Kozlov: Yegor Kozlov: first cell\n" +
- "2\ttwo Comment by Yegor Kozlov: Yegor Kozlov: second cell\n" +
- "3\tthree Comment by Yegor Kozlov: Yegor Kozlov: third cell\n",
- extractor.getText()
- );
-
- extractor.close();
- }
-
- @Test
- public void testWithBlank() throws IOException {
- ExcelExtractor extractor = createExtractor("MissingBits.xls");
- String def = extractor.getText();
- extractor.setIncludeBlankCells(true);
- String padded = extractor.getText();
-
- assertStartsWith(def,
- "Sheet1\n" +
- "&[TAB]\t\n" +
- "Hello\n" +
- "11\t23\n"
- );
-
- assertStartsWith(padded,
- "Sheet1\n" +
- "&[TAB]\t\n" +
- "Hello\n" +
- "11\t\t\t23\n"
- );
-
- extractor.close();
- }
-
- @Test
- public void testFormatting() throws Exception {
- Locale userLocale = LocaleUtil.getUserLocale();
- LocaleUtil.setUserLocale(Locale.ROOT);
- try {
- ExcelExtractor extractor = createExtractor("Formatting.xls");
- extractor.setIncludeBlankCells(false);
- extractor.setIncludeSheetNames(false);
- String text = extractor.getText();
-
- // Note - not all the formats in the file
- // actually quite match what they claim to
- // be, as some are auto-local builtins...
-
- assertStartsWith(text, "Dates, all 24th November 2006\n");
- assertContains(text, "yyyy/mm/dd\t2006/11/24\n");
- assertContains(text, "yyyy-mm-dd\t2006-11-24\n");
- assertContains(text, "dd-mm-yy\t24-11-06\n");
-
- assertContains(text, "nn.nn\t10.52\n");
- assertContains(text, "nn.nnn\t10.520\n");
- assertContains(text, "\u00a3nn.nn\t\u00a310.52\n");
- extractor.close();
- } finally {
- LocaleUtil.setUserLocale(userLocale);
- }
- }
-
- /**
- * Embeded in a non-excel file
- */
- @Test
- public void testWithEmbeded() throws Exception {
- POIFSFileSystem fs = null;
-
- HSSFWorkbook wbA = null, wbB = null;
- ExcelExtractor exA = null, exB = null;
-
- try {
- fs = new POIFSFileSystem(POIDataSamples.getDocumentInstance().getFile("word_with_embeded.doc"));
-
- DirectoryNode objPool = (DirectoryNode) fs.getRoot().getEntry("ObjectPool");
- DirectoryNode dirA = (DirectoryNode) objPool.getEntry("_1269427460");
- DirectoryNode dirB = (DirectoryNode) objPool.getEntry("_1269427461");
-
- wbA = new HSSFWorkbook(dirA, fs, true);
- exA = new ExcelExtractor(wbA);
- wbB = new HSSFWorkbook(dirB, fs, true);
- exB = new ExcelExtractor(wbB);
-
- assertEquals("Sheet1\nTest excel file\nThis is the first file\nSheet2\nSheet3\n", exA.getText());
- assertEquals("Sample Excel", exA.getSummaryInformation().getTitle());
- assertEquals("Sheet1\nAnother excel file\nThis is the second file\nSheet2\nSheet3\n", exB.getText());
- assertEquals("Sample Excel 2", exB.getSummaryInformation().getTitle());
- } finally {
- if (exB != null) exB.close();
- if (wbB != null) wbB.close();
- if (exA != null) exA.close();
- if (wbA != null) wbA.close();
- if (fs != null) fs.close();
- }
- }
-
- /**
- * Excel embeded in excel
- */
- @Test
- public void testWithEmbededInOwn() throws Exception {
- POIDataSamples ssSamples = POIDataSamples.getSpreadSheetInstance();
- POIFSFileSystem fs = null;
- HSSFWorkbook wbA = null, wbB = null;
- ExcelExtractor exA = null, exB = null, ex = null;
-
- try {
- fs = new POIFSFileSystem(ssSamples.getFile("excel_with_embeded.xls"));
-
- DirectoryNode dirA = (DirectoryNode) fs.getRoot().getEntry("MBD0000A3B5");
- DirectoryNode dirB = (DirectoryNode) fs.getRoot().getEntry("MBD0000A3B4");
-
- wbA = new HSSFWorkbook(dirA, fs, true);
- wbB = new HSSFWorkbook(dirB, fs, true);
-
- exA = new ExcelExtractor(wbA);
- exB = new ExcelExtractor(wbB);
- assertEquals("Sheet1\nTest excel file\nThis is the first file\nSheet2\nSheet3\n", exA.getText());
- assertEquals("Sample Excel", exA.getSummaryInformation().getTitle());
-
- assertEquals("Sheet1\nAnother excel file\nThis is the second file\nSheet2\nSheet3\n", exB.getText());
- assertEquals("Sample Excel 2", exB.getSummaryInformation().getTitle());
-
- // And the base file too
- ex = new ExcelExtractor(fs);
- assertEquals("Sheet1\nI have lots of embeded files in me\nSheet2\nSheet3\n", ex.getText());
- assertEquals("Excel With Embeded", ex.getSummaryInformation().getTitle());
- } finally {
- if (ex != null) ex.close();
- if (exB != null) exB.close();
- if (exA != null) exA.close();
- if (wbB != null) wbB.close();
- if (wbA != null) wbA.close();
- if (fs != null) fs.close();
- }
- }
-
- /**
- * Test that we get text from headers and footers
- */
- @Test
- public void test45538() throws IOException {
- String[] files = {
- "45538_classic_Footer.xls", "45538_form_Footer.xls",
- "45538_classic_Header.xls", "45538_form_Header.xls"
- };
- for (String file : files) {
- ExcelExtractor extractor = createExtractor(file);
- String text = extractor.getText();
- assertContains(file, text, "testdoc");
- assertContains(file, text, "test phrase");
- extractor.close();
- }
- }
-
- @Test
- public void testPassword() throws IOException {
- Biff8EncryptionKey.setCurrentUserPassword("password");
- ExcelExtractor extractor = createExtractor("password.xls");
- String text = extractor.getText();
- Biff8EncryptionKey.setCurrentUserPassword(null);
-
- assertContains(text, "ZIP");
- extractor.close();
- }
-
- @Test
- public void testNullPointerException() throws IOException {
- ExcelExtractor extractor = createExtractor("ar.org.apsme.www_Form%20Inscripcion%20Curso%20NO%20Socios.xls");
- assertNotNull(extractor);
- assertNotNull(extractor.getText());
- extractor.close();
- }
-
- @Test
- public void test61045() throws IOException {
- //bug 61045. File is govdocs1 626534
- ExcelExtractor extractor = createExtractor("61045_govdocs1_626534.xls");
- String txt = extractor.getText();
- assertContains(txt, "NONBUSINESS");
- }
-
- }
|