You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

TestExcelExtractor.java 12KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400
  1. /* ====================================================================
  2. Licensed to the Apache Software Foundation (ASF) under one or more
  3. contributor license agreements. See the NOTICE file distributed with
  4. this work for additional information regarding copyright ownership.
  5. The ASF licenses this file to You under the Apache License, Version 2.0
  6. (the "License"); you may not use this file except in compliance with
  7. the License. You may obtain a copy of the License at
  8. http://www.apache.org/licenses/LICENSE-2.0
  9. Unless required by applicable law or agreed to in writing, software
  10. distributed under the License is distributed on an "AS IS" BASIS,
  11. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. See the License for the specific language governing permissions and
  13. limitations under the License.
  14. ==================================================================== */
  15. package org.apache.poi.hssf.extractor;
  16. import static org.apache.poi.POITestCase.assertContains;
  17. import static org.apache.poi.POITestCase.assertStartsWith;
  18. import static org.junit.Assert.assertEquals;
  19. import static org.junit.Assert.assertNotNull;
  20. import static org.junit.Assert.assertTrue;
  21. import java.io.File;
  22. import java.io.IOException;
  23. import java.util.Locale;
  24. import org.apache.poi.POIDataSamples;
  25. import org.apache.poi.hssf.HSSFTestDataSamples;
  26. import org.apache.poi.hssf.record.crypto.Biff8EncryptionKey;
  27. import org.apache.poi.hssf.usermodel.HSSFWorkbook;
  28. import org.apache.poi.poifs.filesystem.DirectoryNode;
  29. import org.apache.poi.poifs.filesystem.POIFSFileSystem;
  30. import org.apache.poi.util.LocaleUtil;
  31. import org.junit.After;
  32. import org.junit.Test;
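  /**
   * Tests text extraction from sample .xls files using {@link ExcelExtractor}
   * and {@link EventBasedExcelExtractor}.
   */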
  36. public final class TestExcelExtractor {
  37. // to not affect other tests running in the same JVM
  38. @After
  39. public void resetPassword() {
  40. Biff8EncryptionKey.setCurrentUserPassword(null);
  41. }
  42. private static ExcelExtractor createExtractor(String sampleFileName) throws IOException {
  43. File file = HSSFTestDataSamples.getSampleFile(sampleFileName);
  44. POIFSFileSystem fs = new POIFSFileSystem(file);
  45. ExcelExtractor extractor = new ExcelExtractor(fs);
  46. extractor.setFilesystem(fs);
  47. return extractor;
  48. }
  49. @Test
  50. public void testSimple() throws IOException {
  51. ExcelExtractor extractor = createExtractor("Simple.xls");
  52. try {
  53. assertEquals("Sheet1\nreplaceMe\nSheet2\nSheet3\n", extractor.getText());
  54. // Now turn off sheet names
  55. extractor.setIncludeSheetNames(false);
  56. assertEquals("replaceMe\n", extractor.getText());
  57. } finally {
  58. extractor.close();
  59. }
  60. }
  61. @Test
  62. public void testNumericFormula() throws IOException {
  63. ExcelExtractor extractor = createExtractor("sumifformula.xls");
  64. assertEquals(
  65. "Sheet1\n" +
  66. "1000\t1\t5\n" +
  67. "2000\t2\n" +
  68. "3000\t3\n" +
  69. "4000\t4\n" +
  70. "5000\t5\n" +
  71. "Sheet2\nSheet3\n",
  72. extractor.getText()
  73. );
  74. extractor.setFormulasNotResults(true);
  75. assertEquals(
  76. "Sheet1\n" +
  77. "1000\t1\tSUMIF(A1:A5,\">4000\",B1:B5)\n" +
  78. "2000\t2\n" +
  79. "3000\t3\n" +
  80. "4000\t4\n" +
  81. "5000\t5\n" +
  82. "Sheet2\nSheet3\n",
  83. extractor.getText()
  84. );
  85. extractor.close();
  86. }
  87. @Test
  88. public void testwithContinueRecords() throws IOException {
  89. ExcelExtractor extractor = createExtractor("StringContinueRecords.xls");
  90. // Has masses of text
  91. // Until we fixed bug #41064, this would've
  92. // failed by now
  93. assertTrue(extractor.getText().length() > 40960);
  94. extractor.close();
  95. }
  96. @Test
  97. public void testStringConcat() throws IOException {
  98. ExcelExtractor extractor = createExtractor("SimpleWithFormula.xls");
  99. // Comes out as NaN if treated as a number
  100. // And as XYZ if treated as a string
  101. assertEquals("Sheet1\nreplaceme\nreplaceme\nreplacemereplaceme\nSheet2\nSheet3\n", extractor.getText());
  102. extractor.setFormulasNotResults(true);
  103. assertEquals("Sheet1\nreplaceme\nreplaceme\nCONCATENATE(A1,A2)\nSheet2\nSheet3\n", extractor.getText());
  104. extractor.close();
  105. }
  106. @Test
  107. public void testStringFormula() throws IOException {
  108. ExcelExtractor extractor = createExtractor("StringFormulas.xls");
  109. // Comes out as NaN if treated as a number
  110. // And as XYZ if treated as a string
  111. assertEquals("Sheet1\nXYZ\nSheet2\nSheet3\n", extractor.getText());
  112. extractor.setFormulasNotResults(true);
  113. assertEquals("Sheet1\nUPPER(\"xyz\")\nSheet2\nSheet3\n", extractor.getText());
  114. extractor.close();
  115. }
  116. @Test
  117. public void testEventExtractor() throws Exception {
  118. // First up, a simple file with string
  119. // based formulas in it
  120. EventBasedExcelExtractor extractor1 = null;
  121. try {
  122. extractor1 = new EventBasedExcelExtractor(
  123. new POIFSFileSystem(
  124. HSSFTestDataSamples.openSampleFileStream("SimpleWithFormula.xls")
  125. )
  126. );
  127. extractor1.setIncludeSheetNames(true);
  128. String text = extractor1.getText();
  129. assertEquals("Sheet1\nreplaceme\nreplaceme\nreplacemereplaceme\nSheet2\nSheet3\n", text);
  130. extractor1.setIncludeSheetNames(false);
  131. extractor1.setFormulasNotResults(true);
  132. text = extractor1.getText();
  133. assertEquals("replaceme\nreplaceme\nCONCATENATE(A1,A2)\n", text);
  134. } finally {
  135. if (extractor1 != null) extractor1.close();
  136. }
  137. // Now, a slightly longer file with numeric formulas
  138. EventBasedExcelExtractor extractor2 = null;
  139. try {
  140. extractor2 = new EventBasedExcelExtractor(
  141. new POIFSFileSystem(
  142. HSSFTestDataSamples.openSampleFileStream("sumifformula.xls")
  143. )
  144. );
  145. extractor2.setIncludeSheetNames(false);
  146. extractor2.setFormulasNotResults(true);
  147. String text = extractor2.getText();
  148. assertEquals(
  149. "1000\t1\tSUMIF(A1:A5,\">4000\",B1:B5)\n" +
  150. "2000\t2\n" +
  151. "3000\t3\n" +
  152. "4000\t4\n" +
  153. "5000\t5\n",
  154. text
  155. );
  156. } finally {
  157. if (extractor2 != null) extractor2.close();
  158. }
  159. }
  160. @Test
  161. public void testWithComments() throws IOException {
  162. ExcelExtractor extractor = createExtractor("SimpleWithComments.xls");
  163. extractor.setIncludeSheetNames(false);
  164. // Check without comments
  165. assertEquals(
  166. "1\tone\n" +
  167. "2\ttwo\n" +
  168. "3\tthree\n",
  169. extractor.getText()
  170. );
  171. // Now with
  172. extractor.setIncludeCellComments(true);
  173. assertEquals(
  174. "1\tone Comment by Yegor Kozlov: Yegor Kozlov: first cell\n" +
  175. "2\ttwo Comment by Yegor Kozlov: Yegor Kozlov: second cell\n" +
  176. "3\tthree Comment by Yegor Kozlov: Yegor Kozlov: third cell\n",
  177. extractor.getText()
  178. );
  179. extractor.close();
  180. }
  181. @Test
  182. public void testWithBlank() throws IOException {
  183. ExcelExtractor extractor = createExtractor("MissingBits.xls");
  184. String def = extractor.getText();
  185. extractor.setIncludeBlankCells(true);
  186. String padded = extractor.getText();
  187. assertStartsWith(def,
  188. "Sheet1\n" +
  189. "&[TAB]\t\n" +
  190. "Hello\n" +
  191. "11\t23\n"
  192. );
  193. assertStartsWith(padded,
  194. "Sheet1\n" +
  195. "&[TAB]\t\n" +
  196. "Hello\n" +
  197. "11\t\t\t23\n"
  198. );
  199. extractor.close();
  200. }
  201. @Test
  202. public void testFormatting() throws Exception {
  203. Locale userLocale = LocaleUtil.getUserLocale();
  204. LocaleUtil.setUserLocale(Locale.ROOT);
  205. try {
  206. ExcelExtractor extractor = createExtractor("Formatting.xls");
  207. extractor.setIncludeBlankCells(false);
  208. extractor.setIncludeSheetNames(false);
  209. String text = extractor.getText();
  210. // Note - not all the formats in the file
  211. // actually quite match what they claim to
  212. // be, as some are auto-local builtins...
  213. assertStartsWith(text, "Dates, all 24th November 2006\n");
  214. assertContains(text, "yyyy/mm/dd\t2006/11/24\n");
  215. assertContains(text, "yyyy-mm-dd\t2006-11-24\n");
  216. assertContains(text, "dd-mm-yy\t24-11-06\n");
  217. assertContains(text, "nn.nn\t10.52\n");
  218. assertContains(text, "nn.nnn\t10.520\n");
  219. assertContains(text, "\u00a3nn.nn\t\u00a310.52\n");
  220. extractor.close();
  221. } finally {
  222. LocaleUtil.setUserLocale(userLocale);
  223. }
  224. }
  225. /**
  226. * Embeded in a non-excel file
  227. */
  228. @Test
  229. public void testWithEmbeded() throws Exception {
  230. POIFSFileSystem fs = null;
  231. HSSFWorkbook wbA = null, wbB = null;
  232. ExcelExtractor exA = null, exB = null;
  233. try {
  234. fs = new POIFSFileSystem(POIDataSamples.getDocumentInstance().getFile("word_with_embeded.doc"));
  235. DirectoryNode objPool = (DirectoryNode) fs.getRoot().getEntry("ObjectPool");
  236. DirectoryNode dirA = (DirectoryNode) objPool.getEntry("_1269427460");
  237. DirectoryNode dirB = (DirectoryNode) objPool.getEntry("_1269427461");
  238. wbA = new HSSFWorkbook(dirA, fs, true);
  239. exA = new ExcelExtractor(wbA);
  240. wbB = new HSSFWorkbook(dirB, fs, true);
  241. exB = new ExcelExtractor(wbB);
  242. assertEquals("Sheet1\nTest excel file\nThis is the first file\nSheet2\nSheet3\n", exA.getText());
  243. assertEquals("Sample Excel", exA.getSummaryInformation().getTitle());
  244. assertEquals("Sheet1\nAnother excel file\nThis is the second file\nSheet2\nSheet3\n", exB.getText());
  245. assertEquals("Sample Excel 2", exB.getSummaryInformation().getTitle());
  246. } finally {
  247. if (exB != null) exB.close();
  248. if (wbB != null) wbB.close();
  249. if (exA != null) exA.close();
  250. if (wbA != null) wbA.close();
  251. if (fs != null) fs.close();
  252. }
  253. }
  254. /**
  255. * Excel embeded in excel
  256. */
  257. @Test
  258. public void testWithEmbededInOwn() throws Exception {
  259. POIDataSamples ssSamples = POIDataSamples.getSpreadSheetInstance();
  260. POIFSFileSystem fs = null;
  261. HSSFWorkbook wbA = null, wbB = null;
  262. ExcelExtractor exA = null, exB = null, ex = null;
  263. try {
  264. fs = new POIFSFileSystem(ssSamples.getFile("excel_with_embeded.xls"));
  265. DirectoryNode dirA = (DirectoryNode) fs.getRoot().getEntry("MBD0000A3B5");
  266. DirectoryNode dirB = (DirectoryNode) fs.getRoot().getEntry("MBD0000A3B4");
  267. wbA = new HSSFWorkbook(dirA, fs, true);
  268. wbB = new HSSFWorkbook(dirB, fs, true);
  269. exA = new ExcelExtractor(wbA);
  270. exB = new ExcelExtractor(wbB);
  271. assertEquals("Sheet1\nTest excel file\nThis is the first file\nSheet2\nSheet3\n", exA.getText());
  272. assertEquals("Sample Excel", exA.getSummaryInformation().getTitle());
  273. assertEquals("Sheet1\nAnother excel file\nThis is the second file\nSheet2\nSheet3\n", exB.getText());
  274. assertEquals("Sample Excel 2", exB.getSummaryInformation().getTitle());
  275. // And the base file too
  276. ex = new ExcelExtractor(fs);
  277. assertEquals("Sheet1\nI have lots of embeded files in me\nSheet2\nSheet3\n", ex.getText());
  278. assertEquals("Excel With Embeded", ex.getSummaryInformation().getTitle());
  279. } finally {
  280. if (ex != null) ex.close();
  281. if (exB != null) exB.close();
  282. if (exA != null) exA.close();
  283. if (wbB != null) wbB.close();
  284. if (wbA != null) wbA.close();
  285. if (fs != null) fs.close();
  286. }
  287. }
  288. /**
  289. * Test that we get text from headers and footers
  290. */
  291. @Test
  292. public void test45538() throws IOException {
  293. String[] files = {
  294. "45538_classic_Footer.xls", "45538_form_Footer.xls",
  295. "45538_classic_Header.xls", "45538_form_Header.xls"
  296. };
  297. for (String file : files) {
  298. ExcelExtractor extractor = createExtractor(file);
  299. String text = extractor.getText();
  300. assertContains(file, text, "testdoc");
  301. assertContains(file, text, "test phrase");
  302. extractor.close();
  303. }
  304. }
  305. @Test
  306. public void testPassword() throws IOException {
  307. Biff8EncryptionKey.setCurrentUserPassword("password");
  308. ExcelExtractor extractor = createExtractor("password.xls");
  309. String text = extractor.getText();
  310. Biff8EncryptionKey.setCurrentUserPassword(null);
  311. assertContains(text, "ZIP");
  312. extractor.close();
  313. }
  314. @Test
  315. public void testNullPointerException() throws IOException {
  316. ExcelExtractor extractor = createExtractor("ar.org.apsme.www_Form%20Inscripcion%20Curso%20NO%20Socios.xls");
  317. assertNotNull(extractor);
  318. assertNotNull(extractor.getText());
  319. extractor.close();
  320. }
  321. @Test
  322. public void test61045() throws IOException {
  323. //bug 61045. File is govdocs1 626534
  324. ExcelExtractor extractor = createExtractor("61045_govdocs1_626534.xls");
  325. String txt = extractor.getText();
  326. assertContains(txt, "NONBUSINESS");
  327. }
  328. }