You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

TestOldExcelExtractor.java 14KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391
  1. /* ====================================================================
  2. Licensed to the Apache Software Foundation (ASF) under one or more
  3. contributor license agreements. See the NOTICE file distributed with
  4. this work for additional information regarding copyright ownership.
  5. The ASF licenses this file to You under the Apache License, Version 2.0
  6. (the "License"); you may not use this file except in compliance with
  7. the License. You may obtain a copy of the License at
  8. http://www.apache.org/licenses/LICENSE-2.0
  9. Unless required by applicable law or agreed to in writing, software
  10. distributed under the License is distributed on an "AS IS" BASIS,
  11. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. See the License for the specific language governing permissions and
  13. limitations under the License.
  14. ==================================================================== */
  15. package org.apache.poi.hssf.extractor;
  16. import static org.apache.poi.POITestCase.assertContains;
  17. import static org.junit.Assert.assertEquals;
  18. import static org.junit.Assert.assertNotNull;
  19. import static org.junit.Assert.assertTrue;
  20. import static org.junit.Assert.fail;
  21. import java.io.ByteArrayOutputStream;
  22. import java.io.File;
  23. import java.io.FileInputStream;
  24. import java.io.FileNotFoundException;
  25. import java.io.IOException;
  26. import java.io.InputStream;
  27. import java.io.PrintStream;
  28. import java.security.Permission;
  29. import org.apache.poi.EmptyFileException;
  30. import org.apache.poi.EncryptedDocumentException;
  31. import org.apache.poi.POIDataSamples;
  32. import org.apache.poi.hssf.HSSFTestDataSamples;
  33. import org.apache.poi.poifs.filesystem.OfficeXmlFileException;
  34. import org.apache.poi.poifs.filesystem.POIFSFileSystem;
  35. import org.apache.poi.util.RecordFormatException;
  36. import org.junit.Test;
  37. /**
  38. * Unit tests for the Excel 5/95 and Excel 4 (and older) text
  39. * extractor
  40. */
  41. public final class TestOldExcelExtractor {
  42. private static OldExcelExtractor createExtractor(String sampleFileName) throws IOException {
  43. File file = HSSFTestDataSamples.getSampleFile(sampleFileName);
  44. return new OldExcelExtractor(file);
  45. }
  46. @Test
  47. public void testSimpleExcel3() throws IOException {
  48. try (OldExcelExtractor extractor = createExtractor("testEXCEL_3.xls")) {
  49. // Check we can call getText without error
  50. String text = extractor.getText();
  51. // Check we find a few words we expect in there
  52. assertContains(text, "Season beginning August");
  53. assertContains(text, "USDA");
  54. // Check we find a few numbers we expect in there
  55. assertContains(text, "347");
  56. assertContains(text, "228");
  57. // Check we find a few string-literal dates in there
  58. assertContains(text, "1981/82");
  59. // Check the type
  60. assertEquals(3, extractor.getBiffVersion());
  61. assertEquals(0x10, extractor.getFileType());
  62. }
  63. }
  64. @Test
  65. public void testSimpleExcel3NoReading() throws IOException {
  66. try (OldExcelExtractor extractor = createExtractor("testEXCEL_3.xls")) {
  67. assertNotNull(extractor);
  68. }
  69. }
  70. @Test
  71. public void testSimpleExcel4() throws IOException {
  72. try (OldExcelExtractor extractor = createExtractor("testEXCEL_4.xls")) {
  73. // Check we can call getText without error
  74. String text = extractor.getText();
  75. // Check we find a few words we expect in there
  76. assertContains(text, "Size");
  77. assertContains(text, "Returns");
  78. // Check we find a few numbers we expect in there
  79. assertContains(text, "11");
  80. assertContains(text, "784");
  81. // Check the type
  82. assertEquals(4, extractor.getBiffVersion());
  83. assertEquals(0x10, extractor.getFileType());
  84. }
  85. }
  86. @Test
  87. public void testSimpleExcel5() throws IOException {
  88. for (String ver : new String[] {"5", "95"}) {
  89. try (OldExcelExtractor extractor = createExtractor("testEXCEL_"+ver+".xls")) {
  90. // Check we can call getText without error
  91. String text = extractor.getText();
  92. // Check we find a few words we expect in there
  93. assertContains(text, "Sample Excel");
  94. assertContains(text, "Written and saved");
  95. // Check we find a few numbers we expect in there
  96. assertContains(text, "15");
  97. assertContains(text, "169");
  98. // Check we got the sheet names (new formats only)
  99. assertContains(text, "Sheet: Feuil3");
  100. // Check the type
  101. assertEquals(5, extractor.getBiffVersion());
  102. assertEquals(0x05, extractor.getFileType());
  103. }
  104. }
  105. }
  106. @Test
  107. public void testStrings() throws IOException {
  108. try (OldExcelExtractor extractor = createExtractor("testEXCEL_4.xls")) {
  109. String text = extractor.getText();
  110. // Simple strings
  111. assertContains(text, "Table 10 -- Examination Coverage:");
  112. assertContains(text, "Recommended and Average Recommended Additional Tax After");
  113. assertContains(text, "Individual income tax returns, total");
  114. // More complicated strings
  115. assertContains(text, "$100,000 or more");
  116. assertContains(text, "S corporation returns, Form 1120S [10,15]");
  117. assertContains(text, "individual income tax return \u201Cshort forms.\u201D");
  118. // Formula based strings
  119. // TODO Find some then test
  120. }
  121. }
  122. @Test
  123. public void testFormattedNumbersExcel4() throws IOException {
  124. try (OldExcelExtractor extractor = createExtractor("testEXCEL_4.xls")) {
  125. String text = extractor.getText();
  126. // Simple numbers
  127. assertContains(text, "151");
  128. assertContains(text, "784");
  129. // Numbers which come from formulas
  130. assertContains(text, "0.398"); // TODO Rounding
  131. assertContains(text, "624");
  132. // Formatted numbers
  133. // TODO
  134. // assertContains(text, "55,624");
  135. // assertContains(text, "11,743,477");
  136. }
  137. }
  138. @Test
  139. public void testFormattedNumbersExcel5() throws IOException {
  140. for (String ver : new String[] {"5", "95"}) {
  141. try (OldExcelExtractor extractor = createExtractor("testEXCEL_"+ver+".xls")) {
  142. String text = extractor.getText();
  143. // Simple numbers
  144. assertContains(text, "1");
  145. // Numbers which come from formulas
  146. assertContains(text, "13");
  147. assertContains(text, "169");
  148. // Formatted numbers
  149. // TODO
  150. // assertContains(text, "100.00%");
  151. // assertContains(text, "155.00%");
  152. // assertContains(text, "1,125");
  153. // assertContains(text, "189,945");
  154. // assertContains(text, "1,234,500");
  155. // assertContains(text, "$169.00");
  156. // assertContains(text, "$1,253.82");
  157. }
  158. }
  159. }
  160. @Test
  161. public void testFromFile() throws IOException {
  162. for (String ver : new String[] {"4", "5", "95"}) {
  163. String filename = "testEXCEL_"+ver+".xls";
  164. File f = HSSFTestDataSamples.getSampleFile(filename);
  165. try (OldExcelExtractor extractor = new OldExcelExtractor(f)) {
  166. String text = extractor.getText();
  167. assertNotNull(text);
  168. assertTrue(text.length() > 100);
  169. }
  170. }
  171. }
  172. @Test
  173. public void testFromInputStream() throws IOException {
  174. for (String ver : new String[] {"4", "5", "95"}) {
  175. String filename = "testEXCEL_"+ver+".xls";
  176. File f = HSSFTestDataSamples.getSampleFile(filename);
  177. try (InputStream stream = new FileInputStream(f);
  178. OldExcelExtractor extractor = new OldExcelExtractor(stream)) {
  179. String text = extractor.getText();
  180. assertNotNull(text);
  181. assertTrue(text.length() > 100);
  182. }
  183. }
  184. }
  185. @Test(expected=OfficeXmlFileException.class)
  186. public void testOpenInvalidFile1() throws IOException {
  187. // a file that exists, but is a different format
  188. createExtractor("WithVariousData.xlsx").close();
  189. }
  190. @Test(expected=RecordFormatException.class)
  191. public void testOpenInvalidFile2() throws IOException {
  192. // a completely different type of file
  193. createExtractor("48936-strings.txt").close();
  194. }
  195. @Test(expected=FileNotFoundException.class)
  196. public void testOpenInvalidFile3() throws IOException {
  197. // a POIFS file which is not a Workbook
  198. try (InputStream is = POIDataSamples.getDocumentInstance().openResourceAsStream("47304.doc")) {
  199. new OldExcelExtractor(is).close();
  200. }
  201. }
  202. @Test(expected=EmptyFileException.class)
  203. public void testOpenNonExistingFile() throws IOException {
  204. // a file that exists, but is a different format
  205. new OldExcelExtractor(new File("notexistingfile.xls")).close();
  206. }
  207. @Test
  208. public void testInputStream() throws IOException {
  209. File file = HSSFTestDataSamples.getSampleFile("testEXCEL_3.xls");
  210. try (InputStream stream = new FileInputStream(file);
  211. OldExcelExtractor extractor = new OldExcelExtractor(stream);) {
  212. String text = extractor.getText();
  213. assertNotNull(text);
  214. }
  215. }
  216. @Test
  217. public void testInputStreamNPOIHeader() throws IOException {
  218. //TODO: the worksheet names are currently mangled. They're treated
  219. //as if UTF-16, but they're just ascii. Need to fix this.
  220. //Is it possible that the leading 0 byte in the worksheet name is a signal
  221. //that these worksheet names should be interpreted as ascii/1252?
  222. File file = HSSFTestDataSamples.getSampleFile("FormulaRefs.xls");
  223. try (InputStream stream = new FileInputStream(file);
  224. OldExcelExtractor extractor = new OldExcelExtractor(stream)) {
  225. String text = extractor.getText();
  226. assertNotNull(text);
  227. }
  228. }
  229. @Test
  230. public void testPOIFSFileSystem() throws IOException {
  231. File file = HSSFTestDataSamples.getSampleFile("FormulaRefs.xls");
  232. try (POIFSFileSystem fs = new POIFSFileSystem(file);
  233. OldExcelExtractor extractor = new OldExcelExtractor(fs)){
  234. String text = extractor.getText();
  235. assertNotNull(text);
  236. }
  237. }
  238. @Test
  239. public void testDirectoryNode() throws IOException {
  240. File file = HSSFTestDataSamples.getSampleFile("FormulaRefs.xls");
  241. try (POIFSFileSystem fs = new POIFSFileSystem(file);
  242. OldExcelExtractor extractor = new OldExcelExtractor(fs.getRoot())) {
  243. String text = extractor.getText();
  244. assertNotNull(text);
  245. }
  246. }
  247. @Test(expected = FileNotFoundException.class)
  248. public void testDirectoryNodeInvalidFile() throws IOException {
  249. File file = POIDataSamples.getDocumentInstance().getFile("test.doc");
  250. try (POIFSFileSystem fs = new POIFSFileSystem(file);
  251. OldExcelExtractor extractor = new OldExcelExtractor(fs.getRoot())) {
  252. fail("Should throw exception here");
  253. }
  254. }
  255. @Test(expected = ExitException.class)
  256. public void testMainUsage() throws IOException {
  257. PrintStream save = System.err;
  258. SecurityManager sm = System.getSecurityManager();
  259. System.setSecurityManager(new NoExitSecurityManager());
  260. try {
  261. try (ByteArrayOutputStream out = new ByteArrayOutputStream()) {
  262. PrintStream str = new PrintStream(out, false, "UTF-8");
  263. System.setErr(str);
  264. // calls System.exit()
  265. OldExcelExtractor.main(new String[]{});
  266. }
  267. } finally {
  268. System.setSecurityManager(sm);
  269. System.setErr(save);
  270. }
  271. }
  272. @Test
  273. public void testMain() throws IOException {
  274. File file = HSSFTestDataSamples.getSampleFile("testEXCEL_3.xls");
  275. PrintStream save = System.out;
  276. try {
  277. ByteArrayOutputStream out = new ByteArrayOutputStream();
  278. PrintStream str = new PrintStream(out, false, "UTF-8");
  279. System.setOut(str);
  280. OldExcelExtractor.main(new String[] {file.getAbsolutePath()});
  281. String string = out.toString("UTF-8");
  282. assertTrue("Had: " + string, string.contains("Table C-13--Lemons"));
  283. } finally {
  284. System.setOut(save);
  285. }
  286. }
  287. @Test(expected = EncryptedDocumentException.class)
  288. public void testEncryptionException() throws IOException {
  289. //test file derives from Common Crawl
  290. File file = HSSFTestDataSamples.getSampleFile("60284.xls");
  291. try (OldExcelExtractor ex = new OldExcelExtractor(file)) {
  292. assertEquals(5, ex.getBiffVersion());
  293. assertEquals(5, ex.getFileType());
  294. ex.getText();
  295. }
  296. }
  297. @Test
  298. public void testSheetWithNoName() throws IOException {
  299. File file = HSSFTestDataSamples.getSampleFile("64130.xls");
  300. try (OldExcelExtractor ex = new OldExcelExtractor(file)) {
  301. assertEquals(5, ex.getBiffVersion());
  302. assertEquals(5, ex.getFileType());
  303. assertContains(ex.getText(), "Dawn");
  304. }
  305. }
  306. private static class NoExitSecurityManager extends SecurityManager {
  307. @Override
  308. public void checkPermission(Permission perm) {
  309. // allow anything.
  310. }
  311. @Override
  312. public void checkPermission(Permission perm, Object context) {
  313. // allow anything.
  314. }
  315. @Override
  316. public void checkExit(int status) {
  317. super.checkExit(status);
  318. throw new ExitException(status);
  319. }
  320. }
  321. private static class ExitException extends SecurityException {
  322. public final int status;
  323. public ExitException(int status) {
  324. super("There is no escape!");
  325. this.status = status;
  326. }
  327. }
  328. }