You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

TestXSSFEventBasedExcelExtractor.java 15KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392
  1. /* ====================================================================
  2. Licensed to the Apache Software Foundation (ASF) under one or more
  3. contributor license agreements. See the NOTICE file distributed with
  4. this work for additional information regarding copyright ownership.
  5. The ASF licenses this file to You under the Apache License, Version 2.0
  6. (the "License"); you may not use this file except in compliance with
  7. the License. You may obtain a copy of the License at
  8. http://www.apache.org/licenses/LICENSE-2.0
  9. Unless required by applicable law or agreed to in writing, software
  10. distributed under the License is distributed on an "AS IS" BASIS,
  11. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. See the License for the specific language governing permissions and
  13. limitations under the License.
  14. ==================================================================== */
  15. package org.apache.poi.xssf.extractor;
  16. import static org.apache.poi.POITestCase.assertContains;
  17. import static org.apache.poi.POITestCase.assertStartsWith;
  18. import static org.apache.poi.POITestCase.assertEndsWith;
  19. import static org.junit.jupiter.api.Assertions.assertEquals;
  20. import static org.junit.jupiter.api.Assertions.assertFalse;
  21. import static org.junit.jupiter.api.Assertions.assertNotNull;
  22. import static org.junit.jupiter.api.Assertions.assertTrue;
  23. import java.util.regex.Matcher;
  24. import java.util.regex.Pattern;
  25. import org.apache.poi.extractor.POITextExtractor;
  26. import org.apache.poi.ooxml.extractor.POIXMLTextExtractor;
  27. import org.apache.poi.hssf.HSSFTestDataSamples;
  28. import org.apache.poi.hssf.extractor.ExcelExtractor;
  29. import org.apache.poi.xssf.XSSFTestDataSamples;
  30. import org.junit.jupiter.api.Test;
  31. /**
  32. * Tests for {@link XSSFEventBasedExcelExtractor}
  33. */
  34. class TestXSSFEventBasedExcelExtractor {
  35. protected XSSFEventBasedExcelExtractor getExtractor(String sampleName) throws Exception {
  36. return new XSSFEventBasedExcelExtractor(XSSFTestDataSamples.
  37. openSamplePackage(sampleName));
  38. }
  39. /**
  40. * Get text out of the simple file
  41. */
  42. @Test
  43. void testGetSimpleText() throws Exception {
  44. // a very simple file
  45. XSSFEventBasedExcelExtractor extractor = getExtractor("sample.xlsx");
  46. extractor.getText();
  47. String text = extractor.getText();
  48. assertTrue(text.length() > 0);
  49. // Check sheet names
  50. assertStartsWith(text, "Sheet1");
  51. assertEndsWith(text, "Sheet3\n");
  52. // Now without, will have text
  53. extractor.setIncludeSheetNames(false);
  54. text = extractor.getText();
  55. String CHUNK1 =
  56. "Lorem\t111\n" +
  57. "ipsum\t222\n" +
  58. "dolor\t333\n" +
  59. "sit\t444\n" +
  60. "amet\t555\n" +
  61. "consectetuer\t666\n" +
  62. "adipiscing\t777\n" +
  63. "elit\t888\n" +
  64. "Nunc\t999\n";
  65. String CHUNK2 =
  66. "The quick brown fox jumps over the lazy dog\n" +
  67. "hello, xssf\thello, xssf\n" +
  68. "hello, xssf\thello, xssf\n" +
  69. "hello, xssf\thello, xssf\n" +
  70. "hello, xssf\thello, xssf\n";
  71. assertEquals(
  72. CHUNK1 +
  73. "at\t4995\n" +
  74. CHUNK2
  75. , text);
  76. // Now get formulas not their values
  77. extractor.setFormulasNotResults(true);
  78. text = extractor.getText();
  79. assertEquals(
  80. CHUNK1 +
  81. "at\tSUM(B1:B9)\n" +
  82. CHUNK2, text);
  83. // With sheet names too
  84. extractor.setIncludeSheetNames(true);
  85. text = extractor.getText();
  86. assertEquals(
  87. "Sheet1\n" +
  88. CHUNK1 +
  89. "at\tSUM(B1:B9)\n" +
  90. "rich test\n" +
  91. CHUNK2 +
  92. "Sheet3\n"
  93. , text);
  94. extractor.close();
  95. }
  96. @Test
  97. void testGetComplexText() throws Exception {
  98. // A fairly complex file
  99. XSSFEventBasedExcelExtractor extractor = getExtractor("AverageTaxRates.xlsx");
  100. extractor.getText();
  101. String text = extractor.getText();
  102. assertTrue(text.length() > 0);
  103. // Might not have all formatting it should do!
  104. assertStartsWith(text,
  105. "Avgtxfull\n" +
  106. "(iii) AVERAGE TAX RATES ON ANNUAL"
  107. );
  108. extractor.close();
  109. }
  110. @Test
  111. void testInlineStrings() throws Exception {
  112. XSSFEventBasedExcelExtractor extractor = getExtractor("InlineStrings.xlsx");
  113. extractor.setFormulasNotResults(true);
  114. String text = extractor.getText();
  115. // Numbers
  116. assertContains(text, "43");
  117. assertContains(text, "22");
  118. // Strings
  119. assertContains(text, "ABCDE");
  120. assertContains(text, "Long Text");
  121. // Inline Strings
  122. assertContains(text, "1st Inline String");
  123. assertContains(text, "And More");
  124. // Formulas
  125. assertContains(text, "A2");
  126. assertContains(text, "A5-A$2");
  127. extractor.close();
  128. }
  129. /**
  130. * Test that we return pretty much the same as
  131. * ExcelExtractor does, when we're both passed
  132. * the same file, just saved as xls and xlsx
  133. */
  134. @Test
  135. void testComparedToOLE2() throws Exception {
  136. // A fairly simple file - ooxml
  137. XSSFEventBasedExcelExtractor ooxmlExtractor = getExtractor("SampleSS.xlsx");
  138. ExcelExtractor ole2Extractor =
  139. new ExcelExtractor(HSSFTestDataSamples.openSampleWorkbook("SampleSS.xls"));
  140. POITextExtractor[] extractors =
  141. new POITextExtractor[] { ooxmlExtractor, ole2Extractor };
  142. for (POITextExtractor extractor : extractors) {
  143. String text = extractor.getText().replaceAll("[\r\t]", "");
  144. assertStartsWith(text, "First Sheet\nTest spreadsheet\n2nd row2nd row 2nd column\n");
  145. Pattern pattern = Pattern.compile(".*13(\\.0+)?\\s+Sheet3.*", Pattern.DOTALL);
  146. Matcher m = pattern.matcher(text);
  147. assertTrue(m.matches());
  148. }
  149. ole2Extractor.close();
  150. ooxmlExtractor.close();
  151. }
  152. /** Test text extraction from text box using getShapes() */
  153. @Test
  154. void testShapes() throws Exception{
  155. try (XSSFEventBasedExcelExtractor ooxmlExtractor = getExtractor("WithTextBox.xlsx")) {
  156. String text = ooxmlExtractor.getText();
  157. assertContains(text, "Line 1");
  158. assertContains(text, "Line 2");
  159. assertContains(text, "Line 3");
  160. }
  161. }
  162. /**
  163. * Test that we return the same output for unstyled numbers as the
  164. * non-event-based XSSFExcelExtractor.
  165. */
  166. @Test
  167. void testUnstyledNumbersComparedToNonEventBasedExtractor()
  168. throws Exception {
  169. String expectedOutput = "Sheet1\n99.99\n";
  170. try (XSSFExcelExtractor extractor = new XSSFExcelExtractor(
  171. XSSFTestDataSamples.openSampleWorkbook("56011.xlsx"))) {
  172. assertEquals(expectedOutput, extractor.getText().replace(",", "."));
  173. }
  174. try (XSSFEventBasedExcelExtractor fixture = new XSSFEventBasedExcelExtractor(
  175. XSSFTestDataSamples.openSamplePackage("56011.xlsx"))) {
  176. assertEquals(expectedOutput, fixture.getText().replace(",", "."));
  177. }
  178. }
  179. /**
  180. * Test that we return the same output headers and footers as the
  181. * non-event-based XSSFExcelExtractor.
  182. */
  183. @Test
  184. void testHeadersAndFootersComparedToNonEventBasedExtractor()
  185. throws Exception {
  186. String expectedOutputWithHeadersAndFooters =
  187. "Sheet1\n" +
  188. "&\"Calibri,Regular\"&K000000top left\t&\"Calibri,Regular\"&K000000top center\t&\"Calibri,Regular\"&K000000top right\n" +
  189. "abc\t123\n" +
  190. "&\"Calibri,Regular\"&K000000bottom left\t&\"Calibri,Regular\"&K000000bottom center\t&\"Calibri,Regular\"&K000000bottom right\n";
  191. String expectedOutputWithoutHeadersAndFooters =
  192. "Sheet1\n" +
  193. "abc\t123\n";
  194. try (XSSFExcelExtractor extractor = new XSSFExcelExtractor(
  195. XSSFTestDataSamples.openSampleWorkbook("headerFooterTest.xlsx"))) {
  196. assertEquals(expectedOutputWithHeadersAndFooters, extractor.getText());
  197. extractor.setIncludeHeadersFooters(false);
  198. assertEquals(expectedOutputWithoutHeadersAndFooters, extractor.getText());
  199. }
  200. try (XSSFEventBasedExcelExtractor fixture = new XSSFEventBasedExcelExtractor(
  201. XSSFTestDataSamples.openSamplePackage("headerFooterTest.xlsx"))) {
  202. assertEquals(expectedOutputWithHeadersAndFooters, fixture.getText());
  203. fixture.setIncludeHeadersFooters(false);
  204. assertEquals(expectedOutputWithoutHeadersAndFooters, fixture.getText());
  205. }
  206. }
  207. /**
  208. * Test that XSSFEventBasedExcelExtractor outputs comments when specified.
  209. * The output will contain two improvements over the output from
  210. * XSSFExcelExtractor in that (1) comments from empty cells will be
  211. * outputted, and (2) the author will not be outputted twice.
  212. * <p>
  213. * This test will need to be modified if these improvements are ported to
  214. * XSSFExcelExtractor.
  215. */
  216. @Test
  217. void testCommentsComparedToNonEventBasedExtractor()
  218. throws Exception {
  219. String expectedOutputWithoutComments =
  220. "Sheet1\n" +
  221. "\n" +
  222. "abc\n" +
  223. "\n" +
  224. "123\n" +
  225. "\n" +
  226. "\n" +
  227. "\n";
  228. String nonEventBasedExtractorOutputWithComments =
  229. "Sheet1\n" +
  230. "\n" +
  231. "abc Comment by Shaun Kalley: Shaun Kalley: Comment A2\n" +
  232. "\n" +
  233. "123 Comment by Shaun Kalley: Shaun Kalley: Comment B4\n" +
  234. "\n" +
  235. "\n" +
  236. "\n";
  237. String eventBasedExtractorOutputWithComments =
  238. "Sheet1\n" +
  239. "Comment by Shaun Kalley: Comment A1\tComment by Shaun Kalley: Comment B1\n" +
  240. "abc Comment by Shaun Kalley: Comment A2\tComment by Shaun Kalley: Comment B2\n" +
  241. "Comment by Shaun Kalley: Comment A3\tComment by Shaun Kalley: Comment B3\n" +
  242. "Comment by Shaun Kalley: Comment A4\t123 Comment by Shaun Kalley: Comment B4\n" +
  243. "Comment by Shaun Kalley: Comment A5\tComment by Shaun Kalley: Comment B5\n" +
  244. "Comment by Shaun Kalley: Comment A7\tComment by Shaun Kalley: Comment B7\n" +
  245. "Comment by Shaun Kalley: Comment A8\tComment by Shaun Kalley: Comment B8\n";
  246. try (XSSFExcelExtractor extractor = new XSSFExcelExtractor(
  247. XSSFTestDataSamples.openSampleWorkbook("commentTest.xlsx"))) {
  248. assertEquals(expectedOutputWithoutComments, extractor.getText());
  249. extractor.setIncludeCellComments(true);
  250. assertEquals(nonEventBasedExtractorOutputWithComments, extractor.getText());
  251. }
  252. try (XSSFEventBasedExcelExtractor fixture = new XSSFEventBasedExcelExtractor(
  253. XSSFTestDataSamples.openSamplePackage("commentTest.xlsx"))) {
  254. assertEquals(expectedOutputWithoutComments, fixture.getText());
  255. fixture.setIncludeCellComments(true);
  256. assertEquals(eventBasedExtractorOutputWithComments, fixture.getText());
  257. }
  258. }
  259. @Test
  260. void testFile56278_normal() throws Exception {
  261. // first with normal Text Extractor
  262. try (POIXMLTextExtractor extractor = new XSSFExcelExtractor(
  263. XSSFTestDataSamples.openSampleWorkbook("56278.xlsx"))) {
  264. assertNotNull(extractor.getText());
  265. }
  266. }
  267. @Test
  268. void testFile56278_event() throws Exception {
  269. // then with event based one
  270. try (POIXMLTextExtractor extractor = getExtractor("56278.xlsx")) {
  271. assertNotNull(extractor.getText());
  272. }
  273. }
  274. @Test
  275. void test59021() throws Exception {
  276. XSSFEventBasedExcelExtractor ex =
  277. new XSSFEventBasedExcelExtractor(
  278. XSSFTestDataSamples.openSamplePackage("59021.xlsx"));
  279. String text = ex.getText();
  280. assertContains(text, "Abkhazia - Fixed");
  281. assertContains(text, "10/02/2016");
  282. ex.close();
  283. }
  284. @Test
  285. void test51519() throws Exception {
  286. //default behavior: include phonetic runs
  287. XSSFEventBasedExcelExtractor ex =
  288. new XSSFEventBasedExcelExtractor(
  289. XSSFTestDataSamples.openSamplePackage("51519.xlsx"));
  290. String text = ex.getText();
  291. assertContains(text, "\u65E5\u672C\u30AA\u30E9\u30AF\u30EB \u30CB\u30DB\u30F3");
  292. ex.close();
  293. //now try turning them off
  294. ex =
  295. new XSSFEventBasedExcelExtractor(
  296. XSSFTestDataSamples.openSamplePackage("51519.xlsx"));
  297. ex.setConcatenatePhoneticRuns(false);
  298. text = ex.getText();
  299. assertFalse(text.contains("\u65E5\u672C\u30AA\u30E9\u30AF\u30EB \u30CB\u30DB\u30F3"),
  300. "should not be able to find appended phonetic run");
  301. ex.close();
  302. }
  303. @Test
  304. void test66365() throws Exception {
  305. try (XSSFEventBasedExcelExtractor ex =
  306. new XSSFEventBasedExcelExtractor(
  307. XSSFTestDataSamples.openSamplePackage("66365.xlsx"))) {
  308. String text = ex.getText();
  309. assertContains(text, "Alice\tAlice");
  310. assertContains(text, "Bob\tBob");
  311. }
  312. }
  313. @Test
  314. void test67784() throws Exception {
  315. try (XSSFEventBasedExcelExtractor ex =
  316. new XSSFEventBasedExcelExtractor(
  317. XSSFTestDataSamples.openSamplePackage("bug67784.xlsx"))) {
  318. String text = ex.getText().replace("\r", "");
  319. String[] lines = text.split("\n");
  320. assertEquals("FALSE", lines[2]);
  321. assertEquals("TRUE", lines[3]);
  322. assertEquals("ERROR:#DIV/0!", lines[4]);
  323. }
  324. }
  325. @Test
  326. void test67784Formulas() throws Exception {
  327. try (XSSFEventBasedExcelExtractor ex =
  328. new XSSFEventBasedExcelExtractor(
  329. XSSFTestDataSamples.openSamplePackage("bug67784.xlsx"))) {
  330. ex.setFormulasNotResults(true);
  331. String text = ex.getText().replace("\r", "");
  332. String[] lines = text.split("\n");
  333. assertEquals("(2 > 5)", lines[2]);
  334. assertEquals("(2 < 4)", lines[3]);
  335. assertEquals("10/0", lines[4]);
  336. }
  337. }
  338. }