You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

XLSX2CSV.java 10KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269
  1. /* ====================================================================
  2. Licensed to the Apache Software Foundation (ASF) under one or more
  3. contributor license agreements. See the NOTICE file distributed with
  4. this work for additional information regarding copyright ownership.
  5. The ASF licenses this file to You under the Apache License, Version 2.0
  6. (the "License"); you may not use this file except in compliance with
  7. the License. You may obtain a copy of the License at
  8. http://www.apache.org/licenses/LICENSE-2.0
  9. Unless required by applicable law or agreed to in writing, software
  10. distributed under the License is distributed on an "AS IS" BASIS,
  11. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. See the License for the specific language governing permissions and
  13. limitations under the License.
  14. ==================================================================== */
  15. package org.apache.poi.examples.xssf.eventusermodel;
  16. import java.io.File;
  17. import java.io.IOException;
  18. import java.io.InputStream;
  19. import java.io.PrintStream;
  20. import javax.xml.parsers.ParserConfigurationException;
  21. import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
  22. import org.apache.poi.openxml4j.opc.OPCPackage;
  23. import org.apache.poi.openxml4j.opc.PackageAccess;
  24. import org.apache.poi.ss.usermodel.DataFormatter;
  25. import org.apache.poi.ss.util.CellAddress;
  26. import org.apache.poi.ss.util.CellReference;
  27. import org.apache.poi.util.XMLHelper;
  28. import org.apache.poi.xssf.eventusermodel.ReadOnlySharedStringsTable;
  29. import org.apache.poi.xssf.eventusermodel.XSSFReader;
  30. import org.apache.poi.xssf.eventusermodel.XSSFSheetXMLHandler;
  31. import org.apache.poi.xssf.eventusermodel.XSSFSheetXMLHandler.SheetContentsHandler;
  32. import org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor;
  33. import org.apache.poi.xssf.model.SharedStrings;
  34. import org.apache.poi.xssf.model.Styles;
  35. import org.apache.poi.xssf.model.StylesTable;
  36. import org.apache.poi.xssf.usermodel.XSSFComment;
  37. import org.xml.sax.ContentHandler;
  38. import org.xml.sax.InputSource;
  39. import org.xml.sax.SAXException;
  40. import org.xml.sax.XMLReader;
  41. /**
  42. * A rudimentary XLSX -> CSV processor modeled on the
  43. * POI sample program XLS2CSVmra from the package
  44. * org.apache.poi.hssf.eventusermodel.examples.
  45. * As with the HSSF version, this tries to spot missing
  46. * rows and cells, and output empty entries for them.
  47. * <p>
  48. * Data sheets are read using a SAX parser to keep the
  49. * memory footprint relatively small, so this should be
  50. * able to read enormous workbooks. The styles table and
  51. * the shared-string table must be kept in memory. The
  52. * standard POI styles table class is used, but a custom
  53. * (read-only) class is used for the shared string table
  54. * because the standard POI SharedStringsTable grows very
  55. * quickly with the number of unique strings.
  56. * <p>
  57. * For a more advanced implementation of SAX event parsing
  58. * of XLSX files, see {@link XSSFEventBasedExcelExtractor}
  59. * and {@link XSSFSheetXMLHandler}. Note that for many cases,
  60. * it may be possible to simply use those with a custom
  61. * {@link SheetContentsHandler} and no SAX code needed of
  62. * your own!
  63. */
  64. @SuppressWarnings({"java:S106","java:S4823","java:S1192"})
  65. public class XLSX2CSV {
  66. /**
  67. * Uses the XSSF Event SAX helpers to do most of the work
  68. * of parsing the Sheet XML, and outputs the contents
  69. * as a (basic) CSV.
  70. */
  71. private class SheetToCSV implements SheetContentsHandler {
  72. private boolean firstCellOfRow;
  73. private int currentRow = -1;
  74. private int currentCol = -1;
  75. private void outputMissingRows(int number) {
  76. for (int i=0; i<number; i++) {
  77. for (int j=0; j<minColumns; j++) {
  78. output.append(',');
  79. }
  80. output.append('\n');
  81. }
  82. }
  83. @Override
  84. public void startRow(int rowNum) {
  85. // If there were gaps, output the missing rows
  86. outputMissingRows(rowNum-currentRow-1);
  87. // Prepare for this row
  88. firstCellOfRow = true;
  89. currentRow = rowNum;
  90. currentCol = -1;
  91. }
  92. @Override
  93. public void endRow(int rowNum) {
  94. // Ensure the minimum number of columns
  95. for (int i=currentCol; i<minColumns; i++) {
  96. output.append(',');
  97. }
  98. output.append('\n');
  99. }
  100. @Override
  101. public void cell(String cellReference, String formattedValue,
  102. XSSFComment comment) {
  103. if (firstCellOfRow) {
  104. firstCellOfRow = false;
  105. } else {
  106. output.append(',');
  107. }
  108. // gracefully handle missing CellRef here in a similar way as XSSFCell does
  109. if(cellReference == null) {
  110. cellReference = new CellAddress(currentRow, currentCol).formatAsString();
  111. }
  112. // Did we miss any cells?
  113. int thisCol = (new CellReference(cellReference)).getCol();
  114. int missedCols = thisCol - currentCol - 1;
  115. for (int i=0; i<missedCols; i++) {
  116. output.append(',');
  117. }
  118. // no need to append anything if we do not have a value
  119. if (formattedValue == null) {
  120. return;
  121. }
  122. currentCol = thisCol;
  123. // Number or string?
  124. try {
  125. //noinspection ResultOfMethodCallIgnored
  126. Double.parseDouble(formattedValue);
  127. output.append(formattedValue);
  128. } catch (Exception e) {
  129. // let's remove quotes if they are already there
  130. if (formattedValue.startsWith("\"") && formattedValue.endsWith("\"")) {
  131. formattedValue = formattedValue.substring(1, formattedValue.length()-1);
  132. }
  133. output.append('"');
  134. // encode double-quote with two double-quotes to produce a valid CSV format
  135. output.append(formattedValue.replace("\"", "\"\""));
  136. output.append('"');
  137. }
  138. }
  139. }
  140. ///////////////////////////////////////
  141. private final OPCPackage xlsxPackage;
  142. /**
  143. * Number of columns to read starting with leftmost
  144. */
  145. private final int minColumns;
  146. /**
  147. * Destination for data
  148. */
  149. private final PrintStream output;
  150. /**
  151. * Creates a new XLSX -&gt; CSV converter
  152. *
  153. * @param pkg The XLSX package to process
  154. * @param output The PrintStream to output the CSV to
  155. * @param minColumns The minimum number of columns to output, or -1 for no minimum
  156. */
  157. public XLSX2CSV(OPCPackage pkg, PrintStream output, int minColumns) {
  158. this.xlsxPackage = pkg;
  159. this.output = output;
  160. this.minColumns = minColumns;
  161. }
  162. /**
  163. * Parses and shows the content of one sheet
  164. * using the specified styles and shared-strings tables.
  165. *
  166. * @param styles The table of styles that may be referenced by cells in the sheet
  167. * @param strings The table of strings that may be referenced by cells in the sheet
  168. * @param sheetInputStream The stream to read the sheet-data from.
  169. * @throws java.io.IOException An IO exception from the parser,
  170. * possibly from a byte stream or character stream
  171. * supplied by the application.
  172. * @throws SAXException if parsing the XML data fails.
  173. */
  174. public void processSheet(
  175. Styles styles,
  176. SharedStrings strings,
  177. SheetContentsHandler sheetHandler,
  178. InputStream sheetInputStream) throws IOException, SAXException {
  179. // set emulateCSV=true on DataFormatter - it is also possible to provide a Locale
  180. // when POI 5.2.0 is released, you can call formatter.setUse4DigitYearsInAllDateFormats(true)
  181. // to ensure all dates are formatted with 4 digit years
  182. DataFormatter formatter = new DataFormatter(true);
  183. InputSource sheetSource = new InputSource(sheetInputStream);
  184. try {
  185. XMLReader sheetParser = XMLHelper.newXMLReader();
  186. ContentHandler handler = new XSSFSheetXMLHandler(
  187. styles, null, strings, sheetHandler, formatter, false);
  188. sheetParser.setContentHandler(handler);
  189. sheetParser.parse(sheetSource);
  190. } catch(ParserConfigurationException e) {
  191. throw new RuntimeException("SAX parser appears to be broken - " + e.getMessage());
  192. }
  193. }
  194. /**
  195. * Initiates the processing of the XLS workbook file to CSV.
  196. *
  197. * @throws IOException If reading the data from the package fails.
  198. * @throws SAXException if parsing the XML data fails.
  199. */
  200. public void process() throws IOException, OpenXML4JException, SAXException {
  201. ReadOnlySharedStringsTable strings = new ReadOnlySharedStringsTable(this.xlsxPackage);
  202. XSSFReader xssfReader = new XSSFReader(this.xlsxPackage);
  203. StylesTable styles = xssfReader.getStylesTable();
  204. XSSFReader.SheetIterator iter = (XSSFReader.SheetIterator) xssfReader.getSheetsData();
  205. int index = 0;
  206. while (iter.hasNext()) {
  207. try (InputStream stream = iter.next()) {
  208. String sheetName = iter.getSheetName();
  209. this.output.println();
  210. this.output.println(sheetName + " [index=" + index + "]:");
  211. processSheet(styles, strings, new SheetToCSV(), stream);
  212. }
  213. ++index;
  214. }
  215. }
  216. public static void main(String[] args) throws Exception {
  217. if (args.length < 1) {
  218. System.err.println("Use:");
  219. System.err.println(" XLSX2CSV <xlsx file> [min columns]");
  220. return;
  221. }
  222. File xlsxFile = new File(args[0]);
  223. if (!xlsxFile.exists()) {
  224. System.err.println("Not found or not a file: " + xlsxFile.getPath());
  225. return;
  226. }
  227. int minColumns = -1;
  228. if (args.length >= 2)
  229. minColumns = Integer.parseInt(args[1]);
  230. // The package open is instantaneous, as it should be.
  231. OPCPackage p = OPCPackage.open(xlsxFile.getPath(), PackageAccess.READ);
  232. try {
  233. XLSX2CSV xlsx2csv = new XLSX2CSV(p, System.out, minColumns);
  234. xlsx2csv.process();
  235. } finally {
  236. p.revert();
  237. }
  238. }
  239. }