You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

ToCSV.java 36KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739
  1. /* ====================================================================
  2. Licensed to the Apache Software Foundation (ASF) under one or more
  3. contributor license agreements. See the NOTICE file distributed with
  4. this work for additional information regarding copyright ownership.
  5. The ASF licenses this file to You under the Apache License, Version 2.0
  6. (the "License"); you may not use this file except in compliance with
  7. the License. You may obtain a copy of the License at
  8. http://www.apache.org/licenses/LICENSE-2.0
  9. Unless required by applicable law or agreed to in writing, software
  10. distributed under the License is distributed on an "AS IS" BASIS,
  11. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. See the License for the specific language governing permissions and
  13. limitations under the License.
  14. ==================================================================== */
  15. package org.apache.poi.examples.ss;
  16. import java.io.BufferedWriter;
  17. import java.io.File;
  18. import java.io.FileInputStream;
  19. import java.io.FileNotFoundException;
  20. import java.io.FilenameFilter;
  21. import java.io.IOException;
  22. import java.nio.charset.StandardCharsets;
  23. import java.nio.file.Files;
  24. import java.util.ArrayList;
  25. import org.apache.logging.log4j.LogManager;
  26. import org.apache.logging.log4j.Logger;
  27. import org.apache.poi.ss.usermodel.Cell;
  28. import org.apache.poi.ss.usermodel.CellType;
  29. import org.apache.poi.ss.usermodel.DataFormatter;
  30. import org.apache.poi.ss.usermodel.FormulaEvaluator;
  31. import org.apache.poi.ss.usermodel.Row;
  32. import org.apache.poi.ss.usermodel.Sheet;
  33. import org.apache.poi.ss.usermodel.Workbook;
  34. import org.apache.poi.ss.usermodel.WorkbookFactory;
  35. /**
  36. * Demonstrates <em>one</em> way to convert an Excel spreadsheet into a CSV
  37. * file. This class makes the following assumptions;
  38. * <ul>
  39. * <li>1. Where the Excel workbook contains more than one worksheet, then a single
  40. * CSV file will contain the data from all of the worksheets.</li>
  41. * <li>2. The data matrix contained in the CSV file will be square. This means that
  42. * the number of fields in each record of the CSV file will match the number
  43. * of cells in the longest row found in the Excel workbook. Any short records
  44. * will be 'padded' with empty fields - an empty field is represented in
  45. * the CSV file in this way - ,,.</li>
  46. * <li>3. Empty fields will represent missing cells.</li>
  47. * <li>4. A record consisting of empty fields will be used to represent an empty row
  48. * in the Excel workbook.</li>
  49. * </ul>
  50. * Therefore, if the worksheet looked like this;
  51. *
  52. * <pre>
  53. * ___________________________________________
  54. * | | | | | |
  55. * | A | B | C | D | E |
  56. * ___|_______|_______|_______|_______|_______|
  57. * | | | | | |
  58. * 1 | 1 | 2 | 3 | 4 | 5 |
  59. * ___|_______|_______|_______|_______|_______|
  60. * | | | | | |
  61. * 2 | | | | | |
  62. * ___|_______|_______|_______|_______|_______|
  63. * | | | | | |
  64. * 3 | | A | | B | |
  65. * ___|_______|_______|_______|_______|_______|
  66. * | | | | | |
  67. * 4 | | | | | Z |
  68. * ___|_______|_______|_______|_______|_______|
  69. * | | | | | |
  70. * 5 | 1,400 | | 250 | | |
  71. * ___|_______|_______|_______|_______|_______|
  72. *
  73. * </pre>
  74. *
  75. * Then, the resulting CSV file will contain the following lines (records);
  76. * <pre>
  77. * 1,2,3,4,5
  78. * ,,,,
  79. * ,A,,B,
  80. * ,,,,Z
  81. * "1,400",,250,,
  82. * </pre><p>
  83. * Typically, the comma is used to separate each of the fields that, together,
  84. * constitute a single record or line within the CSV file. This is not however
  85. * a hard and fast rule and so this class allows the user to determine which
  86. * character is used as the field separator and assumes the comma if none other
  87. * is specified.
  88. * </p><p>
  89. * If a field contains the separator then it will be escaped. If the file should
  90. * obey Excel's CSV formatting rules, then the field will be surrounded with
  91. * speech marks whilst if it should obey UNIX conventions, each occurrence of
  92. * the separator will be preceded by the backslash character.
  93. * </p><p>
  94. * If a field contains an end of line (EOL) character then it too will be
  95. * escaped. If the file should obey Excel's CSV formatting rules then the field
  96. * will again be surrounded by speech marks. On the other hand, if the file
  97. * should follow UNIX conventions then a single backslash will precede the
  98. * EOL character. There is no single applicable standard for UNIX and some
  99. * applications replace the CR with \r and the LF with \n but this class will
  100. * not do so.
  101. * </p><p>
  102. * If the field contains double quotes then that character will be escaped. It
  103. * seems as though UNIX does not define a standard for this whilst Excel does.
  104. * Should the CSV file have to obey Excel's formatting rules then the speech
  105. * mark character will be escaped with a second set of speech marks. Finally, an
  106. * enclosing set of speech marks will also surround the entire field. Thus, if
  107. * the following line of text appeared in a cell - "Hello" he said - it would
  108. * look like this when converted into a field within a CSV file - """Hello"" he
  109. * said".
  110. * </p><p>
  111. * Finally, it is worth noting that talk of CSV 'standards' is really slightly
  112. * misleading as there is no such thing. It may well be that the code in this
  113. * class has to be modified to produce files to suit a specific application
  114. * or requirement.
  115. * </p>
  116. *
  117. * @version 1.00 9th April 2010
  118. * 1.10 13th April 2010 - Added support for processing all Excel
  119. * workbooks in a folder along with the ability
  120. * to specify a field separator character.
  121. * 2.00 14th April 2010 - Added support for embedded characters; the
  122. * field separator, EOL and double quotes or
  123. * speech marks. In addition, gave the client
  124. * the ability to select how these are handled,
  125. * either obeying Excel's or UNIX formatting
  126. * conventions.
  127. */
  128. @SuppressWarnings({"java:S106","java:S4823","java:S1192"})
  129. public class ToCSV {
  130. private static final Logger LOG = LogManager.getLogger(ToCSV.class);
  131. private Workbook workbook;
  132. private ArrayList<ArrayList<String>> csvData;
  133. private int maxRowWidth;
  134. private int formattingConvention;
  135. private DataFormatter formatter;
  136. private FormulaEvaluator evaluator;
  137. private String separator;
  138. private static final String CSV_FILE_EXTENSION = ".csv";
  139. private static final String DEFAULT_SEPARATOR = ",";
  140. /**
  141. * Identifies that the CSV file should obey Excel's formatting conventions
  142. * with regard to escaping certain embedded characters - the field separator,
  143. * speech mark and end of line (EOL) character
  144. */
  145. public static final int EXCEL_STYLE_ESCAPING = 0;
  146. /**
  147. * Identifies that the CSV file should obey UNIX formatting conventions
  148. * with regard to escaping certain embedded characters - the field separator
  149. * and end of line (EOL) character
  150. */
  151. public static final int UNIX_STYLE_ESCAPING = 1;
  152. /**
  153. * Process the contents of a folder, convert the contents of each Excel
  154. * workbook into CSV format and save the resulting file to the specified
  155. * folder using the same name as the original workbook with the .xls or
  156. * .xlsx extension replaced by .csv. This method will ensure that the
  157. * CSV file created contains the comma field separator and that embedded
  158. * characters such as the field separator, the EOL and double quotes are
  159. * escaped in accordance with Excel's convention.
  160. *
  161. * @param strSource An instance of the String class that encapsulates the
  162. * name of and path to either a folder containing those Excel
  163. * workbook(s) or the name of and path to an individual Excel workbook
  164. * that is/are to be converted.
  165. * @param strDestination An instance of the String class encapsulating the
  166. * name of and path to a folder that will contain the resulting CSV
  167. * files.
  168. * @throws java.io.FileNotFoundException Thrown if any file cannot be located
  169. * on the filesystem during processing.
  170. * @throws java.io.IOException Thrown if the filesystem encounters any
  171. * problems during processing.
  172. * @throws java.lang.IllegalArgumentException Thrown if the values passed
  173. * to the strSource parameter refers to a file or folder that does not
  174. * exist or if the value passed to the strDestination paramater refers
  175. * to a folder that does not exist or simply does not refer to a
  176. * folder.
  177. */
  178. public void convertExcelToCSV(String strSource, String strDestination)
  179. throws FileNotFoundException, IOException, IllegalArgumentException {
  180. // Simply chain the call to the overloaded convertExcelToCSV(String,
  181. // String, String, int) method, pass the default separator and ensure
  182. // that certain embedded characters are escaped in accordance with
  183. // Excel's formatting conventions
  184. this.convertExcelToCSV(strSource, strDestination,
  185. ToCSV.DEFAULT_SEPARATOR, ToCSV.EXCEL_STYLE_ESCAPING);
  186. }
  187. /**
  188. * Process the contents of a folder, convert the contents of each Excel
  189. * workbook into CSV format and save the resulting file to the specified
  190. * folder using the same name as the original workbook with the .xls or
  191. * .xlsx extension replaced by .csv. This method allows the client to
  192. * define the field separator but will ensure that embedded characters such
  193. * as the field separator, the EOL and double quotes are escaped in
  194. * accordance with Excel's convention.
  195. *
  196. * @param strSource An instance of the String class that encapsulates the
  197. * name of and path to either a folder containing those Excel
  198. * workbook(s) or the name of and path to an individual Excel workbook
  199. * that is/are to be converted.
  200. * @param strDestination An instance of the String class encapsulating the
  201. * name of and path to a folder that will contain the resulting CSV
  202. * files.
  203. * @param separator An instance of the String class that encapsulates the
  204. * character or characters the client wishes to use as the field
  205. * separator.
  206. * @throws java.io.FileNotFoundException Thrown if any file cannot be located
  207. * on the filesystem during processing.
  208. * @throws java.io.IOException Thrown if the filesystem encounters any
  209. * problems during processing.
  210. * @throws java.lang.IllegalArgumentException Thrown if the values passed
  211. * to the strSource parameter refers to a file or folder that does not
  212. * exist or if the value passed to the strDestination paramater refers
  213. * to a folder that does not exist or simply does not refer to a
  214. * folder.
  215. */
  216. public void convertExcelToCSV(String strSource, String strDestination,
  217. String separator)
  218. throws FileNotFoundException, IOException,
  219. IllegalArgumentException {
  220. // Simply chain the call to the overloaded convertExcelToCSV(String,
  221. // String, String, int) method and ensure that certain embedded
  222. // characters are escaped in accordance with Excel's formatting
  223. // conventions
  224. this.convertExcelToCSV(strSource, strDestination,
  225. separator, ToCSV.EXCEL_STYLE_ESCAPING);
  226. }
  227. /**
  228. * Process the contents of a folder, convert the contents of each Excel
  229. * workbook into CSV format and save the resulting file to the specified
  230. * folder using the same name as the original workbook with the .xls or
  231. * .xlsx extension replaced by .csv
  232. *
  233. * @param strSource An instance of the String class that encapsulates the
  234. * name of and path to either a folder containing those Excel
  235. * workbook(s) or the name of and path to an individual Excel workbook
  236. * that is/are to be converted.
  237. * @param strDestination An instance of the String class encapsulating the name
  238. * of and path to a folder that will contain the resulting CSV files.
  239. * @param formattingConvention A primitive int whose value will determine
  240. * whether certain embedded characters should be escaped in accordance
  241. * with Excel's or UNIX formatting conventions. Two constants are
  242. * defined to support this option; ToCSV.EXCEL_STYLE_ESCAPING and
  243. * ToCSV.UNIX_STYLE_ESCAPING
  244. * @param separator An instance of the String class encapsulating the
  245. * characters or characters that should be used to separate items
  246. * on a line within the CSV file.
  247. * @throws java.io.FileNotFoundException Thrown if any file cannot be located
  248. * on the filesystem during processing.
  249. * @throws java.io.IOException Thrown if the filesystem encounters any
  250. * problems during processing.
  251. * @throws java.lang.IllegalArgumentException Thrown if the values passed
  252. * to the strSource parameter refers to a file or folder that does not
  253. * exist, if the value passed to the strDestination paramater refers
  254. * to a folder that does not exist, if the value passed to the
  255. * strDestination parameter does not refer to a folder or if the
  256. * value passed to the formattingConvention parameter is other than
  257. * one of the values defined by the constants ToCSV.EXCEL_STYLE_ESCAPING
  258. * and ToCSV.UNIX_STYLE_ESCAPING.
  259. */
  260. public void convertExcelToCSV(String strSource, String strDestination,
  261. String separator, int formattingConvention)
  262. throws FileNotFoundException, IOException,
  263. IllegalArgumentException {
  264. // Check that the source file/folder exists.
  265. File source = new File(strSource);
  266. if(!source.exists()) {
  267. throw new IllegalArgumentException("The source for the Excel " +
  268. "file(s) cannot be found at " + source);
  269. }
  270. // Ensure thaat the folder the user has chosen to save the CSV files
  271. // away into firstly exists and secondly is a folder rather than, for
  272. // instance, a data file.
  273. File destination = new File(strDestination);
  274. if(!destination.exists()) {
  275. throw new IllegalArgumentException("The destination directory " + destination + " for the " +
  276. "converted CSV file(s) does not exist.");
  277. }
  278. if(!destination.isDirectory()) {
  279. throw new IllegalArgumentException("The destination " + destination + " for the CSV " +
  280. "file(s) is not a directory/folder.");
  281. }
  282. // Ensure the value passed to the formattingConvention parameter is
  283. // within range.
  284. if(formattingConvention != ToCSV.EXCEL_STYLE_ESCAPING &&
  285. formattingConvention != ToCSV.UNIX_STYLE_ESCAPING) {
  286. throw new IllegalArgumentException("The value passed to the " +
  287. "formattingConvention parameter is out of range: " + formattingConvention + ", expecting one of " +
  288. ToCSV.EXCEL_STYLE_ESCAPING + " or " + ToCSV.UNIX_STYLE_ESCAPING);
  289. }
  290. // Copy the spearator character and formatting convention into local
  291. // variables for use in other methods.
  292. this.separator = separator;
  293. this.formattingConvention = formattingConvention;
  294. // Check to see if the sourceFolder variable holds a reference to
  295. // a file or a folder full of files.
  296. final File[] filesList;
  297. if(source.isDirectory()) {
  298. // Get a list of all of the Excel spreadsheet files (workbooks) in
  299. // the source folder/directory
  300. filesList = source.listFiles(new ExcelFilenameFilter());
  301. }
  302. else {
  303. // Assume that it must be a file handle - although there are other
  304. // options the code should perhaps check - and store the reference
  305. // into the filesList variable.
  306. filesList = new File[]{source};
  307. }
  308. // Step through each of the files in the source folder and for each
  309. // open the workbook, convert its contents to CSV format and then
  310. // save the resulting file away into the folder specified by the
  311. // contents of the destination variable. Note that the name of the
  312. // csv file will be created by taking the name of the Excel file,
  313. // removing the extension and replacing it with .csv. Note that there
  314. // is one drawback with this approach; if the folder holding the files
  315. // contains two workbooks whose names match but one is a binary file
  316. // (.xls) and the other a SpreadsheetML file (.xlsx), then the names
  317. // for both CSV files will be identical and one CSV file will,
  318. // therefore, over-write the other.
  319. if (filesList != null) {
  320. for(File excelFile : filesList) {
  321. // Open the workbook
  322. this.openWorkbook(excelFile);
  323. // Convert its contents into a CSV file
  324. this.convertToCSV();
  325. // Build the name of the csv folder from that of the Excel workbook.
  326. // Simply replace the .xls or .xlsx file extension with .csv
  327. String destinationFilename = excelFile.getName();
  328. destinationFilename = destinationFilename.substring(
  329. 0, destinationFilename.lastIndexOf('.')) +
  330. ToCSV.CSV_FILE_EXTENSION;
  331. // Save the CSV file away using the newly constricted file name
  332. // and to the specified directory.
  333. this.saveCSVFile(new File(destination, destinationFilename));
  334. }
  335. }
  336. }
  337. /**
  338. * Open an Excel workbook ready for conversion.
  339. *
  340. * @param file An instance of the File class that encapsulates a handle
  341. * to a valid Excel workbook. Note that the workbook can be in
  342. * either binary (.xls) or SpreadsheetML (.xlsx) format.
  343. * @throws java.io.FileNotFoundException Thrown if the file cannot be located.
  344. * @throws java.io.IOException Thrown if a problem occurs in the file system.
  345. */
  346. private void openWorkbook(File file) throws FileNotFoundException, IOException {
  347. System.out.println("Opening workbook [" + file.getName() + "]");
  348. try (FileInputStream fis = new FileInputStream(file)) {
  349. // Open the workbook and then create the FormulaEvaluator and
  350. // DataFormatter instances that will be needed to, respectively,
  351. // force evaluation of forumlae found in cells and create a
  352. // formatted String encapsulating the cells contents.
  353. this.workbook = WorkbookFactory.create(fis);
  354. this.evaluator = this.workbook.getCreationHelper().createFormulaEvaluator();
  355. this.formatter = new DataFormatter(true);
  356. }
  357. }
  358. /**
  359. * Called to convert the contents of the currently opened workbook into
  360. * a CSV file.
  361. */
  362. private void convertToCSV() {
  363. Sheet sheet;
  364. Row row;
  365. int lastRowNum;
  366. this.csvData = new ArrayList<>();
  367. System.out.println("Converting files contents to CSV format.");
  368. // Discover how many sheets there are in the workbook....
  369. int numSheets = this.workbook.getNumberOfSheets();
  370. // and then iterate through them.
  371. for(int i = 0; i < numSheets; i++) {
  372. // Get a reference to a sheet and check to see if it contains
  373. // any rows.
  374. sheet = this.workbook.getSheetAt(i);
  375. if(sheet.getPhysicalNumberOfRows() > 0) {
  376. // Note down the index number of the bottom-most row and
  377. // then iterate through all of the rows on the sheet starting
  378. // from the very first row - number 1 - even if it is missing.
  379. // Recover a reference to the row and then call another method
  380. // which will strip the data from the cells and build lines
  381. // for inclusion in the resylting CSV file.
  382. lastRowNum = sheet.getLastRowNum();
  383. for(int j = 0; j <= lastRowNum; j++) {
  384. row = sheet.getRow(j);
  385. this.rowToCSV(row);
  386. }
  387. }
  388. }
  389. }
  390. /**
  391. * Called to actually save the data recovered from the Excel workbook
  392. * as a CSV file.
  393. *
  394. * @param file An instance of the File class that encapsulates a handle
  395. * referring to the CSV file.
  396. * @throws java.io.IOException Thrown to indicate and error occurred in the
  397. * underylying file system.
  398. */
  399. private void saveCSVFile(File file) throws IOException {
  400. ArrayList<String> line;
  401. StringBuilder buffer;
  402. String csvLineElement;
  403. // Open a writer onto the CSV file.
  404. try (BufferedWriter bw = Files.newBufferedWriter(file.toPath(), StandardCharsets.ISO_8859_1)) {
  405. System.out.println("Saving the CSV file [" + file.getName() + "]");
  406. // Step through the elements of the ArrayList that was used to hold
  407. // all of the data recovered from the Excel workbooks' sheets, rows
  408. // and cells.
  409. for(int i = 0; i < this.csvData.size(); i++) {
  410. buffer = new StringBuilder();
  411. // Get an element from the ArrayList that contains the data for
  412. // the workbook. This element will itself be an ArrayList
  413. // containing Strings and each String will hold the data recovered
  414. // from a single cell. The for() loop is used to recover elements
  415. // from this 'row' ArrayList one at a time and to write the Strings
  416. // away to a StringBuilder thus assembling a single line for inclusion
  417. // in the CSV file. If a row was empty or if it was short, then
  418. // the ArrayList that contains its data will also be shorter than
  419. // some of the others. Therefore, it is necessary to check within
  420. // the for loop to ensure that the ArrayList contains data to be
  421. // processed. If it does, then an element will be recovered and
  422. // appended to the StringBuilder.
  423. line = this.csvData.get(i);
  424. for(int j = 0; j < this.maxRowWidth; j++) {
  425. if(line.size() > j) {
  426. csvLineElement = line.get(j);
  427. if(csvLineElement != null) {
  428. buffer.append(this.escapeEmbeddedCharacters(
  429. csvLineElement));
  430. }
  431. }
  432. if(j < (this.maxRowWidth - 1)) {
  433. buffer.append(this.separator);
  434. }
  435. }
  436. // Once the line is built, write it away to the CSV file.
  437. bw.write(buffer.toString().trim());
  438. // Condition the inclusion of new line characters so as to
  439. // avoid an additional, superfluous, new line at the end of
  440. // the file.
  441. if(i < (this.csvData.size() - 1)) {
  442. bw.newLine();
  443. }
  444. }
  445. }
  446. }
  447. /**
  448. * Called to convert a row of cells into a line of data that can later be
  449. * output to the CSV file.
  450. *
  451. * @param row An instance of either the HSSFRow or XSSFRow classes that
  452. * encapsulates information about a row of cells recovered from
  453. * an Excel workbook.
  454. */
  455. private void rowToCSV(Row row) {
  456. Cell cell;
  457. int lastCellNum;
  458. ArrayList<String> csvLine = new ArrayList<>();
  459. // Check to ensure that a row was recovered from the sheet as it is
  460. // possible that one or more rows between other populated rows could be
  461. // missing - blank. If the row does contain cells then...
  462. if(row != null) {
  463. // Get the index for the right most cell on the row and then
  464. // step along the row from left to right recovering the contents
  465. // of each cell, converting that into a formatted String and
  466. // then storing the String into the csvLine ArrayList.
  467. lastCellNum = row.getLastCellNum();
  468. for(int i = 0; i <= lastCellNum; i++) {
  469. cell = row.getCell(i);
  470. if(cell == null) {
  471. csvLine.add("");
  472. }
  473. else {
  474. if(cell.getCellType() != CellType.FORMULA) {
  475. csvLine.add(this.formatter.formatCellValue(cell));
  476. }
  477. else {
  478. csvLine.add(this.formatter.formatCellValue(cell, this.evaluator));
  479. }
  480. }
  481. }
  482. // Make a note of the index number of the right most cell. This value
  483. // will later be used to ensure that the matrix of data in the CSV file
  484. // is square.
  485. if(lastCellNum > this.maxRowWidth) {
  486. this.maxRowWidth = lastCellNum;
  487. }
  488. }
  489. this.csvData.add(csvLine);
  490. }
  491. /**
  492. * Checks to see whether the field - which consists of the formatted
  493. * contents of an Excel worksheet cell encapsulated within a String - contains
  494. * any embedded characters that must be escaped. The method is able to
  495. * comply with either Excel's or UNIX formatting conventions in the
  496. * following manner;
  497. *
  498. * With regard to UNIX conventions, if the field contains any embedded
  499. * field separator or EOL characters they will each be escaped by prefixing
  500. * a leading backspace character. These are the only changes that have yet
  501. * emerged following some research as being required.
  502. *
  503. * Excel has other embedded character escaping requirements, some that emerged
  504. * from empirical testing, other through research. Firstly, with regards to
  505. * any embedded speech marks ("), each occurrence should be escaped with
  506. * another speech mark and the whole field then surrounded with speech marks.
  507. * Thus if a field holds <em>"Hello" he said</em> then it should be modified
  508. * to appear as <em>"""Hello"" he said"</em>. Furthermore, if the field
  509. * contains either embedded separator or EOL characters, it should also
  510. * be surrounded with speech marks. As a result <em>1,400</em> would become
  511. * <em>"1,400"</em> assuming that the comma is the required field separator.
  512. * This has one consequence in, if a field contains embedded speech marks
  513. * and embedded separator characters, checks for both are not required as the
  514. * additional set of speech marks that should be placed around ay field
  515. * containing embedded speech marks will also account for the embedded
  516. * separator.
  517. *
  518. * It is worth making one further note with regard to embedded EOL
  519. * characters. If the data in a worksheet is exported as a CSV file using
  520. * Excel itself, then the field will be surounded with speech marks. If the
  521. * resulting CSV file is then re-imports into another worksheet, the EOL
  522. * character will result in the original simgle field occupying more than
  523. * one cell. This same 'feature' is replicated in this classes behaviour.
  524. *
  525. * @param field An instance of the String class encapsulating the formatted
  526. * contents of a cell on an Excel worksheet.
  527. * @return A String that encapsulates the formatted contents of that
  528. * Excel worksheet cell but with any embedded separator, EOL or
  529. * speech mark characters correctly escaped.
  530. */
  531. private String escapeEmbeddedCharacters(String field) {
  532. StringBuilder buffer;
  533. // If the fields contents should be formatted to confrom with Excel's
  534. // convention....
  535. if(this.formattingConvention == ToCSV.EXCEL_STYLE_ESCAPING) {
  536. // Firstly, check if there are any speech marks (") in the field;
  537. // each occurrence must be escaped with another set of spech marks
  538. // and then the entire field should be enclosed within another
  539. // set of speech marks. Thus, "Yes" he said would become
  540. // """Yes"" he said"
  541. if(field.contains("\"")) {
  542. buffer = new StringBuilder(field.replace("\"", "\\\"\\\""));
  543. buffer.insert(0, "\"");
  544. buffer.append("\"");
  545. }
  546. else {
  547. // If the field contains either embedded separator or EOL
  548. // characters, then escape the whole field by surrounding it
  549. // with speech marks.
  550. buffer = new StringBuilder(field);
  551. if((buffer.indexOf(this.separator)) > -1 ||
  552. (buffer.indexOf("\n")) > -1) {
  553. buffer.insert(0, "\"");
  554. buffer.append("\"");
  555. }
  556. }
  557. return(buffer.toString().trim());
  558. }
  559. // The only other formatting convention this class obeys is the UNIX one
  560. // where any occurrence of the field separator or EOL character will
  561. // be escaped by preceding it with a backslash.
  562. else {
  563. if(field.contains(this.separator)) {
  564. field = field.replaceAll(this.separator, ("\\\\" + this.separator));
  565. }
  566. if(field.contains("\n")) {
  567. field = field.replace("\n", "\\\\\n");
  568. }
  569. return(field);
  570. }
  571. }
  572. /**
  573. * The main() method contains code that demonstrates how to use the class.
  574. *
  575. * @param args An array containing zero, one or more elements all of type
  576. * String. Each element will encapsulate an argument specified by the
  577. * user when running the program from the command prompt.
  578. */
  579. public static void main(String[] args) {
  580. // Check the number of arguments passed to the main method. There
  581. // must be two, three or four; the name of and path to either the folder
  582. // containing the Excel files or an individual Excel workbook that is/are
  583. // to be converted, the name of and path to the folder to which the CSV
  584. // files should be written, - optionally - the separator character
  585. // that should be used to separate individual items (fields) on the
  586. // lines (records) of the CSV file and - again optionally - an integer
  587. // that idicates whether the CSV file ought to obey Excel's or UNIX
  588. // convnetions with regard to formatting fields that contain embedded
  589. // separator, Speech mark or EOL character(s).
  590. //
  591. // Note that the names of the CSV files will be derived from those
  592. // of the Excel file(s). Put simply the .xls or .xlsx extension will be
  593. // replaced with .csv. Therefore, if the source folder contains files
  594. // with matching names but different extensions - Test.xls and Test.xlsx
  595. // for example - then the CSV file generated from one will overwrite
  596. // that generated from the other.
  597. ToCSV converter;
  598. boolean converted = true;
  599. long startTime = System.currentTimeMillis();
  600. try {
  601. converter = new ToCSV();
  602. if(args.length == 2) {
  603. // Just the Source File/Folder and Destination Folder were
  604. // passed to the main method.
  605. converter.convertExcelToCSV(args[0], args[1]);
  606. }
  607. else if(args.length == 3) {
  608. // The Source File/Folder, Destination Folder and Separator
  609. // were passed to the main method.
  610. converter.convertExcelToCSV(args[0], args[1], args[2]);
  611. }
  612. else if(args.length == 4) {
  613. // The Source File/Folder, Destination Folder, Separator and
  614. // Formatting Convention were passed to the main method.
  615. converter.convertExcelToCSV(args[0], args[1],
  616. args[2], Integer.parseInt(args[3]));
  617. }
  618. else {
  619. // None or more than four parameters were passed so display
  620. //a Usage message.
  621. System.out.println("Usage: java ToCSV [Source File/Folder] " +
  622. "[Destination Folder] [Separator] [Formatting Convention]\n" +
  623. "\tSource File/Folder\tThis argument should contain the name of and\n" +
  624. "\t\t\t\tpath to either a single Excel workbook or a\n" +
  625. "\t\t\t\tfolder containing one or more Excel workbooks.\n" +
  626. "\tDestination Folder\tThe name of and path to the folder that the\n" +
  627. "\t\t\t\tCSV files should be written out into. The\n" +
  628. "\t\t\t\tfolder must exist before running the ToCSV\n" +
  629. "\t\t\t\tcode as it will not check for or create it.\n" +
  630. "\tSeparator\t\tOptional. The character or characters that\n" +
  631. "\t\t\t\tshould be used to separate fields in the CSV\n" +
  632. "\t\t\t\trecord. If no value is passed then the comma\n" +
  633. "\t\t\t\twill be assumed.\n" +
  634. "\tFormatting Convention\tOptional. This argument can take one of two\n" +
  635. "\t\t\t\tvalues. Passing 0 (zero) will result in a CSV\n" +
  636. "\t\t\t\tfile that obeys Excel's formatting conventions\n" +
  637. "\t\t\t\twhilst passing 1 (one) will result in a file\n" +
  638. "\t\t\t\tthat obeys UNIX formatting conventions. If no\n" +
  639. "\t\t\t\tvalue is passed, then the CSV file produced\n" +
  640. "\t\t\t\twill obey Excel's formatting conventions.");
  641. converted = false;
  642. }
  643. }
  644. // It is not wise to have such a wide catch clause - Exception is very
  645. // close to being at the top of the inheritance hierarchy - though it
  646. // will suffice for this example as it is really not possible to recover
  647. // easily from an exceptional set of circumstances at this point in the
  648. // program. It should however, ideally be replaced with one or more
  649. // catch clauses optimised to handle more specific problems.
  650. catch(Exception ex) {
  651. LOG.atWarn().withThrowable(ex).log("Unexpected exception");
  652. converted = false;
  653. }
  654. if (converted) {
  655. System.out.println("Conversion took " +
  656. ((System.currentTimeMillis() - startTime)/1000) + " seconds");
  657. }
  658. }
  659. /**
  660. * An instance of this class can be used to control the files returned
  661. * be a call to the listFiles() method when made on an instance of the
  662. * File class and that object refers to a folder/directory
  663. */
  664. static class ExcelFilenameFilter implements FilenameFilter {
  665. /**
  666. * Determine those files that will be returned by a call to the
  667. * listFiles() method. In this case, the name of the file must end with
  668. * either of the following two extension; '.xls' or '.xlsx'. For the
  669. * future, it is very possible to parameterise this and allow the
  670. * containing class to pass, for example, an array of Strings to this
  671. * class on instantiation. Each element in that array could encapsulate
  672. * a valid file extension - '.xls', '.xlsx', '.xlt', '.xlst', etc. These
  673. * could then be used to control which files were returned by the call
  674. * to the listFiles() method.
  675. *
  676. * @param file An instance of the File class that encapsulates a handle
  677. * referring to the folder/directory that contains the file.
  678. * @param name An instance of the String class that encapsulates the
  679. * name of the file.
  680. * @return A boolean value that indicates whether the file should be
  681. * included in the array retirned by the call to the listFiles()
  682. * method. In this case true will be returned if the name of the
  683. * file ends with either '.xls' or '.xlsx' and false will be
  684. * returned in all other instances.
  685. */
  686. @Override
  687. public boolean accept(File file, String name) {
  688. return(name.endsWith(".xls") || name.endsWith(".xlsx"));
  689. }
  690. }
  691. }