You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

ToCSV.java 37KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740
  1. /* ====================================================================
  2. Licensed to the Apache Software Foundation (ASF) under one or more
  3. contributor license agreements. See the NOTICE file distributed with
  4. this work for additional information regarding copyright ownership.
  5. The ASF licenses this file to You under the Apache License, Version 2.0
  6. (the "License"); you may not use this file except in compliance with
  7. the License. You may obtain a copy of the License at
  8. http://www.apache.org/licenses/LICENSE-2.0
  9. Unless required by applicable law or agreed to in writing, software
  10. distributed under the License is distributed on an "AS IS" BASIS,
  11. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. See the License for the specific language governing permissions and
  13. limitations under the License.
  14. ==================================================================== */
  15. package org.apache.poi.examples.ss;
  16. import java.io.BufferedWriter;
  17. import java.io.File;
  18. import java.io.FileInputStream;
  19. import java.io.FileNotFoundException;
  20. import java.io.FilenameFilter;
  21. import java.io.IOException;
  22. import java.nio.charset.StandardCharsets;
  23. import java.nio.file.Files;
  24. import java.util.ArrayList;
  25. import org.apache.poi.ss.usermodel.Cell;
  26. import org.apache.poi.ss.usermodel.CellType;
  27. import org.apache.poi.ss.usermodel.DataFormatter;
  28. import org.apache.poi.ss.usermodel.FormulaEvaluator;
  29. import org.apache.poi.ss.usermodel.Row;
  30. import org.apache.poi.ss.usermodel.Sheet;
  31. import org.apache.poi.ss.usermodel.Workbook;
  32. import org.apache.poi.ss.usermodel.WorkbookFactory;
  33. /**
  34. * Demonstrates <em>one</em> way to convert an Excel spreadsheet into a CSV
  35. * file. This class makes the following assumptions;
  36. * <list>
  37. * <li>1. Where the Excel workbook contains more than one worksheet, then a single
  38. * CSV file will contain the data from all of the worksheets.</li>
  39. * <li>2. The data matrix contained in the CSV file will be square. This means that
  40. * the number of fields in each record of the CSV file will match the number
  41. * of cells in the longest row found in the Excel workbook. Any short records
  42. * will be 'padded' with empty fields - an empty field is represented in
  43. * the CSV file in this way - ,,.</li>
  44. * <li>3. Empty fields will represent missing cells.</li>
  45. * <li>4. A record consisting of empty fields will be used to represent an empty row
  46. * in the Excel workbook.</li>
  47. * </list>
  48. * Therefore, if the worksheet looked like this;
  49. *
  50. * <pre>
  51. * ___________________________________________
  52. * | | | | | |
  53. * | A | B | C | D | E |
  54. * ___|_______|_______|_______|_______|_______|
  55. * | | | | | |
  56. * 1 | 1 | 2 | 3 | 4 | 5 |
  57. * ___|_______|_______|_______|_______|_______|
  58. * | | | | | |
  59. * 2 | | | | | |
  60. * ___|_______|_______|_______|_______|_______|
  61. * | | | | | |
  62. * 3 | | A | | B | |
  63. * ___|_______|_______|_______|_______|_______|
  64. * | | | | | |
  65. * 4 | | | | | Z |
  66. * ___|_______|_______|_______|_______|_______|
  67. * | | | | | |
  68. * 5 | 1,400 | | 250 | | |
  69. * ___|_______|_______|_______|_______|_______|
  70. *
  71. * </pre>
  72. *
  73. * Then, the resulting CSV file will contain the following lines (records);
  74. * <pre>
  75. * 1,2,3,4,5
  76. * ,,,,
  77. * ,A,,B,
  78. * ,,,,Z
  79. * "1,400",,250,,
  80. * </pre><p>
  81. * Typically, the comma is used to separate each of the fields that, together,
  82. * constitute a single record or line within the CSV file. This is not however
  83. * a hard and fast rule and so this class allows the user to determine which
  84. * character is used as the field separator and assumes the comma if none other
  85. * is specified.
  86. * </p><p>
  87. * If a field contains the separator then it will be escaped. If the file should
  88. * obey Excel's CSV formatting rules, then the field will be surrounded with
  89. * speech marks whilst if it should obey UNIX conventions, each occurrence of
  90. * the separator will be preceded by the backslash character.
  91. * </p><p>
  92. * If a field contains an end of line (EOL) character then it too will be
  93. * escaped. If the file should obey Excel's CSV formatting rules then the field
  94. * will again be surrounded by speech marks. On the other hand, if the file
  95. * should follow UNIX conventions then a single backslash will precede the
  96. * EOL character. There is no single applicable standard for UNIX and some
  97. * applications replace the CR with \r and the LF with \n but this class will
  98. * not do so.
  99. * </p><p>
  100. * If the field contains double quotes then that character will be escaped. It
  101. * seems as though UNIX does not define a standard for this whilst Excel does.
  102. * Should the CSV file have to obey Excel's formatting rules then the speech
  103. * mark character will be escaped with a second set of speech marks. Finally, an
  104. * enclosing set of speech marks will also surround the entire field. Thus, if
  105. * the following line of text appeared in a cell - "Hello" he said - it would
  106. * look like this when converted into a field within a CSV file - """Hello"" he
  107. * said".
  108. * </p><p>
  109. * Finally, it is worth noting that talk of CSV 'standards' is really slightly
  110. * misleading as there is no such thing. It may well be that the code in this
  111. * class has to be modified to produce files to suit a specific application
  112. * or requirement.
  113. * </p>
  114. * @author Mark B
  115. * @version 1.00 9th April 2010
  116. * 1.10 13th April 2010 - Added support for processing all Excel
  117. * workbooks in a folder along with the ability
  118. * to specify a field separator character.
  119. * 2.00 14th April 2010 - Added support for embedded characters; the
  120. * field separator, EOL and double quotes or
  121. * speech marks. In addition, gave the client
  122. * the ability to select how these are handled,
  123. * either obeying Excel's or UNIX formatting
  124. * conventions.
  125. */
  126. @SuppressWarnings({"java:S106","java:S4823","java:S1192"})
  127. public class ToCSV {
  128. private Workbook workbook; // Workbook currently being converted; assigned in openWorkbook().
  129. private ArrayList<ArrayList<String>> csvData; // One inner list of field Strings per row read; rebuilt by convertToCSV().
  130. private int maxRowWidth; // Largest getLastCellNum() value seen by rowToCSV(); the number of fields written per record.
  131. private int formattingConvention; // EXCEL_STYLE_ESCAPING or UNIX_STYLE_ESCAPING, as passed to convertExcelToCSV().
  132. private DataFormatter formatter; // Renders cell contents as Strings; created in openWorkbook().
  133. private FormulaEvaluator evaluator; // Evaluates formula cells of the open workbook; created in openWorkbook().
  134. private String separator; // Text written between the fields of each record.
  135. private static final String CSV_FILE_EXTENSION = ".csv"; // Extension given to every generated file.
  136. private static final String DEFAULT_SEPARATOR = ","; // Separator used when the caller does not supply one.
  137. /**
  138. * Identifies that the CSV file should obey Excel's formatting conventions
  139. * with regard to escaping certain embedded characters - the field separator,
  140. * speech mark and end of line (EOL) character
  141. */
  142. public static final int EXCEL_STYLE_ESCAPING = 0;
  143. /**
  144. * Identifies that the CSV file should obey UNIX formatting conventions
  145. * with regard to escaping certain embedded characters - the field separator
  146. * and end of line (EOL) character
  147. */
  148. public static final int UNIX_STYLE_ESCAPING = 1;
  149. /**
  150. * Process the contents of a folder, convert the contents of each Excel
  151. * workbook into CSV format and save the resulting file to the specified
  152. * folder using the same name as the original workbook with the .xls or
  153. * .xlsx extension replaced by .csv. This method will ensure that the
  154. * CSV file created contains the comma field separator and that embedded
  155. * characters such as the field separator, the EOL and double quotes are
  156. * escaped in accordance with Excel's convention.
  157. *
  158. * @param strSource An instance of the String class that encapsulates the
  159. * name of and path to either a folder containing those Excel
  160. * workbook(s) or the name of and path to an individual Excel workbook
  161. * that is/are to be converted.
  162. * @param strDestination An instance of the String class encapsulating the
  163. * name of and path to a folder that will contain the resulting CSV
  164. * files.
  165. * @throws java.io.FileNotFoundException Thrown if any file cannot be located
  166. * on the filesystem during processing.
  167. * @throws java.io.IOException Thrown if the filesystem encounters any
  168. * problems during processing.
  169. * @throws java.lang.IllegalArgumentException Thrown if the values passed
  170. * to the strSource parameter refers to a file or folder that does not
  171. * exist or if the value passed to the strDestination parameter refers
  172. * to a folder that does not exist or simply does not refer to a
  173. * folder.
  174. */
  175. public void convertExcelToCSV(String strSource, String strDestination)
  176. throws FileNotFoundException, IOException, IllegalArgumentException {
  177. // Simply chain the call to the overloaded convertExcelToCSV(String,
  178. // String, String, int) method, pass the default separator and ensure
  179. // that certain embedded characters are escaped in accordance with
  180. // Excel's formatting conventions
  181. this.convertExcelToCSV(strSource, strDestination,
  182. ToCSV.DEFAULT_SEPARATOR, ToCSV.EXCEL_STYLE_ESCAPING);
  183. }
  184. /**
  185. * Process the contents of a folder, convert the contents of each Excel
  186. * workbook into CSV format and save the resulting file to the specified
  187. * folder using the same name as the original workbook with the .xls or
  188. * .xlsx extension replaced by .csv. This method allows the client to
  189. * define the field separator but will ensure that embedded characters such
  190. * as the field separator, the EOL and double quotes are escaped in
  191. * accordance with Excel's convention.
  192. *
  193. * @param strSource An instance of the String class that encapsulates the
  194. * name of and path to either a folder containing those Excel
  195. * workbook(s) or the name of and path to an individual Excel workbook
  196. * that is/are to be converted.
  197. * @param strDestination An instance of the String class encapsulating the
  198. * name of and path to a folder that will contain the resulting CSV
  199. * files.
  200. * @param separator An instance of the String class that encapsulates the
  201. * character or characters the client wishes to use as the field
  202. * separator.
  203. * @throws java.io.FileNotFoundException Thrown if any file cannot be located
  204. * on the filesystem during processing.
  205. * @throws java.io.IOException Thrown if the filesystem encounters any
  206. * problems during processing.
  207. * @throws java.lang.IllegalArgumentException Thrown if the values passed
  208. * to the strSource parameter refers to a file or folder that does not
  209. * exist or if the value passed to the strDestination parameter refers
  210. * to a folder that does not exist or simply does not refer to a
  211. * folder.
  212. */
  213. public void convertExcelToCSV(String strSource, String strDestination,
  214. String separator)
  215. throws FileNotFoundException, IOException,
  216. IllegalArgumentException {
  217. // Simply chain the call to the overloaded convertExcelToCSV(String,
  218. // String, String, int) method and ensure that certain embedded
  219. // characters are escaped in accordance with Excel's formatting
  220. // conventions
  221. this.convertExcelToCSV(strSource, strDestination,
  222. separator, ToCSV.EXCEL_STYLE_ESCAPING);
  223. }
  224. /**
  225. * Process the contents of a folder, convert the contents of each Excel
  226. * workbook into CSV format and save the resulting file to the specified
  227. * folder using the same name as the original workbook with the .xls or
  228. * .xlsx extension replaced by .csv
  229. *
  230. * @param strSource An instance of the String class that encapsulates the
  231. * name of and path to either a folder containing those Excel
  232. * workbook(s) or the name of and path to an individual Excel workbook
  233. * that is/are to be converted.
  234. * @param strDestination An instance of the String class encapsulating the name
  235. * of and path to a folder that will contain the resulting CSV files.
  236. * @param formattingConvention A primitive int whose value will determine
  237. * whether certain embedded characters should be escaped in accordance
  238. * with Excel's or UNIX formatting conventions. Two constants are
  239. * defined to support this option; ToCSV.EXCEL_STYLE_ESCAPING and
  240. * ToCSV.UNIX_STYLE_ESCAPING
  241. * @param separator An instance of the String class encapsulating the
  242. * characters or characters that should be used to separate items
  243. * on a line within the CSV file.
  244. * @throws java.io.FileNotFoundException Thrown if any file cannot be located
  245. * on the filesystem during processing.
  246. * @throws java.io.IOException Thrown if the filesystem encounters any
  247. * problems during processing.
  248. * @throws java.lang.IllegalArgumentException Thrown if the values passed
  249. * to the strSource parameter refers to a file or folder that does not
  250. * exist, if the value passed to the strDestination parameter refers
  251. * to a folder that does not exist, if the value passed to the
  252. * strDestination parameter does not refer to a folder or if the
  253. * value passed to the formattingConvention parameter is other than
  254. * one of the values defined by the constants ToCSV.EXCEL_STYLE_ESCAPING
  255. * and ToCSV.UNIX_STYLE_ESCAPING.
  256. */
  257. public void convertExcelToCSV(String strSource, String strDestination,
  258. String separator, int formattingConvention)
  259. throws FileNotFoundException, IOException,
  260. IllegalArgumentException {
  261. // Check that the source file/folder exists.
  262. File source = new File(strSource);
  263. if(!source.exists()) {
  264. throw new IllegalArgumentException("The source for the Excel " +
  265. "file(s) cannot be found at " + source);
  266. }
  267. // Ensure thaat the folder the user has chosen to save the CSV files
  268. // away into firstly exists and secondly is a folder rather than, for
  269. // instance, a data file.
  270. File destination = new File(strDestination);
  271. if(!destination.exists()) {
  272. throw new IllegalArgumentException("The destination directory " + destination + " for the " +
  273. "converted CSV file(s) does not exist.");
  274. }
  275. if(!destination.isDirectory()) {
  276. throw new IllegalArgumentException("The destination " + destination + " for the CSV " +
  277. "file(s) is not a directory/folder.");
  278. }
  279. // Ensure the value passed to the formattingConvention parameter is
  280. // within range.
  281. if(formattingConvention != ToCSV.EXCEL_STYLE_ESCAPING &&
  282. formattingConvention != ToCSV.UNIX_STYLE_ESCAPING) {
  283. throw new IllegalArgumentException("The value passed to the " +
  284. "formattingConvention parameter is out of range: " + formattingConvention + ", expecting one of " +
  285. ToCSV.EXCEL_STYLE_ESCAPING + " or " + ToCSV.UNIX_STYLE_ESCAPING);
  286. }
  287. // Copy the spearator character and formatting convention into local
  288. // variables for use in other methods.
  289. this.separator = separator;
  290. this.formattingConvention = formattingConvention;
  291. // Check to see if the sourceFolder variable holds a reference to
  292. // a file or a folder full of files.
  293. final File[] filesList;
  294. if(source.isDirectory()) {
  295. // Get a list of all of the Excel spreadsheet files (workbooks) in
  296. // the source folder/directory
  297. filesList = source.listFiles(new ExcelFilenameFilter());
  298. }
  299. else {
  300. // Assume that it must be a file handle - although there are other
  301. // options the code should perhaps check - and store the reference
  302. // into the filesList variable.
  303. filesList = new File[]{source};
  304. }
  305. // Step through each of the files in the source folder and for each
  306. // open the workbook, convert it's contents to CSV format and then
  307. // save the resulting file away into the folder specified by the
  308. // contents of the destination variable. Note that the name of the
  309. // csv file will be created by taking the name of the Excel file,
  310. // removing the extension and replacing it with .csv. Note that there
  311. // is one drawback with this approach; if the folder holding the files
  312. // contains two workbooks whose names match but one is a binary file
  313. // (.xls) and the other a SpreadsheetML file (.xlsx), then the names
  314. // for both CSV files will be identical and one CSV file will,
  315. // therefore, over-write the other.
  316. if (filesList != null) {
  317. for(File excelFile : filesList) {
  318. // Open the workbook
  319. this.openWorkbook(excelFile);
  320. // Convert it's contents into a CSV file
  321. this.convertToCSV();
  322. // Build the name of the csv folder from that of the Excel workbook.
  323. // Simply replace the .xls or .xlsx file extension with .csv
  324. String destinationFilename = excelFile.getName();
  325. destinationFilename = destinationFilename.substring(
  326. 0, destinationFilename.lastIndexOf('.')) +
  327. ToCSV.CSV_FILE_EXTENSION;
  328. // Save the CSV file away using the newly constricted file name
  329. // and to the specified directory.
  330. this.saveCSVFile(new File(destination, destinationFilename));
  331. }
  332. }
  333. }
  334. /**
  335. * Open an Excel workbook ready for conversion.
  336. *
  337. * @param file An instance of the File class that encapsulates a handle
  338. * to a valid Excel workbook. Note that the workbook can be in
  339. * either binary (.xls) or SpreadsheetML (.xlsx) format.
  340. * @throws java.io.FileNotFoundException Thrown if the file cannot be located.
  341. * @throws java.io.IOException Thrown if a problem occurs in the file system.
  342. */
  343. private void openWorkbook(File file) throws FileNotFoundException, IOException {
  344. System.out.println("Opening workbook [" + file.getName() + "]");
  345. try (FileInputStream fis = new FileInputStream(file)) {
  346. // Open the workbook and then create the FormulaEvaluator and
  347. // DataFormatter instances that will be needed to, respectively,
  348. // force evaluation of forumlae found in cells and create a
  349. // formatted String encapsulating the cells contents.
  350. this.workbook = WorkbookFactory.create(fis);
  351. this.evaluator = this.workbook.getCreationHelper().createFormulaEvaluator();
  352. this.formatter = new DataFormatter(true);
  353. }
  354. }
  355. /**
  356. * Called to convert the contents of the currently opened workbook into
  357. * a CSV file.
  358. */
  359. private void convertToCSV() {
  360. Sheet sheet;
  361. Row row;
  362. int lastRowNum;
  363. this.csvData = new ArrayList<>();
  364. System.out.println("Converting files contents to CSV format.");
  365. // Discover how many sheets there are in the workbook....
  366. int numSheets = this.workbook.getNumberOfSheets();
  367. // and then iterate through them.
  368. for(int i = 0; i < numSheets; i++) {
  369. // Get a reference to a sheet and check to see if it contains
  370. // any rows.
  371. sheet = this.workbook.getSheetAt(i);
  372. if(sheet.getPhysicalNumberOfRows() > 0) {
  373. // Note down the index number of the bottom-most row and
  374. // then iterate through all of the rows on the sheet starting
  375. // from the very first row - number 1 - even if it is missing.
  376. // Recover a reference to the row and then call another method
  377. // which will strip the data from the cells and build lines
  378. // for inclusion in the resylting CSV file.
  379. lastRowNum = sheet.getLastRowNum();
  380. for(int j = 0; j <= lastRowNum; j++) {
  381. row = sheet.getRow(j);
  382. this.rowToCSV(row);
  383. }
  384. }
  385. }
  386. }
  387. /**
  388. * Called to actually save the data recovered from the Excel workbook
  389. * as a CSV file.
  390. *
  391. * @param file An instance of the File class that encapsulates a handle
  392. * referring to the CSV file.
  393. * @throws java.io.FileNotFoundException Thrown if the file cannot be found.
  394. * @throws java.io.IOException Thrown to indicate an error occurred in the
  395. * underlying file system.
  396. */
  397. private void saveCSVFile(File file) throws FileNotFoundException, IOException {
  398. ArrayList<String> line;
  399. StringBuilder buffer;
  400. String csvLineElement;
  401. // Open a writer onto the CSV file.
  402. try (BufferedWriter bw = Files.newBufferedWriter(file.toPath(), StandardCharsets.ISO_8859_1)) {
  403. System.out.println("Saving the CSV file [" + file.getName() + "]");
  404. // Step through the elements of the ArrayList that was used to hold
  405. // all of the data recovered from the Excel workbooks' sheets, rows
  406. // and cells.
  407. for(int i = 0; i < this.csvData.size(); i++) {
  408. buffer = new StringBuilder();
  409. // Get an element from the ArrayList that contains the data for
  410. // the workbook. This element will itself be an ArrayList
  411. // containing Strings and each String will hold the data recovered
  412. // from a single cell. The for() loop is used to recover elements
  413. // from this 'row' ArrayList one at a time and to write the Strings
  414. // away to a StringBuilder thus assembling a single line for inclusion
  415. // in the CSV file. If a row was empty or if it was short, then
  416. // the ArrayList that contains it's data will also be shorter than
  417. // some of the others. Therefore, it is necessary to check within
  418. // the for loop to ensure that the ArrayList contains data to be
  419. // processed. If it does, then an element will be recovered and
  420. // appended to the StringBuilder.
  421. line = this.csvData.get(i);
  422. for(int j = 0; j < this.maxRowWidth; j++) {
  423. if(line.size() > j) {
  424. csvLineElement = line.get(j);
  425. if(csvLineElement != null) {
  426. buffer.append(this.escapeEmbeddedCharacters(
  427. csvLineElement));
  428. }
  429. }
  430. if(j < (this.maxRowWidth - 1)) {
  431. buffer.append(this.separator);
  432. }
  433. }
  434. // Once the line is built, write it away to the CSV file.
  435. bw.write(buffer.toString().trim());
  436. // Condition the inclusion of new line characters so as to
  437. // avoid an additional, superfluous, new line at the end of
  438. // the file.
  439. if(i < (this.csvData.size() - 1)) {
  440. bw.newLine();
  441. }
  442. }
  443. }
  444. }
  445. /**
  446. * Called to convert a row of cells into a line of data that can later be
  447. * output to the CSV file.
  448. *
  449. * @param row An instance of either the HSSFRow or XSSFRow classes that
  450. * encapsulates information about a row of cells recovered from
  451. * an Excel workbook.
  452. */
  453. private void rowToCSV(Row row) {
  454. Cell cell;
  455. int lastCellNum;
  456. ArrayList<String> csvLine = new ArrayList<>();
  457. // Check to ensure that a row was recovered from the sheet as it is
  458. // possible that one or more rows between other populated rows could be
  459. // missing - blank. If the row does contain cells then...
  460. if(row != null) {
  461. // Get the index for the right most cell on the row and then
  462. // step along the row from left to right recovering the contents
  463. // of each cell, converting that into a formatted String and
  464. // then storing the String into the csvLine ArrayList.
  465. lastCellNum = row.getLastCellNum();
  466. for(int i = 0; i <= lastCellNum; i++) {
  467. cell = row.getCell(i);
  468. if(cell == null) {
  469. csvLine.add("");
  470. }
  471. else {
  472. if(cell.getCellType() != CellType.FORMULA) {
  473. csvLine.add(this.formatter.formatCellValue(cell));
  474. }
  475. else {
  476. csvLine.add(this.formatter.formatCellValue(cell, this.evaluator));
  477. }
  478. }
  479. }
  480. // Make a note of the index number of the right most cell. This value
  481. // will later be used to ensure that the matrix of data in the CSV file
  482. // is square.
  483. if(lastCellNum > this.maxRowWidth) {
  484. this.maxRowWidth = lastCellNum;
  485. }
  486. }
  487. this.csvData.add(csvLine);
  488. }
  489. /**
  490. * Checks to see whether the field - which consists of the formatted
  491. * contents of an Excel worksheet cell encapsulated within a String - contains
  492. * any embedded characters that must be escaped. The method is able to
  493. * comply with either Excel's or UNIX formatting conventions in the
  494. * following manner;
  495. *
  496. * With regard to UNIX conventions, if the field contains any embedded
  497. * field separator or EOL characters they will each be escaped by prefixing
  498. * a leading backslash character. These are the only changes that have yet
  499. * emerged following some research as being required.
  500. *
  501. * Excel has other embedded character escaping requirements, some that emerged
  502. * from empirical testing, other through research. Firstly, with regards to
  503. * any embedded speech marks ("), each occurrence should be escaped with
  504. * another speech mark and the whole field then surrounded with speech marks.
  505. * Thus if a field holds <em>"Hello" he said</em> then it should be modified
  506. * to appear as <em>"""Hello"" he said"</em>. Furthermore, if the field
  507. * contains either embedded separator or EOL characters, it should also
  508. * be surrounded with speech marks. As a result <em>1,400</em> would become
  509. * <em>"1,400"</em> assuming that the comma is the required field separator.
  510. * This has one consequence in, if a field contains embedded speech marks
  511. * and embedded separator characters, checks for both are not required as the
  512. * additional set of speech marks that should be placed around any field
  513. * containing embedded speech marks will also account for the embedded
  514. * separator.
  515. *
  516. * It is worth making one further note with regard to embedded EOL
  517. * characters. If the data in a worksheet is exported as a CSV file using
  518. * Excel itself, then the field will be surrounded with speech marks. If the
  519. * resulting CSV file is then re-imported into another worksheet, the EOL
  520. * character will result in the original single field occupying more than
  521. * one cell. This same 'feature' is replicated in this classes behaviour.
  522. *
  523. * @param field An instance of the String class encapsulating the formatted
  524. * contents of a cell on an Excel worksheet.
  525. * @return A String that encapsulates the formatted contents of that
  526. * Excel worksheet cell but with any embedded separator, EOL or
  527. * speech mark characters correctly escaped.
  528. */
  529. private String escapeEmbeddedCharacters(String field) {
  530. StringBuilder buffer;
  531. // If the fields contents should be formatted to confrom with Excel's
  532. // convention....
  533. if(this.formattingConvention == ToCSV.EXCEL_STYLE_ESCAPING) {
  534. // Firstly, check if there are any speech marks (") in the field;
  535. // each occurrence must be escaped with another set of spech marks
  536. // and then the entire field should be enclosed within another
  537. // set of speech marks. Thus, "Yes" he said would become
  538. // """Yes"" he said"
  539. if(field.contains("\"")) {
  540. buffer = new StringBuilder(field.replaceAll("\"", "\\\"\\\""));
  541. buffer.insert(0, "\"");
  542. buffer.append("\"");
  543. }
  544. else {
  545. // If the field contains either embedded separator or EOL
  546. // characters, then escape the whole field by surrounding it
  547. // with speech marks.
  548. buffer = new StringBuilder(field);
  549. if((buffer.indexOf(this.separator)) > -1 ||
  550. (buffer.indexOf("\n")) > -1) {
  551. buffer.insert(0, "\"");
  552. buffer.append("\"");
  553. }
  554. }
  555. return(buffer.toString().trim());
  556. }
  557. // The only other formatting convention this class obeys is the UNIX one
  558. // where any occurrence of the field separator or EOL character will
  559. // be escaped by preceding it with a backslash.
  560. else {
  561. if(field.contains(this.separator)) {
  562. field = field.replaceAll(this.separator, ("\\\\" + this.separator));
  563. }
  564. if(field.contains("\n")) {
  565. field = field.replaceAll("\n", "\\\\\n");
  566. }
  567. return(field);
  568. }
  569. }
  570. /**
  571. * The main() method contains code that demonstrates how to use the class.
  572. *
  573. * @param args An array containing zero, one or more elements all of type
  574. * String. Each element will encapsulate an argument specified by the
  575. * user when running the program from the command prompt.
  576. */
  577. public static void main(String[] args) {
  578. // Check the number of arguments passed to the main method. There
  579. // must be two, three or four; the name of and path to either the folder
  580. // containing the Excel files or an individual Excel workbook that is/are
  581. // to be converted, the name of and path to the folder to which the CSV
  582. // files should be written, - optionally - the separator character
  583. // that should be used to separate individual items (fields) on the
  584. // lines (records) of the CSV file and - again optionally - an integer
  585. // that idicates whether the CSV file ought to obey Excel's or UNIX
  586. // convnetions with regard to formatting fields that contain embedded
  587. // separator, Speech mark or EOL character(s).
  588. //
  589. // Note that the names of the CSV files will be derived from those
  590. // of the Excel file(s). Put simply the .xls or .xlsx extension will be
  591. // replaced with .csv. Therefore, if the source folder contains files
  592. // with matching names but different extensions - Test.xls and Test.xlsx
  593. // for example - then the CSV file generated from one will overwrite
  594. // that generated from the other.
  595. ToCSV converter;
  596. boolean converted = true;
  597. long startTime = System.currentTimeMillis();
  598. try {
  599. converter = new ToCSV();
  600. if(args.length == 2) {
  601. // Just the Source File/Folder and Destination Folder were
  602. // passed to the main method.
  603. converter.convertExcelToCSV(args[0], args[1]);
  604. }
  605. else if(args.length == 3) {
  606. // The Source File/Folder, Destination Folder and Separator
  607. // were passed to the main method.
  608. converter.convertExcelToCSV(args[0], args[1], args[2]);
  609. }
  610. else if(args.length == 4) {
  611. // The Source File/Folder, Destination Folder, Separator and
  612. // Formatting Convention were passed to the main method.
  613. converter.convertExcelToCSV(args[0], args[1],
  614. args[2], Integer.parseInt(args[3]));
  615. }
  616. else {
  617. // None or more than four parameters were passed so display
  618. //a Usage message.
  619. System.out.println("Usage: java ToCSV [Source File/Folder] " +
  620. "[Destination Folder] [Separator] [Formatting Convention]\n" +
  621. "\tSource File/Folder\tThis argument should contain the name of and\n" +
  622. "\t\t\t\tpath to either a single Excel workbook or a\n" +
  623. "\t\t\t\tfolder containing one or more Excel workbooks.\n" +
  624. "\tDestination Folder\tThe name of and path to the folder that the\n" +
  625. "\t\t\t\tCSV files should be written out into. The\n" +
  626. "\t\t\t\tfolder must exist before running the ToCSV\n" +
  627. "\t\t\t\tcode as it will not check for or create it.\n" +
  628. "\tSeparator\t\tOptional. The character or characters that\n" +
  629. "\t\t\t\tshould be used to separate fields in the CSV\n" +
  630. "\t\t\t\trecord. If no value is passed then the comma\n" +
  631. "\t\t\t\twill be assumed.\n" +
  632. "\tFormatting Convention\tOptional. This argument can take one of two\n" +
  633. "\t\t\t\tvalues. Passing 0 (zero) will result in a CSV\n" +
  634. "\t\t\t\tfile that obeys Excel's formatting conventions\n" +
  635. "\t\t\t\twhilst passing 1 (one) will result in a file\n" +
  636. "\t\t\t\tthat obeys UNIX formatting conventions. If no\n" +
  637. "\t\t\t\tvalue is passed, then the CSV file produced\n" +
  638. "\t\t\t\twill obey Excel's formatting conventions.");
  639. converted = false;
  640. }
  641. }
  642. // It is not wise to have such a wide catch clause - Exception is very
  643. // close to being at the top of the inheritance hierarchy - though it
  644. // will suffice for this example as it is really not possible to recover
  645. // easilly from an exceptional set of circumstances at this point in the
  646. // program. It should however, ideally be replaced with one or more
  647. // catch clauses optimised to handle more specific problems.
  648. catch(Exception ex) {
  649. System.out.println("Caught an: " + ex.getClass().getName());
  650. System.out.println("Message: " + ex.getMessage());
  651. System.out.println("Stacktrace follows:.....");
  652. ex.printStackTrace(System.out);
  653. converted = false;
  654. }
  655. if (converted) {
  656. System.out.println("Conversion took " +
  657. ((System.currentTimeMillis() - startTime)/1000) + " seconds");
  658. }
  659. }
  660. /**
  661. * An instance of this class can be used to control the files returned
  662. * be a call to the listFiles() method when made on an instance of the
  663. * File class and that object refers to a folder/directory
  664. */
  665. static class ExcelFilenameFilter implements FilenameFilter {
  666. /**
  667. * Determine those files that will be returned by a call to the
  668. * listFiles() method. In this case, the name of the file must end with
  669. * either of the following two extension; '.xls' or '.xlsx'. For the
  670. * future, it is very possible to parameterise this and allow the
  671. * containing class to pass, for example, an array of Strings to this
  672. * class on instantiation. Each element in that array could encapsulate
  673. * a valid file extension - '.xls', '.xlsx', '.xlt', '.xlst', etc. These
  674. * could then be used to control which files were returned by the call
  675. * to the listFiles() method.
  676. *
  677. * @param file An instance of the File class that encapsulates a handle
  678. * referring to the folder/directory that contains the file.
  679. * @param name An instance of the String class that encapsulates the
  680. * name of the file.
  681. * @return A boolean value that indicates whether the file should be
  682. * included in the array retirned by the call to the listFiles()
  683. * method. In this case true will be returned if the name of the
  684. * file ends with either '.xls' or '.xlsx' and false will be
  685. * returned in all other instances.
  686. */
  687. @Override
  688. public boolean accept(File file, String name) {
  689. return(name.endsWith(".xls") || name.endsWith(".xlsx"));
  690. }
  691. }
  692. }