You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

ExcelExtractor.java 13KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436
  1. /* ====================================================================
  2. Licensed to the Apache Software Foundation (ASF) under one or more
  3. contributor license agreements. See the NOTICE file distributed with
  4. this work for additional information regarding copyright ownership.
  5. The ASF licenses this file to You under the Apache License, Version 2.0
  6. (the "License"); you may not use this file except in compliance with
  7. the License. You may obtain a copy of the License at
  8. http://www.apache.org/licenses/LICENSE-2.0
  9. Unless required by applicable law or agreed to in writing, software
  10. distributed under the License is distributed on an "AS IS" BASIS,
  11. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. See the License for the specific language governing permissions and
  13. limitations under the License.
  14. ==================================================================== */
  15. package org.apache.poi.hssf.extractor;
  16. import java.io.File;
  17. import java.io.FileInputStream;
  18. import java.io.IOException;
  19. import java.io.InputStream;
  20. import java.io.PrintStream;
  21. import java.util.Locale;
  22. import org.apache.poi.extractor.POIOLE2TextExtractor;
  23. import org.apache.poi.hssf.usermodel.HSSFCell;
  24. import org.apache.poi.hssf.usermodel.HSSFCellStyle;
  25. import org.apache.poi.hssf.usermodel.HSSFComment;
  26. import org.apache.poi.hssf.usermodel.HSSFDataFormatter;
  27. import org.apache.poi.hssf.usermodel.HSSFRichTextString;
  28. import org.apache.poi.hssf.usermodel.HSSFRow;
  29. import org.apache.poi.hssf.usermodel.HSSFSheet;
  30. import org.apache.poi.hssf.usermodel.HSSFWorkbook;
  31. import org.apache.poi.poifs.filesystem.DirectoryNode;
  32. import org.apache.poi.poifs.filesystem.POIFSFileSystem;
  33. import org.apache.poi.ss.formula.eval.ErrorEval;
  34. import org.apache.poi.ss.usermodel.HeaderFooter;
  35. import org.apache.poi.ss.usermodel.Row.MissingCellPolicy;
  36. /**
  37. * A text extractor for Excel files.
  38. * <p>
  39. * Returns the textual content of the file, suitable for
  40. * indexing by something like Lucene, but not really
  41. * intended for display to the user.
  42. * </p>
  43. * <p>
  44. * To turn an Excel file into CSV or a similar format, see
  45. * the XLS2CSVmra example.
  46. * </p>
  47. *
  48. * @see <a href="http://svn.apache.org/repos/asf/poi/trunk/poi-examples/src/main/java/org/apache/poi/hssf/eventusermodel/examples/XLS2CSVmra.java">XLS2CSVmra</a>
  49. */
  50. public class ExcelExtractor implements POIOLE2TextExtractor, org.apache.poi.ss.extractor.ExcelExtractor {
  51. private final HSSFWorkbook _wb;
  52. private final HSSFDataFormatter _formatter;
  53. private boolean doCloseFilesystem = true;
  54. private boolean _includeSheetNames = true;
  55. private boolean _shouldEvaluateFormulas = true;
  56. private boolean _includeCellComments;
  57. private boolean _includeBlankCells;
  58. private boolean _includeHeadersFooters = true;
  59. public ExcelExtractor(HSSFWorkbook wb) {
  60. _wb = wb;
  61. _formatter = new HSSFDataFormatter();
  62. }
  63. public ExcelExtractor(POIFSFileSystem fs) throws IOException {
  64. this(fs.getRoot());
  65. }
  66. public ExcelExtractor(DirectoryNode dir) throws IOException {
  67. this(new HSSFWorkbook(dir, true));
  68. }
  69. private static final class CommandParseException extends Exception {
  70. public CommandParseException(String msg) {
  71. super(msg);
  72. }
  73. }
  74. private static final class CommandArgs {
  75. private final boolean _requestHelp;
  76. private final File _inputFile;
  77. private final boolean _showSheetNames;
  78. private final boolean _evaluateFormulas;
  79. private final boolean _showCellComments;
  80. private final boolean _showBlankCells;
  81. private final boolean _headersFooters;
  82. public CommandArgs(String[] args) throws CommandParseException {
  83. int nArgs = args.length;
  84. File inputFile = null;
  85. boolean requestHelp = false;
  86. boolean showSheetNames = true;
  87. boolean evaluateFormulas = true;
  88. boolean showCellComments = false;
  89. boolean showBlankCells = false;
  90. boolean headersFooters = true;
  91. for (int i=0; i<nArgs; i++) {
  92. String arg = args[i];
  93. if ("-help".equalsIgnoreCase(arg)) {
  94. requestHelp = true;
  95. break;
  96. }
  97. if ("-i".equals(arg)) {
  98. // step to next arg
  99. if (++i >= nArgs) {
  100. throw new CommandParseException("Expected filename after '-i'");
  101. }
  102. arg = args[i];
  103. if (inputFile != null) {
  104. throw new CommandParseException("Only one input file can be supplied");
  105. }
  106. inputFile = new File(arg);
  107. if (!inputFile.exists()) {
  108. throw new CommandParseException("Specified input file '" + arg + "' does not exist");
  109. }
  110. if (inputFile.isDirectory()) {
  111. throw new CommandParseException("Specified input file '" + arg + "' is a directory");
  112. }
  113. continue;
  114. }
  115. if ("--show-sheet-names".equals(arg)) {
  116. showSheetNames = parseBoolArg(args, ++i);
  117. continue;
  118. }
  119. if ("--evaluate-formulas".equals(arg)) {
  120. evaluateFormulas = parseBoolArg(args, ++i);
  121. continue;
  122. }
  123. if ("--show-comments".equals(arg)) {
  124. showCellComments = parseBoolArg(args, ++i);
  125. continue;
  126. }
  127. if ("--show-blanks".equals(arg)) {
  128. showBlankCells = parseBoolArg(args, ++i);
  129. continue;
  130. }
  131. if ("--headers-footers".equals(arg)) {
  132. headersFooters = parseBoolArg(args, ++i);
  133. continue;
  134. }
  135. throw new CommandParseException("Invalid argument '" + arg + "'");
  136. }
  137. _requestHelp = requestHelp;
  138. _inputFile = inputFile;
  139. _showSheetNames = showSheetNames;
  140. _evaluateFormulas = evaluateFormulas;
  141. _showCellComments = showCellComments;
  142. _showBlankCells = showBlankCells;
  143. _headersFooters = headersFooters;
  144. }
  145. private static boolean parseBoolArg(String[] args, int i) throws CommandParseException {
  146. if (i >= args.length) {
  147. throw new CommandParseException("Expected value after '" + args[i-1] + "'");
  148. }
  149. String value = args[i].toUpperCase(Locale.ROOT);
  150. if ("Y".equals(value) || "YES".equals(value) || "ON".equals(value) || "TRUE".equals(value)) {
  151. return true;
  152. }
  153. if ("N".equals(value) || "NO".equals(value) || "OFF".equals(value) || "FALSE".equals(value)) {
  154. return false;
  155. }
  156. throw new CommandParseException("Invalid value '" + args[i] + "' for '" + args[i-1] + "'. Expected 'Y' or 'N'");
  157. }
  158. public boolean isRequestHelp() {
  159. return _requestHelp;
  160. }
  161. public File getInputFile() {
  162. return _inputFile;
  163. }
  164. public boolean shouldShowSheetNames() {
  165. return _showSheetNames;
  166. }
  167. public boolean shouldEvaluateFormulas() {
  168. return _evaluateFormulas;
  169. }
  170. public boolean shouldShowCellComments() {
  171. return _showCellComments;
  172. }
  173. public boolean shouldShowBlankCells() {
  174. return _showBlankCells;
  175. }
  176. public boolean shouldIncludeHeadersFooters() {
  177. return _headersFooters;
  178. }
  179. }
  180. private static void printUsageMessage(PrintStream ps) {
  181. ps.println("Use:");
  182. ps.println(" " + ExcelExtractor.class.getName() + " [<flag> <value> [<flag> <value> [...]]] [-i <filename.xls>]");
  183. ps.println(" -i <filename.xls> specifies input file (default is to use stdin)");
  184. ps.println(" Flags can be set on or off by using the values 'Y' or 'N'.");
  185. ps.println(" Following are available flags and their default values:");
  186. ps.println(" --show-sheet-names Y");
  187. ps.println(" --evaluate-formulas Y");
  188. ps.println(" --show-comments N");
  189. ps.println(" --show-blanks Y");
  190. ps.println(" --headers-footers Y");
  191. }
  192. /**
  193. * Command line extractor.
  194. *
  195. * @param args the command line parameters
  196. *
  197. * @throws IOException if the file can't be read or contains errors
  198. */
  199. public static void main(String[] args) throws IOException {
  200. CommandArgs cmdArgs;
  201. try {
  202. cmdArgs = new CommandArgs(args);
  203. } catch (CommandParseException e) {
  204. System.err.println(e.getMessage());
  205. printUsageMessage(System.err);
  206. System.exit(1);
  207. return; // suppress compiler error
  208. }
  209. if (cmdArgs.isRequestHelp()) {
  210. printUsageMessage(System.out);
  211. return;
  212. }
  213. try (InputStream is = cmdArgs.getInputFile() == null ? System.in : new FileInputStream(cmdArgs.getInputFile());
  214. HSSFWorkbook wb = new HSSFWorkbook(is);
  215. ExcelExtractor extractor = new ExcelExtractor(wb)
  216. ) {
  217. extractor.setIncludeSheetNames(cmdArgs.shouldShowSheetNames());
  218. extractor.setFormulasNotResults(!cmdArgs.shouldEvaluateFormulas());
  219. extractor.setIncludeCellComments(cmdArgs.shouldShowCellComments());
  220. extractor.setIncludeBlankCells(cmdArgs.shouldShowBlankCells());
  221. extractor.setIncludeHeadersFooters(cmdArgs.shouldIncludeHeadersFooters());
  222. System.out.println(extractor.getText());
  223. }
  224. }
  225. @Override
  226. public void setIncludeSheetNames(boolean includeSheetNames) {
  227. _includeSheetNames = includeSheetNames;
  228. }
  229. @Override
  230. public void setFormulasNotResults(boolean formulasNotResults) {
  231. _shouldEvaluateFormulas = !formulasNotResults;
  232. }
  233. @Override
  234. public void setIncludeCellComments(boolean includeCellComments) {
  235. _includeCellComments = includeCellComments;
  236. }
  237. /**
  238. * Should blank cells be output? Default is to only
  239. * output cells that are present in the file and are
  240. * non-blank.
  241. *
  242. * @param includeBlankCells {@code true} if blank cells should be included
  243. */
  244. public void setIncludeBlankCells(boolean includeBlankCells) {
  245. _includeBlankCells = includeBlankCells;
  246. }
  247. @Override
  248. public void setIncludeHeadersFooters(boolean includeHeadersFooters) {
  249. _includeHeadersFooters = includeHeadersFooters;
  250. }
  251. @Override
  252. public String getText() {
  253. StringBuilder text = new StringBuilder();
  254. // We don't care about the difference between
  255. // null (missing) and blank cells
  256. _wb.setMissingCellPolicy(MissingCellPolicy.RETURN_BLANK_AS_NULL);
  257. // Process each sheet in turn
  258. for(int i=0;i<_wb.getNumberOfSheets();i++) {
  259. HSSFSheet sheet = _wb.getSheetAt(i);
  260. if(sheet == null) { continue; }
  261. if(_includeSheetNames) {
  262. String name = _wb.getSheetName(i);
  263. if(name != null) {
  264. text.append(name);
  265. text.append("\n");
  266. }
  267. }
  268. // Header text, if there is any
  269. if(_includeHeadersFooters) {
  270. text.append(_extractHeaderFooter(sheet.getHeader()));
  271. }
  272. int firstRow = sheet.getFirstRowNum();
  273. int lastRow = sheet.getLastRowNum();
  274. for(int j=firstRow;j<=lastRow;j++) {
  275. HSSFRow row = sheet.getRow(j);
  276. if(row == null) { continue; }
  277. // Check each cell in turn
  278. int firstCell = row.getFirstCellNum();
  279. int lastCell = row.getLastCellNum();
  280. if(_includeBlankCells) {
  281. firstCell = 0;
  282. }
  283. for(int k=firstCell;k<lastCell;k++) {
  284. HSSFCell cell = row.getCell(k);
  285. boolean outputContents = true;
  286. if(cell == null) {
  287. // Only output if requested
  288. outputContents = _includeBlankCells;
  289. } else {
  290. switch(cell.getCellType()) {
  291. case STRING:
  292. text.append(cell.getRichStringCellValue().getString());
  293. break;
  294. case NUMERIC:
  295. text.append(_formatter.formatCellValue(cell));
  296. break;
  297. case BOOLEAN:
  298. text.append(cell.getBooleanCellValue());
  299. break;
  300. case ERROR:
  301. text.append(ErrorEval.getText(cell.getErrorCellValue()));
  302. break;
  303. case FORMULA:
  304. if(!_shouldEvaluateFormulas) {
  305. text.append(cell.getCellFormula());
  306. } else {
  307. switch(cell.getCachedFormulaResultType()) {
  308. case STRING:
  309. HSSFRichTextString str = cell.getRichStringCellValue();
  310. if(str != null && str.length() > 0) {
  311. text.append(str);
  312. }
  313. break;
  314. case NUMERIC:
  315. HSSFCellStyle style = cell.getCellStyle();
  316. double nVal = cell.getNumericCellValue();
  317. short df = style.getDataFormat();
  318. String dfs = style.getDataFormatString();
  319. text.append(_formatter.formatRawCellContents(nVal, df, dfs));
  320. break;
  321. case BOOLEAN:
  322. text.append(cell.getBooleanCellValue());
  323. break;
  324. case ERROR:
  325. text.append(ErrorEval.getText(cell.getErrorCellValue()));
  326. break;
  327. default:
  328. throw new IllegalStateException("Unexpected cell cached formula result type: " + cell.getCachedFormulaResultType());
  329. }
  330. }
  331. break;
  332. default:
  333. throw new RuntimeException("Unexpected cell type (" + cell.getCellType() + ")");
  334. }
  335. // Output the comment, if requested and exists
  336. HSSFComment comment = cell.getCellComment();
  337. if(_includeCellComments && comment != null) {
  338. // Replace any newlines with spaces, otherwise it
  339. // breaks the output
  340. String commentText = comment.getString().getString().replace('\n', ' ');
  341. text.append(" Comment by ").append(comment.getAuthor()).append(": ").append(commentText);
  342. }
  343. }
  344. // Output a tab if we're not on the last cell
  345. if(outputContents && k < (lastCell-1)) {
  346. text.append("\t");
  347. }
  348. }
  349. // Finish off the row
  350. text.append("\n");
  351. }
  352. // Finally Footer text, if there is any
  353. if(_includeHeadersFooters) {
  354. text.append(_extractHeaderFooter(sheet.getFooter()));
  355. }
  356. }
  357. return text.toString();
  358. }
  359. public static String _extractHeaderFooter(HeaderFooter hf) {
  360. StringBuilder text = new StringBuilder();
  361. if(hf.getLeft() != null) {
  362. text.append(hf.getLeft());
  363. }
  364. if(hf.getCenter() != null) {
  365. if(text.length() > 0)
  366. text.append("\t");
  367. text.append(hf.getCenter());
  368. }
  369. if(hf.getRight() != null) {
  370. if(text.length() > 0)
  371. text.append("\t");
  372. text.append(hf.getRight());
  373. }
  374. if(text.length() > 0)
  375. text.append("\n");
  376. return text.toString();
  377. }
  378. @Override
  379. public HSSFWorkbook getDocument() {
  380. return _wb;
  381. }
  382. @Override
  383. public void setCloseFilesystem(boolean doCloseFilesystem) {
  384. this.doCloseFilesystem = doCloseFilesystem;
  385. }
  386. @Override
  387. public boolean isCloseFilesystem() {
  388. return doCloseFilesystem;
  389. }
  390. @Override
  391. public HSSFWorkbook getFilesystem() {
  392. return _wb;
  393. }
  394. }