123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436 |
- /* ====================================================================
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
- ==================================================================== */
-
- package org.apache.poi.hssf.extractor;
-
- import java.io.File;
- import java.io.FileInputStream;
- import java.io.IOException;
- import java.io.InputStream;
- import java.io.PrintStream;
- import java.util.Locale;
-
- import org.apache.poi.extractor.POIOLE2TextExtractor;
- import org.apache.poi.hssf.usermodel.HSSFCell;
- import org.apache.poi.hssf.usermodel.HSSFCellStyle;
- import org.apache.poi.hssf.usermodel.HSSFComment;
- import org.apache.poi.hssf.usermodel.HSSFDataFormatter;
- import org.apache.poi.hssf.usermodel.HSSFRichTextString;
- import org.apache.poi.hssf.usermodel.HSSFRow;
- import org.apache.poi.hssf.usermodel.HSSFSheet;
- import org.apache.poi.hssf.usermodel.HSSFWorkbook;
- import org.apache.poi.poifs.filesystem.DirectoryNode;
- import org.apache.poi.poifs.filesystem.POIFSFileSystem;
- import org.apache.poi.ss.formula.eval.ErrorEval;
- import org.apache.poi.ss.usermodel.HeaderFooter;
- import org.apache.poi.ss.usermodel.Row.MissingCellPolicy;
-
- /**
- * A text extractor for Excel files.
- * <p>
- * Returns the textual content of the file, suitable for
- * indexing by something like Lucene, but not really
- * intended for display to the user.
- * </p>
- * <p>
- * To turn an excel file into a CSV or similar, then see
- * the XLS2CSVmra example
- * </p>
- *
- * @see <a href="http://svn.apache.org/repos/asf/poi/trunk/poi-examples/src/main/java/org/apache/poi/hssf/eventusermodel/examples/XLS2CSVmra.java">XLS2CSVmra</a>
- */
- public class ExcelExtractor implements POIOLE2TextExtractor, org.apache.poi.ss.extractor.ExcelExtractor {
- private final HSSFWorkbook _wb;
- private final HSSFDataFormatter _formatter;
- private boolean doCloseFilesystem = true;
- private boolean _includeSheetNames = true;
- private boolean _shouldEvaluateFormulas = true;
- private boolean _includeCellComments;
- private boolean _includeBlankCells;
- private boolean _includeHeadersFooters = true;
-
- public ExcelExtractor(HSSFWorkbook wb) {
- _wb = wb;
- _formatter = new HSSFDataFormatter();
- }
-
- public ExcelExtractor(POIFSFileSystem fs) throws IOException {
- this(fs.getRoot());
- }
-
- public ExcelExtractor(DirectoryNode dir) throws IOException {
- this(new HSSFWorkbook(dir, true));
- }
-
- private static final class CommandParseException extends Exception {
- public CommandParseException(String msg) {
- super(msg);
- }
- }
- private static final class CommandArgs {
- private final boolean _requestHelp;
- private final File _inputFile;
- private final boolean _showSheetNames;
- private final boolean _evaluateFormulas;
- private final boolean _showCellComments;
- private final boolean _showBlankCells;
- private final boolean _headersFooters;
- public CommandArgs(String[] args) throws CommandParseException {
- int nArgs = args.length;
- File inputFile = null;
- boolean requestHelp = false;
- boolean showSheetNames = true;
- boolean evaluateFormulas = true;
- boolean showCellComments = false;
- boolean showBlankCells = false;
- boolean headersFooters = true;
- for (int i=0; i<nArgs; i++) {
- String arg = args[i];
- if ("-help".equalsIgnoreCase(arg)) {
- requestHelp = true;
- break;
- }
- if ("-i".equals(arg)) {
- // step to next arg
- if (++i >= nArgs) {
- throw new CommandParseException("Expected filename after '-i'");
- }
- arg = args[i];
- if (inputFile != null) {
- throw new CommandParseException("Only one input file can be supplied");
- }
- inputFile = new File(arg);
- if (!inputFile.exists()) {
- throw new CommandParseException("Specified input file '" + arg + "' does not exist");
- }
- if (inputFile.isDirectory()) {
- throw new CommandParseException("Specified input file '" + arg + "' is a directory");
- }
- continue;
- }
- if ("--show-sheet-names".equals(arg)) {
- showSheetNames = parseBoolArg(args, ++i);
- continue;
- }
- if ("--evaluate-formulas".equals(arg)) {
- evaluateFormulas = parseBoolArg(args, ++i);
- continue;
- }
- if ("--show-comments".equals(arg)) {
- showCellComments = parseBoolArg(args, ++i);
- continue;
- }
- if ("--show-blanks".equals(arg)) {
- showBlankCells = parseBoolArg(args, ++i);
- continue;
- }
- if ("--headers-footers".equals(arg)) {
- headersFooters = parseBoolArg(args, ++i);
- continue;
- }
- throw new CommandParseException("Invalid argument '" + arg + "'");
- }
- _requestHelp = requestHelp;
- _inputFile = inputFile;
- _showSheetNames = showSheetNames;
- _evaluateFormulas = evaluateFormulas;
- _showCellComments = showCellComments;
- _showBlankCells = showBlankCells;
- _headersFooters = headersFooters;
- }
- private static boolean parseBoolArg(String[] args, int i) throws CommandParseException {
- if (i >= args.length) {
- throw new CommandParseException("Expected value after '" + args[i-1] + "'");
- }
- String value = args[i].toUpperCase(Locale.ROOT);
- if ("Y".equals(value) || "YES".equals(value) || "ON".equals(value) || "TRUE".equals(value)) {
- return true;
- }
- if ("N".equals(value) || "NO".equals(value) || "OFF".equals(value) || "FALSE".equals(value)) {
- return false;
- }
- throw new CommandParseException("Invalid value '" + args[i] + "' for '" + args[i-1] + "'. Expected 'Y' or 'N'");
- }
- public boolean isRequestHelp() {
- return _requestHelp;
- }
- public File getInputFile() {
- return _inputFile;
- }
- public boolean shouldShowSheetNames() {
- return _showSheetNames;
- }
- public boolean shouldEvaluateFormulas() {
- return _evaluateFormulas;
- }
- public boolean shouldShowCellComments() {
- return _showCellComments;
- }
- public boolean shouldShowBlankCells() {
- return _showBlankCells;
- }
- public boolean shouldIncludeHeadersFooters() {
- return _headersFooters;
- }
- }
-
- private static void printUsageMessage(PrintStream ps) {
- ps.println("Use:");
- ps.println(" " + ExcelExtractor.class.getName() + " [<flag> <value> [<flag> <value> [...]]] [-i <filename.xls>]");
- ps.println(" -i <filename.xls> specifies input file (default is to use stdin)");
- ps.println(" Flags can be set on or off by using the values 'Y' or 'N'.");
- ps.println(" Following are available flags and their default values:");
- ps.println(" --show-sheet-names Y");
- ps.println(" --evaluate-formulas Y");
- ps.println(" --show-comments N");
- ps.println(" --show-blanks Y");
- ps.println(" --headers-footers Y");
- }
-
- /**
- * Command line extractor.
- *
- * @param args the command line parameters
- *
- * @throws IOException if the file can't be read or contains errors
- */
- public static void main(String[] args) throws IOException {
-
- CommandArgs cmdArgs;
- try {
- cmdArgs = new CommandArgs(args);
- } catch (CommandParseException e) {
- System.err.println(e.getMessage());
- printUsageMessage(System.err);
- System.exit(1);
- return; // suppress compiler error
- }
-
- if (cmdArgs.isRequestHelp()) {
- printUsageMessage(System.out);
- return;
- }
-
- try (InputStream is = cmdArgs.getInputFile() == null ? System.in : new FileInputStream(cmdArgs.getInputFile());
- HSSFWorkbook wb = new HSSFWorkbook(is);
- ExcelExtractor extractor = new ExcelExtractor(wb)
- ) {
- extractor.setIncludeSheetNames(cmdArgs.shouldShowSheetNames());
- extractor.setFormulasNotResults(!cmdArgs.shouldEvaluateFormulas());
- extractor.setIncludeCellComments(cmdArgs.shouldShowCellComments());
- extractor.setIncludeBlankCells(cmdArgs.shouldShowBlankCells());
- extractor.setIncludeHeadersFooters(cmdArgs.shouldIncludeHeadersFooters());
- System.out.println(extractor.getText());
- }
- }
-
- @Override
- public void setIncludeSheetNames(boolean includeSheetNames) {
- _includeSheetNames = includeSheetNames;
- }
-
- @Override
- public void setFormulasNotResults(boolean formulasNotResults) {
- _shouldEvaluateFormulas = !formulasNotResults;
- }
-
- @Override
- public void setIncludeCellComments(boolean includeCellComments) {
- _includeCellComments = includeCellComments;
- }
-
- /**
- * Should blank cells be output? Default is to only
- * output cells that are present in the file and are
- * non-blank.
- *
- * @param includeBlankCells {@code true} if blank cells should be included
- */
- public void setIncludeBlankCells(boolean includeBlankCells) {
- _includeBlankCells = includeBlankCells;
- }
-
- @Override
- public void setIncludeHeadersFooters(boolean includeHeadersFooters) {
- _includeHeadersFooters = includeHeadersFooters;
- }
-
- @Override
- public String getText() {
- StringBuilder text = new StringBuilder();
-
- // We don't care about the difference between
- // null (missing) and blank cells
- _wb.setMissingCellPolicy(MissingCellPolicy.RETURN_BLANK_AS_NULL);
-
- // Process each sheet in turn
- for(int i=0;i<_wb.getNumberOfSheets();i++) {
- HSSFSheet sheet = _wb.getSheetAt(i);
- if(sheet == null) { continue; }
-
- if(_includeSheetNames) {
- String name = _wb.getSheetName(i);
- if(name != null) {
- text.append(name);
- text.append("\n");
- }
- }
-
- // Header text, if there is any
- if(_includeHeadersFooters) {
- text.append(_extractHeaderFooter(sheet.getHeader()));
- }
-
- int firstRow = sheet.getFirstRowNum();
- int lastRow = sheet.getLastRowNum();
- for(int j=firstRow;j<=lastRow;j++) {
- HSSFRow row = sheet.getRow(j);
- if(row == null) { continue; }
-
- // Check each cell in turn
- int firstCell = row.getFirstCellNum();
- int lastCell = row.getLastCellNum();
- if(_includeBlankCells) {
- firstCell = 0;
- }
-
- for(int k=firstCell;k<lastCell;k++) {
- HSSFCell cell = row.getCell(k);
- boolean outputContents = true;
-
- if(cell == null) {
- // Only output if requested
- outputContents = _includeBlankCells;
- } else {
- switch(cell.getCellType()) {
- case STRING:
- text.append(cell.getRichStringCellValue().getString());
- break;
- case NUMERIC:
- text.append(_formatter.formatCellValue(cell));
- break;
- case BOOLEAN:
- text.append(cell.getBooleanCellValue());
- break;
- case ERROR:
- text.append(ErrorEval.getText(cell.getErrorCellValue()));
- break;
- case FORMULA:
- if(!_shouldEvaluateFormulas) {
- text.append(cell.getCellFormula());
- } else {
- switch(cell.getCachedFormulaResultType()) {
- case STRING:
- HSSFRichTextString str = cell.getRichStringCellValue();
- if(str != null && str.length() > 0) {
- text.append(str);
- }
- break;
- case NUMERIC:
- HSSFCellStyle style = cell.getCellStyle();
- double nVal = cell.getNumericCellValue();
- short df = style.getDataFormat();
- String dfs = style.getDataFormatString();
- text.append(_formatter.formatRawCellContents(nVal, df, dfs));
- break;
- case BOOLEAN:
- text.append(cell.getBooleanCellValue());
- break;
- case ERROR:
- text.append(ErrorEval.getText(cell.getErrorCellValue()));
- break;
- default:
- throw new IllegalStateException("Unexpected cell cached formula result type: " + cell.getCachedFormulaResultType());
-
- }
- }
- break;
- default:
- throw new RuntimeException("Unexpected cell type (" + cell.getCellType() + ")");
- }
-
- // Output the comment, if requested and exists
- HSSFComment comment = cell.getCellComment();
- if(_includeCellComments && comment != null) {
- // Replace any newlines with spaces, otherwise it
- // breaks the output
- String commentText = comment.getString().getString().replace('\n', ' ');
- text.append(" Comment by ").append(comment.getAuthor()).append(": ").append(commentText);
- }
- }
-
- // Output a tab if we're not on the last cell
- if(outputContents && k < (lastCell-1)) {
- text.append("\t");
- }
- }
-
- // Finish off the row
- text.append("\n");
- }
-
- // Finally Footer text, if there is any
- if(_includeHeadersFooters) {
- text.append(_extractHeaderFooter(sheet.getFooter()));
- }
- }
-
- return text.toString();
- }
-
- public static String _extractHeaderFooter(HeaderFooter hf) {
- StringBuilder text = new StringBuilder();
-
- if(hf.getLeft() != null) {
- text.append(hf.getLeft());
- }
- if(hf.getCenter() != null) {
- if(text.length() > 0)
- text.append("\t");
- text.append(hf.getCenter());
- }
- if(hf.getRight() != null) {
- if(text.length() > 0)
- text.append("\t");
- text.append(hf.getRight());
- }
- if(text.length() > 0)
- text.append("\n");
-
- return text.toString();
- }
-
- @Override
- public HSSFWorkbook getDocument() {
- return _wb;
- }
-
- @Override
- public void setCloseFilesystem(boolean doCloseFilesystem) {
- this.doCloseFilesystem = doCloseFilesystem;
- }
-
- @Override
- public boolean isCloseFilesystem() {
- return doCloseFilesystem;
- }
-
- @Override
- public HSSFWorkbook getFilesystem() {
- return _wb;
- }
- }
|