You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

XSSFExcelExtractor.java 10.0KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272
  1. /* ====================================================================
  2. Licensed to the Apache Software Foundation (ASF) under one or more
  3. contributor license agreements. See the NOTICE file distributed with
  4. this work for additional information regarding copyright ownership.
  5. The ASF licenses this file to You under the Apache License, Version 2.0
  6. (the "License"); you may not use this file except in compliance with
  7. the License. You may obtain a copy of the License at
  8. http://www.apache.org/licenses/LICENSE-2.0
  9. Unless required by applicable law or agreed to in writing, software
  10. distributed under the License is distributed on an "AS IS" BASIS,
  11. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. See the License for the specific language governing permissions and
  13. limitations under the License.
  14. ==================================================================== */
  15. package org.apache.poi.xssf.extractor;
  16. import java.io.IOException;
  17. import java.util.Iterator;
  18. import java.util.Locale;
  19. import org.apache.poi.hssf.extractor.ExcelExtractor;
  20. import org.apache.poi.ooxml.extractor.POIXMLTextExtractor;
  21. import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
  22. import org.apache.poi.openxml4j.opc.OPCPackage;
  23. import org.apache.poi.ss.usermodel.Cell;
  24. import org.apache.poi.ss.usermodel.CellStyle;
  25. import org.apache.poi.ss.usermodel.CellType;
  26. import org.apache.poi.ss.usermodel.Comment;
  27. import org.apache.poi.ss.usermodel.DataFormatter;
  28. import org.apache.poi.ss.usermodel.HeaderFooter;
  29. import org.apache.poi.ss.usermodel.Row;
  30. import org.apache.poi.ss.usermodel.Sheet;
  31. import org.apache.poi.xssf.usermodel.XSSFCell;
  32. import org.apache.poi.xssf.usermodel.XSSFDrawing;
  33. import org.apache.poi.xssf.usermodel.XSSFRelation;
  34. import org.apache.poi.xssf.usermodel.XSSFShape;
  35. import org.apache.poi.xssf.usermodel.XSSFSheet;
  36. import org.apache.poi.xssf.usermodel.XSSFSimpleShape;
  37. import org.apache.poi.xssf.usermodel.XSSFWorkbook;
  38. import org.apache.xmlbeans.XmlException;
  39. /**
  40. * Helper class to extract text from an OOXML Excel file
  41. */
  42. public class XSSFExcelExtractor
  43. implements POIXMLTextExtractor, org.apache.poi.ss.extractor.ExcelExtractor {
  44. public static final XSSFRelation[] SUPPORTED_TYPES = new XSSFRelation[] {
  45. XSSFRelation.WORKBOOK, XSSFRelation.MACRO_TEMPLATE_WORKBOOK,
  46. XSSFRelation.MACRO_ADDIN_WORKBOOK, XSSFRelation.TEMPLATE_WORKBOOK,
  47. XSSFRelation.MACROS_WORKBOOK
  48. };
  49. private Locale locale;
  50. private final XSSFWorkbook workbook;
  51. private boolean includeSheetNames = true;
  52. private boolean formulasNotResults;
  53. private boolean includeCellComments;
  54. private boolean includeHeadersFooters = true;
  55. private boolean includeTextBoxes = true;
  56. private boolean doCloseFilesystem = true;
  57. public XSSFExcelExtractor(OPCPackage container) throws XmlException, OpenXML4JException, IOException {
  58. this(new XSSFWorkbook(container));
  59. }
  60. public XSSFExcelExtractor(XSSFWorkbook workbook) {
  61. this.workbook = workbook;
  62. }
  63. /**
  64. * Should sheet names be included? Default is true
  65. */
  66. public void setIncludeSheetNames(boolean includeSheetNames) {
  67. this.includeSheetNames = includeSheetNames;
  68. }
  69. /**
  70. * Should we return the formula itself, and not
  71. * the result it produces? Default is false
  72. */
  73. public void setFormulasNotResults(boolean formulasNotResults) {
  74. this.formulasNotResults = formulasNotResults;
  75. }
  76. /**
  77. * Should cell comments be included? Default is false
  78. */
  79. public void setIncludeCellComments(boolean includeCellComments) {
  80. this.includeCellComments = includeCellComments;
  81. }
  82. /**
  83. * Should headers and footers be included? Default is true
  84. */
  85. public void setIncludeHeadersFooters(boolean includeHeadersFooters) {
  86. this.includeHeadersFooters = includeHeadersFooters;
  87. }
  88. /**
  89. * Should text within textboxes be included? Default is true
  90. * @param includeTextBoxes True if textboxes should be included, false if not.
  91. */
  92. public void setIncludeTextBoxes(boolean includeTextBoxes){
  93. this.includeTextBoxes = includeTextBoxes;
  94. }
  95. /**
  96. * What Locale should be used for formatting numbers (based
  97. * on the styles applied to the cells)
  98. */
  99. public void setLocale(Locale locale) {
  100. this.locale = locale;
  101. }
  102. /**
  103. * Retrieves the text contents of the file
  104. */
  105. public String getText() {
  106. DataFormatter formatter;
  107. if(locale == null) {
  108. formatter = new DataFormatter();
  109. } else {
  110. formatter = new DataFormatter(locale);
  111. }
  112. StringBuilder text = new StringBuilder(64);
  113. for(Sheet sh : workbook) {
  114. XSSFSheet sheet = (XSSFSheet) sh;
  115. if(includeSheetNames) {
  116. text.append(sheet.getSheetName()).append("\n");
  117. }
  118. // Header(s), if present
  119. if(includeHeadersFooters) {
  120. text.append(
  121. extractHeaderFooter(sheet.getFirstHeader())
  122. );
  123. text.append(
  124. extractHeaderFooter(sheet.getOddHeader())
  125. );
  126. text.append(
  127. extractHeaderFooter(sheet.getEvenHeader())
  128. );
  129. }
  130. // Rows and cells
  131. for (Object rawR : sheet) {
  132. Row row = (Row)rawR;
  133. for(Iterator<Cell> ri = row.cellIterator(); ri.hasNext();) {
  134. Cell cell = ri.next();
  135. // Is it a formula one?
  136. if(cell.getCellType() == CellType.FORMULA) {
  137. if (formulasNotResults) {
  138. String contents = cell.getCellFormula();
  139. checkMaxTextSize(text, contents);
  140. text.append(contents);
  141. } else {
  142. if (cell.getCachedFormulaResultType() == CellType.STRING) {
  143. handleStringCell(text, cell);
  144. } else {
  145. handleNonStringCell(text, cell, formatter);
  146. }
  147. }
  148. } else if(cell.getCellType() == CellType.STRING) {
  149. handleStringCell(text, cell);
  150. } else {
  151. handleNonStringCell(text, cell, formatter);
  152. }
  153. // Output the comment, if requested and exists
  154. Comment comment = cell.getCellComment();
  155. if(includeCellComments && comment != null) {
  156. // Replace any newlines with spaces, otherwise it
  157. // breaks the output
  158. String commentText = comment.getString().getString().replace('\n', ' ');
  159. checkMaxTextSize(text, commentText);
  160. text.append(" Comment by ").append(comment.getAuthor()).append(": ").append(commentText);
  161. }
  162. if(ri.hasNext()) {
  163. text.append("\t");
  164. }
  165. }
  166. text.append("\n");
  167. }
  168. // add textboxes
  169. if (includeTextBoxes){
  170. XSSFDrawing drawing = sheet.getDrawingPatriarch();
  171. if (drawing != null) {
  172. for (XSSFShape shape : drawing.getShapes()){
  173. if (shape instanceof XSSFSimpleShape){
  174. String boxText = ((XSSFSimpleShape)shape).getText();
  175. if (boxText.length() > 0){
  176. text.append(boxText);
  177. text.append('\n');
  178. }
  179. }
  180. }
  181. }
  182. }
  183. // Finally footer(s), if present
  184. if(includeHeadersFooters) {
  185. text.append(
  186. extractHeaderFooter(sheet.getFirstFooter())
  187. );
  188. text.append(
  189. extractHeaderFooter(sheet.getOddFooter())
  190. );
  191. text.append(
  192. extractHeaderFooter(sheet.getEvenFooter())
  193. );
  194. }
  195. }
  196. return text.toString();
  197. }
  198. private void handleStringCell(StringBuilder text, Cell cell) {
  199. String contents = cell.getRichStringCellValue().getString();
  200. checkMaxTextSize(text, contents);
  201. text.append(contents);
  202. }
  203. private void handleNonStringCell(StringBuilder text, Cell cell, DataFormatter formatter) {
  204. CellType type = cell.getCellType();
  205. if (type == CellType.FORMULA) {
  206. type = cell.getCachedFormulaResultType();
  207. }
  208. if (type == CellType.NUMERIC) {
  209. CellStyle cs = cell.getCellStyle();
  210. if (cs != null && cs.getDataFormatString() != null) {
  211. String contents = formatter.formatRawCellContents(
  212. cell.getNumericCellValue(), cs.getDataFormat(), cs.getDataFormatString());
  213. checkMaxTextSize(text, contents);
  214. text.append(contents);
  215. return;
  216. }
  217. }
  218. // No supported styling applies to this cell
  219. String contents = ((XSSFCell)cell).getRawValue();
  220. if (contents != null) {
  221. checkMaxTextSize(text, contents);
  222. text.append(contents);
  223. }
  224. }
  225. private String extractHeaderFooter(HeaderFooter hf) {
  226. return ExcelExtractor._extractHeaderFooter(hf);
  227. }
  228. @Override
  229. public XSSFWorkbook getDocument() {
  230. return workbook;
  231. }
  232. @Override
  233. public void setCloseFilesystem(boolean doCloseFilesystem) {
  234. this.doCloseFilesystem = doCloseFilesystem;
  235. }
  236. @Override
  237. public boolean isCloseFilesystem() {
  238. return doCloseFilesystem;
  239. }
  240. @Override
  241. public XSSFWorkbook getFilesystem() {
  242. return workbook;
  243. }
  244. }