--- /dev/null
+/* ====================================================================
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+==================================================================== */
+package org.apache.poi;
+
+/**
+ * Common Parent for Text Extractors
+ * of POI Documents.
+ * Holds the document the extractor was created for, and
+ * declares the single getText() operation that each
+ * format specific extractor has to implement.
+ * You will typically find the implementation of
+ * a given format's text extractor under
+ * org.apache.poi.[format].extractor .
+ * @see org.apache.poi.hssf.extractor.ExcelExtractor
+ * @see org.apache.poi.hslf.extractor.PowerPointExtractor
+ * @see org.apache.poi.hdgf.extractor.VisioTextExtractor
+ * @see org.apache.poi.hwpf.extractor.WordExtractor
+ */
+public abstract class POITextExtractor {
+ /** The POIDocument this extractor was created for; directly accessible to subclasses */
+ protected POIDocument document;
+
+ /**
+ * Creates a new text extractor for the given document
+ * @param document the document to extract text from. It is stored
+ * as given; no null check or copy is made here
+ */
+ public POITextExtractor(POIDocument document) {
+ this.document = document;
+ }
+
+ /**
+ * Retrieves all the text from the document.
+ * How cells, paragraphs etc are separated in the text
+ * is implementation specific - see the javadocs for
+ * a specific project for details.
+ * @return All the text from the document
+ */
+ public abstract String getText();
+}
--- /dev/null
+/* ====================================================================
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+==================================================================== */
+package org.apache.poi.hssf.extractor;
+
+import java.io.IOException;
+
+import org.apache.poi.POITextExtractor;
+import org.apache.poi.hssf.usermodel.HSSFCell;
+import org.apache.poi.hssf.usermodel.HSSFRichTextString;
+import org.apache.poi.hssf.usermodel.HSSFRow;
+import org.apache.poi.hssf.usermodel.HSSFSheet;
+import org.apache.poi.hssf.usermodel.HSSFWorkbook;
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;
+
+/**
+ * A text extractor for Excel files.
+ * Returns the textual content of the file, suitable for
+ *  indexing by something like Lucene, but not really
+ *  intended for display to the user.
+ * Sheets are output in workbook order. Every row that exists
+ *  is finished with a newline, and a cell that produced output
+ *  is followed by a tab unless it sits in the row's last column.
+ * To turn an excel file into a CSV or similar, then see
+ *  the XLS2CSVmra example
+ * @see org.apache.poi.hssf.eventusermodel.examples.XLS2CSVmra
+ */
+public class ExcelExtractor extends POITextExtractor{
+    /** The workbook the text is pulled from */
+    private HSSFWorkbook wb;
+    /** Should each sheet's name be output ahead of its rows? */
+    private boolean includeSheetNames = true;
+    /** Should formula cells give their formula, rather than its result? */
+    private boolean formulasNotResults = false;
+
+    /**
+     * Creates an extractor for an already loaded workbook
+     * @param wb the workbook to extract the text of
+     */
+    public ExcelExtractor(HSSFWorkbook wb) {
+        super(wb);
+        this.wb = wb;
+    }
+    /**
+     * Creates an extractor for the workbook held in the
+     *  given filesystem
+     * @param fs the POIFSFileSystem containing the Excel document
+     * @throws IOException if the workbook can't be loaded from the filesystem
+     */
+    public ExcelExtractor(POIFSFileSystem fs) throws IOException {
+        this(new HSSFWorkbook(fs));
+    }
+
+
+    /**
+     * Should sheet names be included? Default is true
+     * @param includeSheetNames true to output each sheet's name on
+     *  its own line before that sheet's rows
+     */
+    public void setIncludeSheetNames(boolean includeSheetNames) {
+        this.includeSheetNames = includeSheetNames;
+    }
+    /**
+     * Should we return the formula itself, and not
+     *  the result it produces? Default is false
+     * @param formulasNotResults true to output the formula text of
+     *  formula cells, false to output their result
+     */
+    public void setFormulasNotResults(boolean formulasNotResults) {
+        this.formulasNotResults = formulasNotResults;
+    }
+
+    /**
+     * Retrieves the text contents of the file.
+     * String, numeric, boolean and formula cells are output; cells
+     *  of any other type produce no text. Numeric values are
+     *  output without any cell formatting applied.
+     * For a formula cell (when results are wanted) a non-empty
+     *  string result is used if there is one, otherwise the numeric
+     *  result. A numeric result of NaN is left out, so that a formula
+     *  with no usable result doesn't put the text "NaN" into the output.
+     * @return the text of all the sheets, one row per line
+     */
+    public String getText() {
+        StringBuffer text = new StringBuffer();
+
+        for(int i=0;i<wb.getNumberOfSheets();i++) {
+            HSSFSheet sheet = wb.getSheetAt(i);
+            if(sheet == null) { continue; }
+
+            if(includeSheetNames) {
+                String name = wb.getSheetName(i);
+                if(name != null) {
+                    text.append(name);
+                    text.append("\n");
+                }
+            }
+
+            int firstRow = sheet.getFirstRowNum();
+            int lastRow = sheet.getLastRowNum();
+            for(int j=firstRow;j<=lastRow;j++) {
+                HSSFRow row = sheet.getRow(j);
+                // Rows that were never created give no output, not even a newline
+                if(row == null) { continue; }
+
+                // Check each cell in turn
+                int firstCell = row.getFirstCellNum();
+                int lastCell = row.getLastCellNum();
+                for(int k=firstCell;k<lastCell;k++) {
+                    HSSFCell cell = row.getCell((short)k);
+                    boolean outputContents = false;
+                    if(cell == null) { continue; }
+
+                    switch(cell.getCellType()) {
+                        case HSSFCell.CELL_TYPE_STRING:
+                            text.append(cell.getRichStringCellValue().getString());
+                            outputContents = true;
+                            break;
+                        case HSSFCell.CELL_TYPE_NUMERIC:
+                            // Note - we don't apply any formatting!
+                            text.append(cell.getNumericCellValue());
+                            outputContents = true;
+                            break;
+                        case HSSFCell.CELL_TYPE_BOOLEAN:
+                            text.append(cell.getBooleanCellValue());
+                            outputContents = true;
+                            break;
+                        case HSSFCell.CELL_TYPE_FORMULA:
+                            if(formulasNotResults) {
+                                text.append(cell.getCellFormula());
+                            } else {
+                                // Try it as a string, if not as a number
+                                HSSFRichTextString str =
+                                    cell.getRichStringCellValue();
+                                if(str != null && str.length() > 0) {
+                                    text.append(str.toString());
+                                } else {
+                                    // Try and treat it as a number. A formula
+                                    //  without a numeric result comes out as
+                                    //  NaN, which is of no use in the text,
+                                    //  so only real numbers are output
+                                    double val = cell.getNumericCellValue();
+                                    if(!Double.isNaN(val)) {
+                                        text.append(val);
+                                    }
+                                }
+                            }
+                            // The cell still counts as output, so the tab
+                            //  separators are unaffected by a skipped NaN
+                            outputContents = true;
+                            break;
+                    }
+
+                    // Output a tab if we're not on the last cell
+                    if(outputContents && k < (lastCell-1)) {
+                        text.append("\t");
+                    }
+                }
+
+                // Finish off the row
+                text.append("\n");
+            }
+        }
+
+        return text.toString();
+    }
+}
import java.io.InputStream;
import java.util.ArrayList;
+import org.apache.poi.POITextExtractor;
import org.apache.poi.hdgf.HDGFDiagram;
import org.apache.poi.hdgf.chunks.Chunk.Command;
import org.apache.poi.hdgf.streams.ChunkStream;
* Can opperate on the command line (outputs to stdout), or
* can return the text for you (eg for use with Lucene).
*/
-public class VisioTextExtractor {
+public class VisioTextExtractor extends POITextExtractor {
private HDGFDiagram hdgf;
private POIFSFileSystem fs;
public VisioTextExtractor(HDGFDiagram hdgf) {
+ super(hdgf);
this.hdgf = hdgf;
}
public VisioTextExtractor(POIFSFileSystem fs) throws IOException {
/**
* Returns the textual contents of the file.
+ * Each textual object's text will be separated
+ * by a newline
*/
public String getText() {
StringBuffer text = new StringBuffer();
import java.io.*;
import java.util.HashSet;
+
+import org.apache.poi.POITextExtractor;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.hslf.*;
import org.apache.poi.hslf.model.*;
* @author Nick Burch
*/
-public class PowerPointExtractor
+public class PowerPointExtractor extends POITextExtractor
{
- private HSLFSlideShow _hslfshow;
- private SlideShow _show;
- private Slide[] _slides;
- private Notes[] _notes;
+ private HSLFSlideShow _hslfshow;
+ private SlideShow _show;
+ private Slide[] _slides;
+ private Notes[] _notes;
/**
* Basic extractor. Returns all the text, and optionally all the notes
ppe.close();
}
- /**
- * Creates a PowerPointExtractor, from a file
- * @param fileName The name of the file to extract from
- */
- public PowerPointExtractor(String fileName) throws IOException {
- _hslfshow = new HSLFSlideShow(fileName);
- _show = new SlideShow(_hslfshow);
- _slides = _show.getSlides();
- _notes = _show.getNotes();
- }
-
- /**
- * Creates a PowerPointExtractor, from an Input Stream
- * @param iStream The input stream containing the PowerPoint document
- */
- public PowerPointExtractor(InputStream iStream) throws IOException {
- _hslfshow = new HSLFSlideShow(iStream);
- _show = new SlideShow(_hslfshow);
- _slides = _show.getSlides();
- _notes = _show.getNotes();
- }
-
- /**
- * Creates a PowerPointExtractor, from an open POIFSFileSystem
- * @param fs the POIFSFileSystem containing the PowerPoint document
- */
- public PowerPointExtractor(POIFSFileSystem fs) throws IOException {
- _hslfshow = new HSLFSlideShow(fs);
- _show = new SlideShow(_hslfshow);
- _slides = _show.getSlides();
- _notes = _show.getNotes();
- }
-
- /**
- * Creates a PowerPointExtractor, from a HSLFSlideShow
- * @param ss the HSLFSlideShow to extract text from
- */
- public PowerPointExtractor(HSLFSlideShow ss) throws IOException {
- _hslfshow = ss;
- _show = new SlideShow(_hslfshow);
- _slides = _show.getSlides();
- _notes = _show.getNotes();
- }
+ /**
+ * Creates a PowerPointExtractor, from a file.
+ * Opens a FileInputStream on the named file, and hands it
+ * to the InputStream constructor.
+ * NOTE(review): the stream opened here is not closed by this
+ * constructor - confirm whether the classes it is passed on
+ * to take care of closing it.
+ * @param fileName The name of the file to extract from
+ * @throws IOException if the file can't be opened, or if building
+ * the extractor from its contents fails
+ */
+ public PowerPointExtractor(String fileName) throws IOException {
+ this(new FileInputStream(fileName));
+ }
+ /**
+ * Creates a PowerPointExtractor, from an Input Stream.
+ * The stream is used to build a new POIFSFileSystem, which
+ * is then handed to the POIFSFileSystem constructor.
+ * @param iStream The input stream containing the PowerPoint document
+ * @throws IOException if building the POIFSFileSystem, or the
+ * extractor on top of it, fails
+ */
+ public PowerPointExtractor(InputStream iStream) throws IOException {
+ this(new POIFSFileSystem(iStream));
+ }
+ /**
+ * Creates a PowerPointExtractor, from an open POIFSFileSystem.
+ * A new HSLFSlideShow is built from the filesystem, and
+ * handed to the HSLFSlideShow constructor.
+ * @param fs the POIFSFileSystem containing the PowerPoint document
+ * @throws IOException if building the HSLFSlideShow, or the
+ * extractor on top of it, fails
+ */
+ public PowerPointExtractor(POIFSFileSystem fs) throws IOException {
+ this(new HSLFSlideShow(fs));
+ }
+ /**
+ * Creates a PowerPointExtractor, from a HSLFSlideShow.
+ * The String, InputStream and POIFSFileSystem constructors
+ * all delegate to this one.
+ * The slideshow is passed to the POITextExtractor parent and
+ * kept locally, a SlideShow is built on top of it, and that
+ * SlideShow's slides and notes are fetched once and held
+ * in fields.
+ * @param ss the HSLFSlideShow to extract text from
+ * @throws IOException declared on this constructor; presumably
+ * raised while the SlideShow is built - TODO confirm
+ */
+ public PowerPointExtractor(HSLFSlideShow ss) throws IOException {
+ super(ss);
+ _hslfshow = ss;
+ _show = new SlideShow(_hslfshow);
+ _slides = _show.getSlides();
+ _notes = _show.getNotes();
+ }
- /**
- * Shuts down the underlying streams
- */
- public void close() throws IOException {
- _hslfshow.close();
- _hslfshow = null;
- _show = null;
- _slides = null;
- _notes = null;
- }
+ /**
+ * Shuts down the underlying streams.
+ * Calls close() on the HSLFSlideShow, then sets this class's
+ * own references to the HSLFSlideShow, SlideShow, slides and
+ * notes to null.
+ * The document field inherited from POITextExtractor is not
+ * cleared here. If closing the HSLFSlideShow throws, none of
+ * the references are cleared.
+ * NOTE(review): with the fields set to null, the extractor
+ * presumably can't be used again after this call - confirm.
+ * @throws IOException if closing the HSLFSlideShow fails
+ */
+ public void close() throws IOException {
+ _hslfshow.close();
+ _hslfshow = null;
+ _show = null;
+ _slides = null;
+ _notes = null;
+ }
/**
return ret.toString();
}
-}
+}
\ No newline at end of file
import java.io.UnsupportedEncodingException;
import java.util.Iterator;
+import org.apache.poi.POITextExtractor;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.model.TextPiece;
import org.apache.poi.hwpf.usermodel.Paragraph;
*
* @author Nick Burch (nick at torchbox dot com)
*/
-public class WordExtractor {
+public class WordExtractor extends POITextExtractor {
private POIFSFileSystem fs;
private HWPFDocument doc;
* @param doc The HWPFDocument to extract from
*/
public WordExtractor(HWPFDocument doc) throws IOException {
+ super(doc);
this.doc = doc;
}
--- /dev/null
+/* ====================================================================
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+==================================================================== */
+package org.apache.poi.hssf.extractor;
+
+import java.io.File;
+import java.io.FileInputStream;
+
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;
+
+import junit.framework.TestCase;
+
+/**
+ * Tests for the Excel text extractor, run against the sample
+ *  workbooks found in the directory given by the
+ *  HSSF.testdata.path system property.
+ */
+public class TestExcelExtractor extends TestCase {
+    /**
+     * Builds an extractor for one of the sample workbooks.
+     * The file stream is closed again once the extractor has
+     *  been built (or building it has failed), so that the
+     *  tests don't leave file handles open.
+     * @param sampleName file name of the workbook, within the test data directory
+     * @return an extractor for that workbook
+     */
+    private ExcelExtractor openExtractor(String sampleName) throws Exception {
+        String path = System.getProperty("HSSF.testdata.path");
+        FileInputStream fin = new FileInputStream(path + File.separator + sampleName);
+        try {
+            return new ExcelExtractor(new POIFSFileSystem(fin));
+        } finally {
+            fin.close();
+        }
+    }
+
+    /**
+     * A workbook with one text cell, with and without sheet names
+     */
+    public void testSimple() throws Exception {
+        ExcelExtractor extractor = openExtractor("Simple.xls");
+
+        assertEquals("Sheet1\nreplaceMe\nSheet2\nSheet3\n", extractor.getText());
+
+        // Now turn off sheet names
+        extractor.setIncludeSheetNames(false);
+        assertEquals("replaceMe\n", extractor.getText());
+    }
+
+    /**
+     * A formula with a numeric result, as result and as formula text
+     */
+    public void testNumericFormula() throws Exception {
+        ExcelExtractor extractor = openExtractor("sumifformula.xls");
+
+        assertEquals(
+                "Sheet1\n" +
+                "1000.0\t1.0\t5.0\n" +
+                "2000.0\t2.0\t\n" +
+                "3000.0\t3.0\t\n" +
+                "4000.0\t4.0\t\n" +
+                "5000.0\t5.0\t\n" +
+                "Sheet2\nSheet3\n",
+                extractor.getText()
+        );
+
+        extractor.setFormulasNotResults(true);
+
+        assertEquals(
+                "Sheet1\n" +
+                "1000.0\t1.0\tSUMIF(A1:A5,\">4000\",B1:B5)\n" +
+                "2000.0\t2.0\t\n" +
+                "3000.0\t3.0\t\n" +
+                "4000.0\t4.0\t\n" +
+                "5000.0\t5.0\t\n" +
+                "Sheet2\nSheet3\n",
+                extractor.getText()
+        );
+    }
+
+    /**
+     * A formula joining two text cells, as result and as formula text
+     */
+    public void testStringConcat() throws Exception {
+        ExcelExtractor extractor = openExtractor("SimpleWithFormula.xls");
+
+        // Comes out as NaN if treated as a number
+        // And as the joined up text if treated as a string
+        assertEquals("Sheet1\nreplaceme\nreplaceme\nreplacemereplaceme\nSheet2\nSheet3\n", extractor.getText());
+
+        extractor.setFormulasNotResults(true);
+
+        assertEquals("Sheet1\nreplaceme\nreplaceme\nCONCATENATE(A1,A2)\nSheet2\nSheet3\n", extractor.getText());
+    }
+
+    /**
+     * A formula with a text result, as result and as formula text
+     */
+    public void testStringFormula() throws Exception {
+        ExcelExtractor extractor = openExtractor("StringFormulas.xls");
+
+        // Comes out as NaN if treated as a number
+        // And as XYZ if treated as a string
+        assertEquals("Sheet1\nXYZ\nSheet2\nSheet3\n", extractor.getText());
+
+        extractor.setFormulasNotResults(true);
+
+        assertEquals("Sheet1\nUPPER(\"xyz\")\nSheet2\nSheet3\n", extractor.getText());
+    }
+}