From: Nick Burch <nick@apache.org>
Date: Sat, 27 Oct 2007 21:57:10 +0000 (+0000)
Subject: Implement an Excel text extractor, and put all the existing text extractors under... 
X-Git-Tag: REL_3_0_2_BETA1~21
X-Git-Url: https://source.dussan.org/?a=commitdiff_plain;h=6a72c5656aa8a5e70ead0c4a00d41c9c81b171cc;p=poi.git

Implement an Excel text extractor, and put all the existing text extractors under a common superclass, so they're easier to find and use

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@589224 13f79535-47bb-0310-9956-ffa450edef68
---

diff --git a/src/java/org/apache/poi/POITextExtractor.java b/src/java/org/apache/poi/POITextExtractor.java
new file mode 100644
index 0000000000..3ba71880eb
--- /dev/null
+++ b/src/java/org/apache/poi/POITextExtractor.java
@@ -0,0 +1,49 @@
+/* ====================================================================
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+==================================================================== */
+package org.apache.poi;
+
+/**
+ * Common Parent for Text Extractors
+ *  of POI Documents. 
+ * You will typically find the implementation of
+ *  a given format's text extractor under
+ *  org.apache.poi.[format].extractor .
+ * @see org.apache.poi.hssf.extractor.ExcelExtractor
+ * @see org.apache.poi.hslf.extractor.PowerPointExtractor
+ * @see org.apache.poi.hdgf.extractor.VisioTextExtractor
+ * @see org.apache.poi.hwpf.extractor.WordExtractor
+ */
+public abstract class POITextExtractor {
+	/** The POIDocument that's open */
+	protected POIDocument document;
+
+	/**
+	 * Creates a new text extractor for the given document
+	 */
+	public POITextExtractor(POIDocument document) {
+		this.document = document;
+	}
+	
+	/**
+	 * Retrieves all the text from the document.
+	 * How cells, paragraphs etc are separated in the text
+	 *  is implementation specific - see the javadocs for
+	 *  a specific project for details.
+	 * @return All the text from the document
+	 */
+	public abstract String getText();
+}
diff --git a/src/java/org/apache/poi/hssf/extractor/ExcelExtractor.java b/src/java/org/apache/poi/hssf/extractor/ExcelExtractor.java
new file mode 100644
index 0000000000..f45f54dff1
--- /dev/null
+++ b/src/java/org/apache/poi/hssf/extractor/ExcelExtractor.java
@@ -0,0 +1,144 @@
+/* ====================================================================
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+==================================================================== */
+package org.apache.poi.hssf.extractor;
+
+import java.io.IOException;
+
+import org.apache.poi.POITextExtractor;
+import org.apache.poi.hssf.usermodel.HSSFCell;
+import org.apache.poi.hssf.usermodel.HSSFRichTextString;
+import org.apache.poi.hssf.usermodel.HSSFRow;
+import org.apache.poi.hssf.usermodel.HSSFSheet;
+import org.apache.poi.hssf.usermodel.HSSFWorkbook;
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;
+
+/**
+ * A text extractor for Excel files.
+ * Returns the textual content of the file, suitable for 
+ *  indexing by something like Lucene, but not really
+ *  intended for display to the user.
+ * To turn an Excel file into a CSV or similar, see
+ *  the XLS2CSVmra example
+ * @see org.apache.poi.hssf.eventusermodel.examples.XLS2CSVmra
+ */
+public class ExcelExtractor extends POITextExtractor{
+	private HSSFWorkbook wb;
+	private boolean includeSheetNames = true;
+	private boolean formulasNotResults = false;
+	
+	public ExcelExtractor(HSSFWorkbook wb) {
+		super(wb);
+		this.wb = wb;
+	}
+	public ExcelExtractor(POIFSFileSystem fs) throws IOException {
+		this(new HSSFWorkbook(fs));
+	}
+	
+
+	/**
+	 * Should sheet names be included? Default is true
+	 */
+	public void setIncludeSheetNames(boolean includeSheetNames) {
+		this.includeSheetNames = includeSheetNames;
+	}
+	/**
+	 * Should we return the formula itself, and not
+	 *  the result it produces? Default is false
+	 */
+	public void setFormulasNotResults(boolean formulasNotResults) {
+		this.formulasNotResults = formulasNotResults;
+	}
+	
+	/**
+	 * Retrieves the text contents of the file
+	 */
+	public String getText() {
+		StringBuffer text = new StringBuffer();
+		
+		for(int i=0;i<wb.getNumberOfSheets();i++) {
+			HSSFSheet sheet = wb.getSheetAt(i);
+			if(sheet == null) { continue; }
+			
+			if(includeSheetNames) {
+				String name = wb.getSheetName(i);
+				if(name != null) {
+					text.append(name);
+					text.append("\n");
+				}
+			}
+			
+			int firstRow = sheet.getFirstRowNum();
+			int lastRow = sheet.getLastRowNum();
+			for(int j=firstRow;j<=lastRow;j++) {
+				HSSFRow row = sheet.getRow(j);
+				if(row == null) { continue; }
+
+				// Check each cell in turn
+				int firstCell = row.getFirstCellNum();
+				int lastCell = row.getLastCellNum();
+				for(int k=firstCell;k<lastCell;k++) {
+					HSSFCell cell = row.getCell((short)k);
+					boolean outputContents = false;
+					if(cell == null) { continue; }
+					
+					switch(cell.getCellType()) {
+						case HSSFCell.CELL_TYPE_STRING:
+							text.append(cell.getRichStringCellValue().getString());
+							outputContents = true;
+							break;
+						case HSSFCell.CELL_TYPE_NUMERIC:
+							// Note - we don't apply any formatting!
+							text.append(cell.getNumericCellValue());
+							outputContents = true;
+							break;
+						case HSSFCell.CELL_TYPE_BOOLEAN:
+							text.append(cell.getBooleanCellValue());
+							outputContents = true;
+							break;
+						case HSSFCell.CELL_TYPE_FORMULA:
+							if(formulasNotResults) {
+								text.append(cell.getCellFormula());
+							} else {
+								// Try it as a string first; if that is empty, fall back to a number
+								HSSFRichTextString str = 
+									cell.getRichStringCellValue();
+								if(str != null && str.length() > 0) {
+									text.append(str.toString());
+								} else {
+									// Try and treat it as a number
+									double val = cell.getNumericCellValue();
+									text.append(val);
+								}
+							}
+							outputContents = true;
+							break;
+					}
+					
+					// Output a tab if we're not on the last cell
+					if(outputContents && k < (lastCell-1)) {
+						text.append("\t");
+					}
+				}
+				
+				// Finish off the row
+				text.append("\n");
+			}
+		}
+		
+		return text.toString();
+	}
+}
diff --git a/src/scratchpad/src/org/apache/poi/hdgf/extractor/VisioTextExtractor.java b/src/scratchpad/src/org/apache/poi/hdgf/extractor/VisioTextExtractor.java
index b2c4ee37f9..a7857e46f2 100644
--- a/src/scratchpad/src/org/apache/poi/hdgf/extractor/VisioTextExtractor.java
+++ b/src/scratchpad/src/org/apache/poi/hdgf/extractor/VisioTextExtractor.java
@@ -21,6 +21,7 @@ import java.io.IOException;
 import java.io.InputStream;
 import java.util.ArrayList;
 
+import org.apache.poi.POITextExtractor;
 import org.apache.poi.hdgf.HDGFDiagram;
 import org.apache.poi.hdgf.chunks.Chunk.Command;
 import org.apache.poi.hdgf.streams.ChunkStream;
@@ -33,11 +34,12 @@ import org.apache.poi.poifs.filesystem.POIFSFileSystem;
  * Can opperate on the command line (outputs to stdout), or
  *  can return the text for you (eg for use with Lucene).
  */
-public class VisioTextExtractor {
+public class VisioTextExtractor extends POITextExtractor {
 	private HDGFDiagram hdgf;
 	private POIFSFileSystem fs;
 
 	public VisioTextExtractor(HDGFDiagram hdgf) {
+		super(hdgf);
 		this.hdgf = hdgf;
 	}
 	public VisioTextExtractor(POIFSFileSystem fs) throws IOException {
@@ -84,6 +86,8 @@ public class VisioTextExtractor {
 	
 	/**
 	 * Returns the textual contents of the file.
+	 * Each textual object's text will be separated
+	 *  by a newline
 	 */
 	public String getText() {
 		StringBuffer text = new StringBuffer();
diff --git a/src/scratchpad/src/org/apache/poi/hslf/extractor/PowerPointExtractor.java b/src/scratchpad/src/org/apache/poi/hslf/extractor/PowerPointExtractor.java
index e393b4620b..0fc6f5e847 100644
--- a/src/scratchpad/src/org/apache/poi/hslf/extractor/PowerPointExtractor.java
+++ b/src/scratchpad/src/org/apache/poi/hslf/extractor/PowerPointExtractor.java
@@ -22,6 +22,8 @@ package org.apache.poi.hslf.extractor;
 
 import java.io.*;
 import java.util.HashSet;
+
+import org.apache.poi.POITextExtractor;
 import org.apache.poi.poifs.filesystem.POIFSFileSystem;
 import org.apache.poi.hslf.*;
 import org.apache.poi.hslf.model.*;
@@ -34,12 +36,12 @@ import org.apache.poi.hslf.usermodel.*;
  * @author Nick Burch
  */
 
-public class PowerPointExtractor
+public class PowerPointExtractor extends POITextExtractor
 {
-  private HSLFSlideShow _hslfshow;
-  private SlideShow _show;
-  private Slide[] _slides;
-  private Notes[] _notes;
+	private HSLFSlideShow _hslfshow;
+	private SlideShow _show;
+	private Slide[] _slides;
+	private Notes[] _notes;
 
   /**
    * Basic extractor. Returns all the text, and optionally all the notes
@@ -66,61 +68,50 @@ public class PowerPointExtractor
 	ppe.close();
   }
 
-  /**
-   * Creates a PowerPointExtractor, from a file
-   * @param fileName The name of the file to extract from
-   */
-  public PowerPointExtractor(String fileName) throws IOException {
-	_hslfshow = new HSLFSlideShow(fileName);
-	_show = new SlideShow(_hslfshow);
-	_slides = _show.getSlides();
-	_notes = _show.getNotes();
-  }
-
-  /**
-   * Creates a PowerPointExtractor, from an Input Stream
-   * @param iStream The input stream containing the PowerPoint document
-   */
-  public PowerPointExtractor(InputStream iStream) throws IOException {
-	_hslfshow = new HSLFSlideShow(iStream);
-	_show = new SlideShow(_hslfshow);
-	_slides = _show.getSlides();
-	_notes = _show.getNotes();
-  }
-
-  /**
-   * Creates a PowerPointExtractor, from an open POIFSFileSystem
-   * @param fs the POIFSFileSystem containing the PowerPoint document
-   */
-  public PowerPointExtractor(POIFSFileSystem fs) throws IOException {
-	_hslfshow = new HSLFSlideShow(fs);
-	_show = new SlideShow(_hslfshow);
-	_slides = _show.getSlides();
-	_notes = _show.getNotes();
-  }
-
-  /**
-   * Creates a PowerPointExtractor, from a HSLFSlideShow
-   * @param ss the HSLFSlideShow to extract text from
-   */
-  public PowerPointExtractor(HSLFSlideShow ss) throws IOException {
-	_hslfshow = ss;
-	_show = new SlideShow(_hslfshow);
-	_slides = _show.getSlides();
-	_notes = _show.getNotes();
-  }
+	/**
+	 * Creates a PowerPointExtractor, from a file
+	 * @param fileName The name of the file to extract from
+	 */
+	public PowerPointExtractor(String fileName) throws IOException {
+		this(new FileInputStream(fileName));
+	}
+	/**
+	 * Creates a PowerPointExtractor, from an Input Stream
+	 * @param iStream The input stream containing the PowerPoint document
+	 */
+	public PowerPointExtractor(InputStream iStream) throws IOException {
+		this(new POIFSFileSystem(iStream));
+	}
+	/**
+	 * Creates a PowerPointExtractor, from an open POIFSFileSystem
+	 * @param fs the POIFSFileSystem containing the PowerPoint document
+	 */
+	public PowerPointExtractor(POIFSFileSystem fs) throws IOException {
+		this(new HSLFSlideShow(fs));
+	}
 
+	/**
+	 * Creates a PowerPointExtractor, from a HSLFSlideShow
+	 * @param ss the HSLFSlideShow to extract text from
+	 */
+	public PowerPointExtractor(HSLFSlideShow ss) throws IOException {
+		super(ss);
+		_hslfshow = ss;
+		_show = new SlideShow(_hslfshow);
+		_slides = _show.getSlides();
+		_notes = _show.getNotes();
+	}
 
-  /**
-   * Shuts down the underlying streams
-   */
-  public void close() throws IOException {
-	_hslfshow.close();
-	_hslfshow = null;
-	_show = null;
-	_slides = null;
-	_notes = null;
-  }
+	/**
+	 * Shuts down the underlying streams
+	 */
+	public void close() throws IOException {
+		_hslfshow.close();
+		_hslfshow = null;
+		_show = null;
+		_slides = null;
+		_notes = null;
+	}
 
 
   /**
@@ -195,4 +186,4 @@ public class PowerPointExtractor
 
 	return ret.toString();
   }
-}
+}
\ No newline at end of file
diff --git a/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java b/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java
index dac3a969dd..6f15ee1f9a 100644
--- a/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java
@@ -22,6 +22,7 @@ import java.io.FileInputStream;
 import java.io.UnsupportedEncodingException;
 import java.util.Iterator;
 
+import org.apache.poi.POITextExtractor;
 import org.apache.poi.hwpf.HWPFDocument;
 import org.apache.poi.hwpf.model.TextPiece;
 import org.apache.poi.hwpf.usermodel.Paragraph;
@@ -36,7 +37,7 @@ import org.apache.poi.poifs.filesystem.POIFSFileSystem;
  *
  * @author Nick Burch (nick at torchbox dot com)
  */
-public class WordExtractor {
+public class WordExtractor extends POITextExtractor {
 	private POIFSFileSystem fs;
 	private HWPFDocument doc;
 	
@@ -62,6 +63,7 @@ public class WordExtractor {
 	 * @param doc The HWPFDocument to extract from
 	 */
 	public WordExtractor(HWPFDocument doc) throws IOException {
+		super(doc);
 		this.doc = doc;
 	}
 
diff --git a/src/testcases/org/apache/poi/hssf/extractor/TestExcelExtractor.java b/src/testcases/org/apache/poi/hssf/extractor/TestExcelExtractor.java
new file mode 100644
index 0000000000..027495a1b0
--- /dev/null
+++ b/src/testcases/org/apache/poi/hssf/extractor/TestExcelExtractor.java
@@ -0,0 +1,101 @@
+/* ====================================================================
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+==================================================================== */
+package org.apache.poi.hssf.extractor;
+
+import java.io.File;
+import java.io.FileInputStream;
+
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;
+
+import junit.framework.TestCase;
+
+public class TestExcelExtractor extends TestCase {
+	public void testSimple() throws Exception {
+		String path = System.getProperty("HSSF.testdata.path");
+		FileInputStream fin = new FileInputStream(path + File.separator + "Simple.xls");
+		
+		ExcelExtractor extractor = new ExcelExtractor(new POIFSFileSystem(fin));
+		
+		assertEquals("Sheet1\nreplaceMe\nSheet2\nSheet3\n", extractor.getText());
+		
+		// Now turn off sheet names
+		extractor.setIncludeSheetNames(false);
+		assertEquals("replaceMe\n", extractor.getText());
+	}
+	
+	public void testNumericFormula() throws Exception {
+		String path = System.getProperty("HSSF.testdata.path");
+		FileInputStream fin = new FileInputStream(path + File.separator + "sumifformula.xls");
+		
+		ExcelExtractor extractor = new ExcelExtractor(new POIFSFileSystem(fin));
+		
+		assertEquals(
+				"Sheet1\n" +
+				"1000.0\t1.0\t5.0\n" +
+				"2000.0\t2.0\t\n" +	
+				"3000.0\t3.0\t\n" +
+				"4000.0\t4.0\t\n" + 
+				"5000.0\t5.0\t\n" +
+				"Sheet2\nSheet3\n", 
+				extractor.getText()
+		);
+		
+		extractor.setFormulasNotResults(true);
+		
+		assertEquals(
+				"Sheet1\n" +
+				"1000.0\t1.0\tSUMIF(A1:A5,\">4000\",B1:B5)\n" +
+				"2000.0\t2.0\t\n" +	
+				"3000.0\t3.0\t\n" +
+				"4000.0\t4.0\t\n" + 
+				"5000.0\t5.0\t\n" +
+				"Sheet2\nSheet3\n", 
+				extractor.getText()
+		);
+	}
+	
+	
+	public void testStringConcat() throws Exception {
+		String path = System.getProperty("HSSF.testdata.path");
+		FileInputStream fin = new FileInputStream(path + File.separator + "SimpleWithFormula.xls");
+		
+		ExcelExtractor extractor = new ExcelExtractor(new POIFSFileSystem(fin));
+		
+		// Comes out as NaN if treated as a number
+		// And as replacemereplaceme if treated as a string
+		assertEquals("Sheet1\nreplaceme\nreplaceme\nreplacemereplaceme\nSheet2\nSheet3\n", extractor.getText());
+		
+		extractor.setFormulasNotResults(true);
+		
+		assertEquals("Sheet1\nreplaceme\nreplaceme\nCONCATENATE(A1,A2)\nSheet2\nSheet3\n", extractor.getText());
+	}
+	
+	public void testStringFormula() throws Exception {
+		String path = System.getProperty("HSSF.testdata.path");
+		FileInputStream fin = new FileInputStream(path + File.separator + "StringFormulas.xls");
+		
+		ExcelExtractor extractor = new ExcelExtractor(new POIFSFileSystem(fin));
+		
+		// Comes out as NaN if treated as a number
+		// And as XYZ if treated as a string
+		assertEquals("Sheet1\nXYZ\nSheet2\nSheet3\n", extractor.getText());
+		
+		extractor.setFormulasNotResults(true);
+		
+		assertEquals("Sheet1\nUPPER(\"xyz\")\nSheet2\nSheet3\n", extractor.getText());
+	}
+}