Implement an Excel text extractor, and put all the existing text extractors under...

author Nick Burch <nick@apache.org>

Sat, 27 Oct 2007 21:57:10 +0000 (21:57 +0000)

committer Nick Burch <nick@apache.org>

Sat, 27 Oct 2007 21:57:10 +0000 (21:57 +0000)
author Nick Burch <nick@apache.org>
Sat, 27 Oct 2007 21:57:10 +0000 (21:57 +0000)
committer Nick Burch <nick@apache.org>
Sat, 27 Oct 2007 21:57:10 +0000 (21:57 +0000)
diff --git a/src/java/org/apache/poi/POITextExtractor.java b/src/java/org/apache/poi/POITextExtractor.java

new file mode 100644 (file)

index 0000000..3ba7188
--- /dev/null
+++ b/src/java/org/apache/poi/POITextExtractor.java
@@ -0,0 +1,49 @@
+/* ====================================================================
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+==================================================================== */
+package org.apache.poi;
+
+/**
+ * Common Parent for Text Extractors
+ *  of POI Documents. 
+ * You will typically find the implementation of
+ *  a given format's text extractor under
+ *  org.apache.poi.[format].extractor .
+ * @see org.apache.poi.hssf.extractor.ExcelExtractor
+ * @see org.apache.poi.hslf.extractor.PowerPointExtractor
+ * @see org.apache.poi.hdgf.extractor.VisioTextExtractor
+ * @see org.apache.poi.hwpf.extractor.WordExtractor
+ */
+public abstract class POITextExtractor {
+       /** The POIDocument that's open */
+       protected POIDocument document;
+
+       /**
+        * Creates a new text extractor for the given document
+        */
+       public POITextExtractor(POIDocument document) {
+               this.document = document;
+       }
+       
+       /**
+        * Retrieves all the text from the document.
+        * How cells, paragraphs etc are separated in the text
+        *  is implementation specific - see the javadocs for
+        *  a specific project for details.
+        * @return All the text from the document
+        */
+       public abstract String getText();
+}
diff --git a/src/java/org/apache/poi/hssf/extractor/ExcelExtractor.java b/src/java/org/apache/poi/hssf/extractor/ExcelExtractor.java

new file mode 100644 (file)

index 0000000..f45f54d
--- /dev/null
+++ b/src/java/org/apache/poi/hssf/extractor/ExcelExtractor.java
@@ -0,0 +1,144 @@
+/* ====================================================================
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+==================================================================== */
+package org.apache.poi.hssf.extractor;
+
+import java.io.IOException;
+
+import org.apache.poi.POITextExtractor;
+import org.apache.poi.hssf.usermodel.HSSFCell;
+import org.apache.poi.hssf.usermodel.HSSFRichTextString;
+import org.apache.poi.hssf.usermodel.HSSFRow;
+import org.apache.poi.hssf.usermodel.HSSFSheet;
+import org.apache.poi.hssf.usermodel.HSSFWorkbook;
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;
+
+/**
+ * A text extractor for Excel files.
+ * Returns the textual content of the file, suitable for 
+ *  indexing by something like Lucene, but not really
+ *  intended for display to the user.
+ * To turn an Excel file into a CSV or similar, see
+ *  the XLS2CSVmra example.
+ * @see org.apache.poi.hssf.eventusermodel.examples.XLS2CSVmra
+ */
+public class ExcelExtractor extends POITextExtractor{
+       private HSSFWorkbook wb;
+       private boolean includeSheetNames = true;
+       private boolean formulasNotResults = false;
+       
+       public ExcelExtractor(HSSFWorkbook wb) {
+               super(wb);
+               this.wb = wb;
+       }
+       public ExcelExtractor(POIFSFileSystem fs) throws IOException {
+               this(new HSSFWorkbook(fs));
+       }
+       
+
+       /**
+        * Should sheet names be included? Default is true
+        */
+       public void setIncludeSheetNames(boolean includeSheetNames) {
+               this.includeSheetNames = includeSheetNames;
+       }
+       /**
+        * Should we return the formula itself, and not
+        *  the result it produces? Default is false
+        */
+       public void setFormulasNotResults(boolean formulasNotResults) {
+               this.formulasNotResults = formulasNotResults;
+       }
+       
+       /**
+        * Retrieves the text contents of the file
+        */
+       public String getText() {
+               StringBuffer text = new StringBuffer();
+               
+               for(int i=0;i<wb.getNumberOfSheets();i++) {
+                       HSSFSheet sheet = wb.getSheetAt(i);
+                       if(sheet == null) { continue; }
+                       
+                       if(includeSheetNames) {
+                               String name = wb.getSheetName(i);
+                               if(name != null) {
+                                       text.append(name);
+                                       text.append("\n");
+                               }
+                       }
+                       
+                       int firstRow = sheet.getFirstRowNum();
+                       int lastRow = sheet.getLastRowNum();
+                       for(int j=firstRow;j<=lastRow;j++) {
+                               HSSFRow row = sheet.getRow(j);
+                               if(row == null) { continue; }
+
+                               // Check each cell in turn
+                               int firstCell = row.getFirstCellNum();
+                               int lastCell = row.getLastCellNum();
+                               for(int k=firstCell;k<lastCell;k++) {
+                                       HSSFCell cell = row.getCell((short)k);
+                                       boolean outputContents = false;
+                                       if(cell == null) { continue; }
+                                       
+                                       switch(cell.getCellType()) {
+                                               case HSSFCell.CELL_TYPE_STRING:
+                                                       text.append(cell.getRichStringCellValue().getString());
+                                                       outputContents = true;
+                                                       break;
+                                               case HSSFCell.CELL_TYPE_NUMERIC:
+                                                       // Note - we don't apply any formatting!
+                                                       text.append(cell.getNumericCellValue());
+                                                       outputContents = true;
+                                                       break;
+                                               case HSSFCell.CELL_TYPE_BOOLEAN:
+                                                       text.append(cell.getBooleanCellValue());
+                                                       outputContents = true;
+                                                       break;
+                                               case HSSFCell.CELL_TYPE_FORMULA:
+                                                       if(formulasNotResults) {
+                                                               text.append(cell.getCellFormula());
+                                                       } else {
+                                                               // Try it as a string, if not as a number
+                                                               HSSFRichTextString str = 
+                                                                       cell.getRichStringCellValue();
+                                                               if(str != null && str.length() > 0) {
+                                                                       text.append(str.toString());
+                                                               } else {
+                                                                       // Try and treat it as a number
+                                                                       double val = cell.getNumericCellValue();
+                                                                       text.append(val);
+                                                               }
+                                                       }
+                                                       outputContents = true;
+                                                       break;
+                                       }
+                                       
+                                       // Output a tab if we're not on the last cell
+                                       if(outputContents && k < (lastCell-1)) {
+                                               text.append("\t");
+                                       }
+                               }
+                               
+                               // Finish off the row
+                               text.append("\n");
+                       }
+               }
+               
+               return text.toString();
+       }
+}
diff --git a/src/scratchpad/src/org/apache/poi/hdgf/extractor/VisioTextExtractor.java b/src/scratchpad/src/org/apache/poi/hdgf/extractor/VisioTextExtractor.java

index b2c4ee37f9faa8e5b868f0d94abc5c4a7d18efc8..a7857e46f29ea426ed990b2860ebdec38e803caf 100644 (file)
--- a/src/scratchpad/src/org/apache/poi/hdgf/extractor/VisioTextExtractor.java
+++ b/src/scratchpad/src/org/apache/poi/hdgf/extractor/VisioTextExtractor.java
@@ -21,6 +21,7 @@ import java.io.IOException;
  import java.io.InputStream;
  import java.util.ArrayList;
  
+import org.apache.poi.POITextExtractor;
  import org.apache.poi.hdgf.HDGFDiagram;
  import org.apache.poi.hdgf.chunks.Chunk.Command;
  import org.apache.poi.hdgf.streams.ChunkStream;
@@ -33,11 +34,12 @@ import org.apache.poi.poifs.filesystem.POIFSFileSystem;
   * Can opperate on the command line (outputs to stdout), or
   *  can return the text for you (eg for use with Lucene).
   */
-public class VisioTextExtractor {
+public class VisioTextExtractor extends POITextExtractor {
         private HDGFDiagram hdgf;
         private POIFSFileSystem fs;
  
         public VisioTextExtractor(HDGFDiagram hdgf) {
+               super(hdgf);
                 this.hdgf = hdgf;
         }
         public VisioTextExtractor(POIFSFileSystem fs) throws IOException {
@@ -84,6 +86,8 @@ public class VisioTextExtractor {
         
         /**
          * Returns the textual contents of the file.
+        * Each textual object's text will be separated
+        *  by a newline
          */
         public String getText() {
                 StringBuffer text = new StringBuffer();
diff --git a/src/scratchpad/src/org/apache/poi/hslf/extractor/PowerPointExtractor.java b/src/scratchpad/src/org/apache/poi/hslf/extractor/PowerPointExtractor.java

index e393b4620bbac191a3394ec359c8af3a293be9da..0fc6f5e847754c88b279097fcee974f06d32ccec 100644 (file)
--- a/src/scratchpad/src/org/apache/poi/hslf/extractor/PowerPointExtractor.java
+++ b/src/scratchpad/src/org/apache/poi/hslf/extractor/PowerPointExtractor.java
@@ -22,6 +22,8 @@ package org.apache.poi.hslf.extractor;
  
  import java.io.*;
  import java.util.HashSet;
+
+import org.apache.poi.POITextExtractor;
  import org.apache.poi.poifs.filesystem.POIFSFileSystem;
  import org.apache.poi.hslf.*;
  import org.apache.poi.hslf.model.*;
@@ -34,12 +36,12 @@ import org.apache.poi.hslf.usermodel.*;
   * @author Nick Burch
   */
  
-public class PowerPointExtractor
+public class PowerPointExtractor extends POITextExtractor
  {
-  private HSLFSlideShow _hslfshow;
-  private SlideShow _show;
-  private Slide[] _slides;
-  private Notes[] _notes;
+       private HSLFSlideShow _hslfshow;
+       private SlideShow _show;
+       private Slide[] _slides;
+       private Notes[] _notes;
  
    /**
     * Basic extractor. Returns all the text, and optionally all the notes
@@ -66,61 +68,50 @@ public class PowerPointExtractor
         ppe.close();
    }
  
-  /**
-   * Creates a PowerPointExtractor, from a file
-   * @param fileName The name of the file to extract from
-   */
-  public PowerPointExtractor(String fileName) throws IOException {
-       _hslfshow = new HSLFSlideShow(fileName);
-       _show = new SlideShow(_hslfshow);
-       _slides = _show.getSlides();
-       _notes = _show.getNotes();
-  }
-
-  /**
-   * Creates a PowerPointExtractor, from an Input Stream
-   * @param iStream The input stream containing the PowerPoint document
-   */
-  public PowerPointExtractor(InputStream iStream) throws IOException {
-       _hslfshow = new HSLFSlideShow(iStream);
-       _show = new SlideShow(_hslfshow);
-       _slides = _show.getSlides();
-       _notes = _show.getNotes();
-  }
-
-  /**
-   * Creates a PowerPointExtractor, from an open POIFSFileSystem
-   * @param fs the POIFSFileSystem containing the PowerPoint document
-   */
-  public PowerPointExtractor(POIFSFileSystem fs) throws IOException {
-       _hslfshow = new HSLFSlideShow(fs);
-       _show = new SlideShow(_hslfshow);
-       _slides = _show.getSlides();
-       _notes = _show.getNotes();
-  }
-
-  /**
-   * Creates a PowerPointExtractor, from a HSLFSlideShow
-   * @param ss the HSLFSlideShow to extract text from
-   */
-  public PowerPointExtractor(HSLFSlideShow ss) throws IOException {
-       _hslfshow = ss;
-       _show = new SlideShow(_hslfshow);
-       _slides = _show.getSlides();
-       _notes = _show.getNotes();
-  }
+       /**
+        * Creates a PowerPointExtractor, from a file
+        * @param fileName The name of the file to extract from
+        */
+       public PowerPointExtractor(String fileName) throws IOException {
+               this(new FileInputStream(fileName));
+       }
+       /**
+        * Creates a PowerPointExtractor, from an Input Stream
+        * @param iStream The input stream containing the PowerPoint document
+        */
+       public PowerPointExtractor(InputStream iStream) throws IOException {
+               this(new POIFSFileSystem(iStream));
+       }
+       /**
+        * Creates a PowerPointExtractor, from an open POIFSFileSystem
+        * @param fs the POIFSFileSystem containing the PowerPoint document
+        */
+       public PowerPointExtractor(POIFSFileSystem fs) throws IOException {
+               this(new HSLFSlideShow(fs));
+       }
  
+       /**
+        * Creates a PowerPointExtractor, from a HSLFSlideShow
+        * @param ss the HSLFSlideShow to extract text from
+        */
+       public PowerPointExtractor(HSLFSlideShow ss) throws IOException {
+               super(ss);
+               _hslfshow = ss;
+               _show = new SlideShow(_hslfshow);
+               _slides = _show.getSlides();
+               _notes = _show.getNotes();
+       }
  
-  /**
-   * Shuts down the underlying streams
-   */
-  public void close() throws IOException {
-       _hslfshow.close();
-       _hslfshow = null;
-       _show = null;
-       _slides = null;
-       _notes = null;
-  }
+       /**
+        * Shuts down the underlying streams
+        */
+       public void close() throws IOException {
+               _hslfshow.close();
+               _hslfshow = null;
+               _show = null;
+               _slides = null;
+               _notes = null;
+       }
  
  
    /**
@@ -195,4 +186,4 @@ public class PowerPointExtractor
  
         return ret.toString();
    }
-}
+}
+\ No newline at end of file
diff --git a/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java b/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java

index dac3a969ddf53bb2a00e77d4d6fb7d602f1d1b5a..6f15ee1f9ac25fdb457eef1756084c7015557b52 100644 (file)
--- a/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java
@@ -22,6 +22,7 @@ import java.io.FileInputStream;
  import java.io.UnsupportedEncodingException;
  import java.util.Iterator;
  
+import org.apache.poi.POITextExtractor;
  import org.apache.poi.hwpf.HWPFDocument;
  import org.apache.poi.hwpf.model.TextPiece;
  import org.apache.poi.hwpf.usermodel.Paragraph;
@@ -36,7 +37,7 @@ import org.apache.poi.poifs.filesystem.POIFSFileSystem;
   *
   * @author Nick Burch (nick at torchbox dot com)
   */
-public class WordExtractor {
+public class WordExtractor extends POITextExtractor {
         private POIFSFileSystem fs;
         private HWPFDocument doc;
         
@@ -62,6 +63,7 @@ public class WordExtractor {
          * @param doc The HWPFDocument to extract from
          */
         public WordExtractor(HWPFDocument doc) throws IOException {
+               super(doc);
                 this.doc = doc;
         }
  
diff --git a/src/testcases/org/apache/poi/hssf/extractor/TestExcelExtractor.java b/src/testcases/org/apache/poi/hssf/extractor/TestExcelExtractor.java

new file mode 100644 (file)

index 0000000..027495a
--- /dev/null
+++ b/src/testcases/org/apache/poi/hssf/extractor/TestExcelExtractor.java
@@ -0,0 +1,101 @@
+/* ====================================================================
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+==================================================================== */
+package org.apache.poi.hssf.extractor;
+
+import java.io.File;
+import java.io.FileInputStream;
+
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;
+
+import junit.framework.TestCase;
+
+public class TestExcelExtractor extends TestCase {
+       public void testSimple() throws Exception {
+               String path = System.getProperty("HSSF.testdata.path");
+               FileInputStream fin = new FileInputStream(path + File.separator + "Simple.xls");
+               
+               ExcelExtractor extractor = new ExcelExtractor(new POIFSFileSystem(fin));
+               
+               assertEquals("Sheet1\nreplaceMe\nSheet2\nSheet3\n", extractor.getText());
+               
+               // Now turn off sheet names
+               extractor.setIncludeSheetNames(false);
+               assertEquals("replaceMe\n", extractor.getText());
+       }
+       
+       public void testNumericFormula() throws Exception {
+               String path = System.getProperty("HSSF.testdata.path");
+               FileInputStream fin = new FileInputStream(path + File.separator + "sumifformula.xls");
+               
+               ExcelExtractor extractor = new ExcelExtractor(new POIFSFileSystem(fin));
+               
+               assertEquals(
+                               "Sheet1\n" +
+                               "1000.0\t1.0\t5.0\n" +
+                               "2000.0\t2.0\t\n" +     
+                               "3000.0\t3.0\t\n" +
+                               "4000.0\t4.0\t\n" + 
+                               "5000.0\t5.0\t\n" +
+                               "Sheet2\nSheet3\n", 
+                               extractor.getText()
+               );
+               
+               extractor.setFormulasNotResults(true);
+               
+               assertEquals(
+                               "Sheet1\n" +
+                               "1000.0\t1.0\tSUMIF(A1:A5,\">4000\",B1:B5)\n" +
+                               "2000.0\t2.0\t\n" +     
+                               "3000.0\t3.0\t\n" +
+                               "4000.0\t4.0\t\n" + 
+                               "5000.0\t5.0\t\n" +
+                               "Sheet2\nSheet3\n", 
+                               extractor.getText()
+               );
+       }
+       
+       
+       public void testStringConcat() throws Exception {
+               String path = System.getProperty("HSSF.testdata.path");
+               FileInputStream fin = new FileInputStream(path + File.separator + "SimpleWithFormula.xls");
+               
+               ExcelExtractor extractor = new ExcelExtractor(new POIFSFileSystem(fin));
+               
+               // Comes out as NaN if treated as a number
+               // And as replacemereplaceme if treated as a string
+               assertEquals("Sheet1\nreplaceme\nreplaceme\nreplacemereplaceme\nSheet2\nSheet3\n", extractor.getText());
+               
+               extractor.setFormulasNotResults(true);
+               
+               assertEquals("Sheet1\nreplaceme\nreplaceme\nCONCATENATE(A1,A2)\nSheet2\nSheet3\n", extractor.getText());
+       }
+       
+       public void testStringFormula() throws Exception {
+               String path = System.getProperty("HSSF.testdata.path");
+               FileInputStream fin = new FileInputStream(path + File.separator + "StringFormulas.xls");
+               
+               ExcelExtractor extractor = new ExcelExtractor(new POIFSFileSystem(fin));
+               
+               // Comes out as NaN if treated as a number
+               // And as XYZ if treated as a string
+               assertEquals("Sheet1\nXYZ\nSheet2\nSheet3\n", extractor.getText());
+               
+               extractor.setFormulasNotResults(true);
+               
+               assertEquals("Sheet1\nUPPER(\"xyz\")\nSheet2\nSheet3\n", extractor.getText());
+       }
+}
author	Nick Burch <nick@apache.org>
	Sat, 27 Oct 2007 21:57:10 +0000 (21:57 +0000)
committer	Nick Burch <nick@apache.org>
	Sat, 27 Oct 2007 21:57:10 +0000 (21:57 +0000)
src/java/org/apache/poi/POITextExtractor.java	[new file with mode: 0644]	patch \| blob
src/java/org/apache/poi/hssf/extractor/ExcelExtractor.java	[new file with mode: 0644]	patch \| blob
src/scratchpad/src/org/apache/poi/hdgf/extractor/VisioTextExtractor.java		patch \| blob \| history
src/scratchpad/src/org/apache/poi/hslf/extractor/PowerPointExtractor.java		patch \| blob \| history
src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java		patch \| blob \| history
src/testcases/org/apache/poi/hssf/extractor/TestExcelExtractor.java	[new file with mode: 0644]	patch \| blob