From: Nick Burch Date: Sat, 27 Oct 2007 21:57:10 +0000 (+0000) Subject: Implement an Excel text extractor, and put all the existing text extractors under... X-Git-Tag: REL_3_0_2_BETA1~21 X-Git-Url: https://source.dussan.org/?a=commitdiff_plain;h=6a72c5656aa8a5e70ead0c4a00d41c9c81b171cc;p=poi.git Implement an Excel text extractor, and put all the existing text extractors under a common superclass, so they're easier to find and use git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@589224 13f79535-47bb-0310-9956-ffa450edef68 --- diff --git a/src/java/org/apache/poi/POITextExtractor.java b/src/java/org/apache/poi/POITextExtractor.java new file mode 100644 index 0000000000..3ba71880eb --- /dev/null +++ b/src/java/org/apache/poi/POITextExtractor.java @@ -0,0 +1,49 @@ +/* ==================================================================== + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +==================================================================== */ +package org.apache.poi; + +/** + * Common Parent for Text Extractors + * of POI Documents. + * You will typically find the implementation of + * a given format's text extractor under + * org.apache.poi.[format].extractor . 
+ * @see org.apache.poi.hssf.extractor.ExcelExtractor + * @see org.apache.poi.hslf.extractor.PowerPointExtractor + * @see org.apache.poi.hdgf.extractor.VisioTextExtractor + * @see org.apache.poi.hwpf.extractor.WordExtractor + */ +public abstract class POITextExtractor { + /** The POIDocument that's open */ + protected POIDocument document; + + /** + * Creates a new text extractor for the given document + */ + public POITextExtractor(POIDocument document) { + this.document = document; + } + + /** + * Retrieves all the text from the document. + * How cells, paragraphs etc are separated in the text + * is implementation specific - see the javadocs for + * a specific project for details. + * @return All the text from the document + */ + public abstract String getText(); +} diff --git a/src/java/org/apache/poi/hssf/extractor/ExcelExtractor.java b/src/java/org/apache/poi/hssf/extractor/ExcelExtractor.java new file mode 100644 index 0000000000..f45f54dff1 --- /dev/null +++ b/src/java/org/apache/poi/hssf/extractor/ExcelExtractor.java @@ -0,0 +1,144 @@ +/* ==================================================================== + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+==================================================================== */ +package org.apache.poi.hssf.extractor; + +import java.io.IOException; + +import org.apache.poi.POITextExtractor; +import org.apache.poi.hssf.usermodel.HSSFCell; +import org.apache.poi.hssf.usermodel.HSSFRichTextString; +import org.apache.poi.hssf.usermodel.HSSFRow; +import org.apache.poi.hssf.usermodel.HSSFSheet; +import org.apache.poi.hssf.usermodel.HSSFWorkbook; +import org.apache.poi.poifs.filesystem.POIFSFileSystem; + +/** + * A text extractor for Excel files. + * Returns the textual content of the file, suitable for + * indexing by something like Lucene, but not really + * intended for display to the user. + * To turn an Excel file into a CSV or similar, see + * the XLS2CSVmra example. + * @see org.apache.poi.hssf.eventusermodel.examples.XLS2CSVmra + */ +public class ExcelExtractor extends POITextExtractor{ + private HSSFWorkbook wb; + private boolean includeSheetNames = true; + private boolean formulasNotResults = false; + + public ExcelExtractor(HSSFWorkbook wb) { + super(wb); + this.wb = wb; + } + public ExcelExtractor(POIFSFileSystem fs) throws IOException { + this(new HSSFWorkbook(fs)); + } + + + /** + * Should sheet names be included? Default is true + */ + public void setIncludeSheetNames(boolean includeSheetNames) { + this.includeSheetNames = includeSheetNames; + } + /** + * Should we return the formula itself, and not + * the result it produces? 
Default is false + */ + public void setFormulasNotResults(boolean formulasNotResults) { + this.formulasNotResults = formulasNotResults; + } + + /** + * Retrieves the text contents of the file + */ + public String getText() { + StringBuffer text = new StringBuffer(); + + for(int i=0;i 0) { + text.append(str.toString()); + } else { + // Try and treat it as a number + double val = cell.getNumericCellValue(); + text.append(val); + } + } + outputContents = true; + break; + } + + // Output a tab if we're not on the last cell + if(outputContents && k < (lastCell-1)) { + text.append("\t"); + } + } + + // Finish off the row + text.append("\n"); + } + } + + return text.toString(); + } +} diff --git a/src/scratchpad/src/org/apache/poi/hdgf/extractor/VisioTextExtractor.java b/src/scratchpad/src/org/apache/poi/hdgf/extractor/VisioTextExtractor.java index b2c4ee37f9..a7857e46f2 100644 --- a/src/scratchpad/src/org/apache/poi/hdgf/extractor/VisioTextExtractor.java +++ b/src/scratchpad/src/org/apache/poi/hdgf/extractor/VisioTextExtractor.java @@ -21,6 +21,7 @@ import java.io.IOException; import java.io.InputStream; import java.util.ArrayList; +import org.apache.poi.POITextExtractor; import org.apache.poi.hdgf.HDGFDiagram; import org.apache.poi.hdgf.chunks.Chunk.Command; import org.apache.poi.hdgf.streams.ChunkStream; @@ -33,11 +34,12 @@ import org.apache.poi.poifs.filesystem.POIFSFileSystem; * Can opperate on the command line (outputs to stdout), or * can return the text for you (eg for use with Lucene). */ -public class VisioTextExtractor { +public class VisioTextExtractor extends POITextExtractor { private HDGFDiagram hdgf; private POIFSFileSystem fs; public VisioTextExtractor(HDGFDiagram hdgf) { + super(hdgf); this.hdgf = hdgf; } public VisioTextExtractor(POIFSFileSystem fs) throws IOException { @@ -84,6 +86,8 @@ public class VisioTextExtractor { /** * Returns the textual contents of the file. 
+ * Each textual object's text will be separated + * by a newline */ public String getText() { StringBuffer text = new StringBuffer(); diff --git a/src/scratchpad/src/org/apache/poi/hslf/extractor/PowerPointExtractor.java b/src/scratchpad/src/org/apache/poi/hslf/extractor/PowerPointExtractor.java index e393b4620b..0fc6f5e847 100644 --- a/src/scratchpad/src/org/apache/poi/hslf/extractor/PowerPointExtractor.java +++ b/src/scratchpad/src/org/apache/poi/hslf/extractor/PowerPointExtractor.java @@ -22,6 +22,8 @@ package org.apache.poi.hslf.extractor; import java.io.*; import java.util.HashSet; + +import org.apache.poi.POITextExtractor; import org.apache.poi.poifs.filesystem.POIFSFileSystem; import org.apache.poi.hslf.*; import org.apache.poi.hslf.model.*; @@ -34,12 +36,12 @@ import org.apache.poi.hslf.usermodel.*; * @author Nick Burch */ -public class PowerPointExtractor +public class PowerPointExtractor extends POITextExtractor { - private HSLFSlideShow _hslfshow; - private SlideShow _show; - private Slide[] _slides; - private Notes[] _notes; + private HSLFSlideShow _hslfshow; + private SlideShow _show; + private Slide[] _slides; + private Notes[] _notes; /** * Basic extractor. 
Returns all the text, and optionally all the notes @@ -66,61 +68,50 @@ public class PowerPointExtractor ppe.close(); } - /** - * Creates a PowerPointExtractor, from a file - * @param fileName The name of the file to extract from - */ - public PowerPointExtractor(String fileName) throws IOException { - _hslfshow = new HSLFSlideShow(fileName); - _show = new SlideShow(_hslfshow); - _slides = _show.getSlides(); - _notes = _show.getNotes(); - } - - /** - * Creates a PowerPointExtractor, from an Input Stream - * @param iStream The input stream containing the PowerPoint document - */ - public PowerPointExtractor(InputStream iStream) throws IOException { - _hslfshow = new HSLFSlideShow(iStream); - _show = new SlideShow(_hslfshow); - _slides = _show.getSlides(); - _notes = _show.getNotes(); - } - - /** - * Creates a PowerPointExtractor, from an open POIFSFileSystem - * @param fs the POIFSFileSystem containing the PowerPoint document - */ - public PowerPointExtractor(POIFSFileSystem fs) throws IOException { - _hslfshow = new HSLFSlideShow(fs); - _show = new SlideShow(_hslfshow); - _slides = _show.getSlides(); - _notes = _show.getNotes(); - } - - /** - * Creates a PowerPointExtractor, from a HSLFSlideShow - * @param ss the HSLFSlideShow to extract text from - */ - public PowerPointExtractor(HSLFSlideShow ss) throws IOException { - _hslfshow = ss; - _show = new SlideShow(_hslfshow); - _slides = _show.getSlides(); - _notes = _show.getNotes(); - } + /** + * Creates a PowerPointExtractor, from a file + * @param fileName The name of the file to extract from + */ + public PowerPointExtractor(String fileName) throws IOException { + this(new FileInputStream(fileName)); + } + /** + * Creates a PowerPointExtractor, from an Input Stream + * @param iStream The input stream containing the PowerPoint document + */ + public PowerPointExtractor(InputStream iStream) throws IOException { + this(new POIFSFileSystem(iStream)); + } + /** + * Creates a PowerPointExtractor, from an open 
POIFSFileSystem + * @param fs the POIFSFileSystem containing the PowerPoint document + */ + public PowerPointExtractor(POIFSFileSystem fs) throws IOException { + this(new HSLFSlideShow(fs)); + } + /** + * Creates a PowerPointExtractor, from a HSLFSlideShow + * @param ss the HSLFSlideShow to extract text from + */ + public PowerPointExtractor(HSLFSlideShow ss) throws IOException { + super(ss); + _hslfshow = ss; + _show = new SlideShow(_hslfshow); + _slides = _show.getSlides(); + _notes = _show.getNotes(); + } - /** - * Shuts down the underlying streams - */ - public void close() throws IOException { - _hslfshow.close(); - _hslfshow = null; - _show = null; - _slides = null; - _notes = null; - } + /** + * Shuts down the underlying streams + */ + public void close() throws IOException { + _hslfshow.close(); + _hslfshow = null; + _show = null; + _slides = null; + _notes = null; + } /** @@ -195,4 +186,4 @@ public class PowerPointExtractor return ret.toString(); } -} +} \ No newline at end of file diff --git a/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java b/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java index dac3a969dd..6f15ee1f9a 100644 --- a/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java +++ b/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java @@ -22,6 +22,7 @@ import java.io.FileInputStream; import java.io.UnsupportedEncodingException; import java.util.Iterator; +import org.apache.poi.POITextExtractor; import org.apache.poi.hwpf.HWPFDocument; import org.apache.poi.hwpf.model.TextPiece; import org.apache.poi.hwpf.usermodel.Paragraph; @@ -36,7 +37,7 @@ import org.apache.poi.poifs.filesystem.POIFSFileSystem; * * @author Nick Burch (nick at torchbox dot com) */ -public class WordExtractor { +public class WordExtractor extends POITextExtractor { private POIFSFileSystem fs; private HWPFDocument doc; @@ -62,6 +63,7 @@ public class WordExtractor { * @param doc The HWPFDocument to extract from */ 
public WordExtractor(HWPFDocument doc) throws IOException { + super(doc); this.doc = doc; } diff --git a/src/testcases/org/apache/poi/hssf/extractor/TestExcelExtractor.java b/src/testcases/org/apache/poi/hssf/extractor/TestExcelExtractor.java new file mode 100644 index 0000000000..027495a1b0 --- /dev/null +++ b/src/testcases/org/apache/poi/hssf/extractor/TestExcelExtractor.java @@ -0,0 +1,101 @@ +/* ==================================================================== + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+==================================================================== */ +package org.apache.poi.hssf.extractor; + +import java.io.File; +import java.io.FileInputStream; + +import org.apache.poi.poifs.filesystem.POIFSFileSystem; + +import junit.framework.TestCase; + +public class TestExcelExtractor extends TestCase { + public void testSimple() throws Exception { + String path = System.getProperty("HSSF.testdata.path"); + FileInputStream fin = new FileInputStream(path + File.separator + "Simple.xls"); + + ExcelExtractor extractor = new ExcelExtractor(new POIFSFileSystem(fin)); + + assertEquals("Sheet1\nreplaceMe\nSheet2\nSheet3\n", extractor.getText()); + + // Now turn off sheet names + extractor.setIncludeSheetNames(false); + assertEquals("replaceMe\n", extractor.getText()); + } + + public void testNumericFormula() throws Exception { + String path = System.getProperty("HSSF.testdata.path"); + FileInputStream fin = new FileInputStream(path + File.separator + "sumifformula.xls"); + + ExcelExtractor extractor = new ExcelExtractor(new POIFSFileSystem(fin)); + + assertEquals( + "Sheet1\n" + + "1000.0\t1.0\t5.0\n" + + "2000.0\t2.0\t\n" + + "3000.0\t3.0\t\n" + + "4000.0\t4.0\t\n" + + "5000.0\t5.0\t\n" + + "Sheet2\nSheet3\n", + extractor.getText() + ); + + extractor.setFormulasNotResults(true); + + assertEquals( + "Sheet1\n" + + "1000.0\t1.0\tSUMIF(A1:A5,\">4000\",B1:B5)\n" + + "2000.0\t2.0\t\n" + + "3000.0\t3.0\t\n" + + "4000.0\t4.0\t\n" + + "5000.0\t5.0\t\n" + + "Sheet2\nSheet3\n", + extractor.getText() + ); + } + + + public void testStringConcat() throws Exception { + String path = System.getProperty("HSSF.testdata.path"); + FileInputStream fin = new FileInputStream(path + File.separator + "SimpleWithFormula.xls"); + + ExcelExtractor extractor = new ExcelExtractor(new POIFSFileSystem(fin)); + + // Comes out as NaN if treated as a number + // And as replacemereplaceme if treated as a string + assertEquals("Sheet1\nreplaceme\nreplaceme\nreplacemereplaceme\nSheet2\nSheet3\n", 
extractor.getText()); + + extractor.setFormulasNotResults(true); + + assertEquals("Sheet1\nreplaceme\nreplaceme\nCONCATENATE(A1,A2)\nSheet2\nSheet3\n", extractor.getText()); + } + + public void testStringFormula() throws Exception { + String path = System.getProperty("HSSF.testdata.path"); + FileInputStream fin = new FileInputStream(path + File.separator + "StringFormulas.xls"); + + ExcelExtractor extractor = new ExcelExtractor(new POIFSFileSystem(fin)); + + // Comes out as NaN if treated as a number + // And as XYZ if treated as a string + assertEquals("Sheet1\nXYZ\nSheet2\nSheet3\n", extractor.getText()); + + extractor.setFormulasNotResults(true); + + assertEquals("Sheet1\nUPPER(\"xyz\")\nSheet2\nSheet3\n", extractor.getText()); + } +}