From: Nick Burch Date: Wed, 9 Apr 2008 09:20:20 +0000 (+0000) Subject: Shuffle the common text extractor stuff from scratchpad to ooxml, to match the compil... X-Git-Tag: REL_3_5_BETA2~109 X-Git-Url: https://source.dussan.org/?a=commitdiff_plain;h=0650fb522f1088057ee5a8ebdf3d437269bf855f;p=poi.git Shuffle the common text extractor stuff from scratchpad to ooxml, to match the compile paths git-svn-id: https://svn.apache.org/repos/asf/poi/branches/ooxml@646239 13f79535-47bb-0310-9956-ffa450edef68 --- diff --git a/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java b/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java new file mode 100644 index 0000000000..318b68d8f0 --- /dev/null +++ b/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java @@ -0,0 +1,128 @@ +/* ==================================================================== + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +==================================================================== */ +package org.apache.poi.extractor; + +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.PushbackInputStream; +import java.util.Iterator; + +import org.openxml4j.exceptions.InvalidFormatException; +import org.openxml4j.exceptions.OpenXML4JException; +import org.openxml4j.opc.Package; +import org.openxml4j.opc.PackagePart; +import org.openxml4j.opc.PackageRelationshipCollection; + +import org.apache.poi.POITextExtractor; +import org.apache.poi.POIXMLDocument; +import org.apache.poi.POIXMLTextExtractor; +import org.apache.poi.hdgf.extractor.VisioTextExtractor; +import org.apache.poi.hslf.extractor.PowerPointExtractor; +import org.apache.poi.hssf.extractor.ExcelExtractor; +import org.apache.poi.hwpf.extractor.WordExtractor; +import org.apache.poi.poifs.filesystem.Entry; +import org.apache.poi.poifs.filesystem.POIFSFileSystem; +import org.apache.poi.xslf.XSLFSlideShow; +import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor; +import org.apache.poi.xssf.extractor.XSSFExcelExtractor; +import org.apache.poi.xssf.usermodel.XSSFWorkbook; +import org.apache.poi.xwpf.XWPFDocument; +import org.apache.poi.xwpf.extractor.XWPFWordExtractor; +import org.apache.xmlbeans.XmlException; + +/** + * Figures out the correct POITextExtractor for your supplied + * document, and returns it. + */ +public class ExtractorFactory { + public static final String CORE_DOCUMENT_REL = + "http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument"; + + public static POITextExtractor createExtractor(File f) throws IOException, InvalidFormatException, OpenXML4JException, XmlException { + InputStream inp = new PushbackInputStream( + new FileInputStream(f), 8); + + if(POIFSFileSystem.hasPOIFSHeader(inp)) { + return createExtractor(new POIFSFileSystem(inp)); + } + if(POIXMLDocument.hasOOXMLHeader(inp)) { + inp.close(); + return createExtractor(Package.open(f.toString())); + } + throw new IllegalArgumentException("Your File was neither an OLE2 file, nor an OOXML file"); + } + + public static POITextExtractor createExtractor(InputStream inp) throws IOException, InvalidFormatException, OpenXML4JException, XmlException { + // Figure out the kind of stream + // If clearly doesn't do mark/reset, wrap up + if(! inp.markSupported()) { + inp = new PushbackInputStream(inp, 8); + } + + if(POIFSFileSystem.hasPOIFSHeader(inp)) { + return createExtractor(new POIFSFileSystem(inp)); + } + if(POIXMLDocument.hasOOXMLHeader(inp)) { + return createExtractor(Package.open(inp)); + } + throw new IllegalArgumentException("Your InputStream was neither an OLE2 stream, nor an OOXML stream"); + } + + public static POIXMLTextExtractor createExtractor(Package pkg) throws IOException, OpenXML4JException, XmlException { + PackageRelationshipCollection core = + pkg.getRelationshipsByType(CORE_DOCUMENT_REL); + if(core.size() != 1) { + throw new IllegalArgumentException("Invalid OOXML Package received - expected 1 core document, found " + core.size()); + } + + PackagePart corePart = pkg.getPart(core.getRelationship(0)); + if(corePart.getContentType().equals(XSSFWorkbook.WORKBOOK.getContentType())) { + return new XSSFExcelExtractor(pkg); + } + if(corePart.getContentType().equals(XWPFDocument.MAIN_CONTENT_TYPE)) { + return new XWPFWordExtractor(pkg); + } + if(corePart.getContentType().equals(XSLFSlideShow.MAIN_CONTENT_TYPE)) { + return new XSLFPowerPointExtractor(pkg); + } + throw new IllegalArgumentException("No supported documents found in the OOXML package"); + } + + public static POITextExtractor createExtractor(POIFSFileSystem fs) throws IOException { + // Look for certain entries in the stream, to figure it + // out from + for(Iterator entries = fs.getRoot().getEntries(); entries.hasNext(); ) { + Entry entry = (Entry)entries.next(); + + if(entry.getName().equals("Workbook")) { + return new ExcelExtractor(fs); + } + if(entry.getName().equals("WordDocument")) { + return new WordExtractor(fs); + } + if(entry.getName().equals("PowerPoint Document")) { + return new PowerPointExtractor(fs); + } + if(entry.getName().equals("VisioDocument")) { + return new VisioTextExtractor(fs); + } + } + throw new IllegalArgumentException("No supported documents found in the OLE2 stream"); + } +} diff --git a/src/ooxml/testcases/org/apache/poi/extractor/TestExtractorFactory.java b/src/ooxml/testcases/org/apache/poi/extractor/TestExtractorFactory.java new file mode 100644 index 0000000000..762eb92ec7 --- /dev/null +++ b/src/ooxml/testcases/org/apache/poi/extractor/TestExtractorFactory.java @@ -0,0 +1,303 @@ +/* ==================================================================== + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +==================================================================== */ +package org.apache.poi.extractor; + +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; + +import org.apache.poi.hdgf.extractor.VisioTextExtractor; +import org.apache.poi.hslf.extractor.PowerPointExtractor; +import org.apache.poi.hssf.extractor.ExcelExtractor; +import org.apache.poi.hwpf.extractor.WordExtractor; +import org.apache.poi.poifs.filesystem.POIFSFileSystem; +import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor; +import org.apache.poi.xssf.extractor.XSSFExcelExtractor; +import org.apache.poi.xwpf.extractor.XWPFWordExtractor; + +import junit.framework.TestCase; + +import org.openxml4j.exceptions.InvalidOperationException; +import org.openxml4j.opc.Package; + +/** + * Test that the extractor factory plays nicely + */ +public class TestExtractorFactory extends TestCase { + private String excel_dir; + private String word_dir; + private String powerpoint_dir; + private String visio_dir; + + private File txt; + + private File xls; + private File xlsx; + + private File doc; + private File docx; + + private File ppt; + private File pptx; + + private File vsd; + + protected void setUp() throws Exception { + super.setUp(); + + excel_dir = System.getProperty("HSSF.testdata.path"); + word_dir = System.getProperty("HWPF.testdata.path"); + powerpoint_dir = System.getProperty("HSLF.testdata.path"); + visio_dir = System.getProperty("HDGF.testdata.path"); + + txt = new File(powerpoint_dir, "SampleShow.txt"); + + xls = new File(excel_dir, "SampleSS.xls"); + xlsx = new File(excel_dir, "SampleSS.xlsx"); + + doc = new File(word_dir, "SampleDoc.doc"); + docx = new File(word_dir, "SampleDoc.docx"); + + ppt = new File(powerpoint_dir, "SampleShow.ppt"); + pptx = new File(powerpoint_dir, "SampleShow.pptx"); + + vsd = new File(visio_dir, "Test_Visio-Some_Random_Text.vsd"); + } + + public void testFile() throws Exception { + // Excel + assertTrue( + ExtractorFactory.createExtractor(xls) + instanceof ExcelExtractor + ); + assertTrue( + ExtractorFactory.createExtractor(xls).getText().length() > 200 + ); + + assertTrue( + ExtractorFactory.createExtractor(xlsx) + instanceof XSSFExcelExtractor + ); + assertTrue( + ExtractorFactory.createExtractor(xlsx).getText().length() > 200 + ); + + // Word + assertTrue( + ExtractorFactory.createExtractor(doc) + instanceof WordExtractor + ); + assertTrue( + ExtractorFactory.createExtractor(doc).getText().length() > 120 + ); + + assertTrue( + ExtractorFactory.createExtractor(docx) + instanceof XWPFWordExtractor + ); + assertTrue( + ExtractorFactory.createExtractor(docx).getText().length() > 120 + ); + + // PowerPoint + assertTrue( + ExtractorFactory.createExtractor(ppt) + instanceof PowerPointExtractor + ); + assertTrue( + ExtractorFactory.createExtractor(ppt).getText().length() > 120 + ); + + assertTrue( + ExtractorFactory.createExtractor(pptx) + instanceof XSLFPowerPointExtractor + ); + assertTrue( + ExtractorFactory.createExtractor(pptx).getText().length() > 120 + ); + + // Visio + assertTrue( + ExtractorFactory.createExtractor(vsd) + instanceof VisioTextExtractor + ); + assertTrue( + ExtractorFactory.createExtractor(vsd).getText().length() > 50 + ); + + // Text + try { + ExtractorFactory.createExtractor(txt); + fail(); + } catch(IllegalArgumentException e) { + // Good + } + } + + public void testInputStream() throws Exception { + // Excel + assertTrue( + ExtractorFactory.createExtractor(new FileInputStream(xls)) + instanceof ExcelExtractor + ); + assertTrue( + ExtractorFactory.createExtractor(new FileInputStream(xls)).getText().length() > 200 + ); + + assertTrue( + ExtractorFactory.createExtractor(new FileInputStream(xlsx)) + instanceof XSSFExcelExtractor + ); + assertTrue( + ExtractorFactory.createExtractor(new FileInputStream(xlsx)).getText().length() > 200 + ); + + // Word + assertTrue( + ExtractorFactory.createExtractor(new FileInputStream(doc)) + instanceof WordExtractor + ); + assertTrue( + ExtractorFactory.createExtractor(new FileInputStream(doc)).getText().length() > 120 + ); + + assertTrue( + ExtractorFactory.createExtractor(new FileInputStream(docx)) + instanceof XWPFWordExtractor + ); + assertTrue( + ExtractorFactory.createExtractor(new FileInputStream(docx)).getText().length() > 120 + ); + + // PowerPoint + assertTrue( + ExtractorFactory.createExtractor(new FileInputStream(ppt)) + instanceof PowerPointExtractor + ); + assertTrue( + ExtractorFactory.createExtractor(new FileInputStream(ppt)).getText().length() > 120 + ); + + assertTrue( + ExtractorFactory.createExtractor(new FileInputStream(pptx)) + instanceof XSLFPowerPointExtractor + ); + assertTrue( + ExtractorFactory.createExtractor(new FileInputStream(pptx)).getText().length() > 120 + ); + + // Visio + assertTrue( + ExtractorFactory.createExtractor(new FileInputStream(vsd)) + instanceof VisioTextExtractor + ); + assertTrue( + ExtractorFactory.createExtractor(new FileInputStream(vsd)).getText().length() > 50 + ); + + // Text + try { + ExtractorFactory.createExtractor(new FileInputStream(txt)); + fail(); + } catch(IllegalArgumentException e) { + // Good + } + } + + public void testPOIFS() throws Exception { + // Excel + assertTrue( + ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls))) + instanceof ExcelExtractor + ); + assertTrue( + ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls))).getText().length() > 200 + ); + + // Word + assertTrue( + ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(doc))) + instanceof WordExtractor + ); + assertTrue( + ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(doc))).getText().length() > 120 + ); + + // PowerPoint + assertTrue( + ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(ppt))) + instanceof PowerPointExtractor + ); + assertTrue( + ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(ppt))).getText().length() > 120 + ); + + // Visio + assertTrue( + ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(vsd))) + instanceof VisioTextExtractor + ); + assertTrue( + ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(vsd))).getText().length() > 50 + ); + + // Text + try { + ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(txt))); + fail(); + } catch(IOException e) { + // Good + } + } + + public void testPackage() throws Exception { + // Excel + assertTrue( + ExtractorFactory.createExtractor(Package.open(xlsx.toString())) + instanceof XSSFExcelExtractor + ); + assertTrue( + ExtractorFactory.createExtractor(Package.open(xlsx.toString())).getText().length() > 200 + ); + + // Word + assertTrue( + ExtractorFactory.createExtractor(Package.open(docx.toString())) + instanceof XWPFWordExtractor + ); + assertTrue( + ExtractorFactory.createExtractor(Package.open(docx.toString())).getText().length() > 120 + ); + + // PowerPoint + assertTrue( + ExtractorFactory.createExtractor(Package.open(pptx.toString())) + instanceof XSLFPowerPointExtractor + ); + assertTrue( + ExtractorFactory.createExtractor(Package.open(pptx.toString())).getText().length() > 120 + ); + + // Text + try { + ExtractorFactory.createExtractor(Package.open(txt.toString())); + fail(); + } catch(InvalidOperationException e) { + // Good + } + } +} diff --git a/src/scratchpad/src/org/apache/poi/extractor/ExtractorFactory.java b/src/scratchpad/src/org/apache/poi/extractor/ExtractorFactory.java deleted file mode 100644 index 318b68d8f0..0000000000 --- a/src/scratchpad/src/org/apache/poi/extractor/ExtractorFactory.java +++ /dev/null @@ -1,128 +0,0 @@ -/* ==================================================================== - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -==================================================================== */ -package org.apache.poi.extractor; - -import java.io.File; -import java.io.FileInputStream; -import java.io.IOException; -import java.io.InputStream; -import java.io.PushbackInputStream; -import java.util.Iterator; - -import org.openxml4j.exceptions.InvalidFormatException; -import org.openxml4j.exceptions.OpenXML4JException; -import org.openxml4j.opc.Package; -import org.openxml4j.opc.PackagePart; -import org.openxml4j.opc.PackageRelationshipCollection; - -import org.apache.poi.POITextExtractor; -import org.apache.poi.POIXMLDocument; -import org.apache.poi.POIXMLTextExtractor; -import org.apache.poi.hdgf.extractor.VisioTextExtractor; -import org.apache.poi.hslf.extractor.PowerPointExtractor; -import org.apache.poi.hssf.extractor.ExcelExtractor; -import org.apache.poi.hwpf.extractor.WordExtractor; -import org.apache.poi.poifs.filesystem.Entry; -import org.apache.poi.poifs.filesystem.POIFSFileSystem; -import org.apache.poi.xslf.XSLFSlideShow; -import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor; -import org.apache.poi.xssf.extractor.XSSFExcelExtractor; -import org.apache.poi.xssf.usermodel.XSSFWorkbook; -import org.apache.poi.xwpf.XWPFDocument; -import org.apache.poi.xwpf.extractor.XWPFWordExtractor; -import org.apache.xmlbeans.XmlException; - -/** - * Figures out the correct POITextExtractor for your supplied - * document, and returns it. - */ -public class ExtractorFactory { - public static final String CORE_DOCUMENT_REL = - "http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument"; - - public static POITextExtractor createExtractor(File f) throws IOException, InvalidFormatException, OpenXML4JException, XmlException { - InputStream inp = new PushbackInputStream( - new FileInputStream(f), 8); - - if(POIFSFileSystem.hasPOIFSHeader(inp)) { - return createExtractor(new POIFSFileSystem(inp)); - } - if(POIXMLDocument.hasOOXMLHeader(inp)) { - inp.close(); - return createExtractor(Package.open(f.toString())); - } - throw new IllegalArgumentException("Your File was neither an OLE2 file, nor an OOXML file"); - } - - public static POITextExtractor createExtractor(InputStream inp) throws IOException, InvalidFormatException, OpenXML4JException, XmlException { - // Figure out the kind of stream - // If clearly doesn't do mark/reset, wrap up - if(! inp.markSupported()) { - inp = new PushbackInputStream(inp, 8); - } - - if(POIFSFileSystem.hasPOIFSHeader(inp)) { - return createExtractor(new POIFSFileSystem(inp)); - } - if(POIXMLDocument.hasOOXMLHeader(inp)) { - return createExtractor(Package.open(inp)); - } - throw new IllegalArgumentException("Your InputStream was neither an OLE2 stream, nor an OOXML stream"); - } - - public static POIXMLTextExtractor createExtractor(Package pkg) throws IOException, OpenXML4JException, XmlException { - PackageRelationshipCollection core = - pkg.getRelationshipsByType(CORE_DOCUMENT_REL); - if(core.size() != 1) { - throw new IllegalArgumentException("Invalid OOXML Package received - expected 1 core document, found " + core.size()); - } - - PackagePart corePart = pkg.getPart(core.getRelationship(0)); - if(corePart.getContentType().equals(XSSFWorkbook.WORKBOOK.getContentType())) { - return new XSSFExcelExtractor(pkg); - } - if(corePart.getContentType().equals(XWPFDocument.MAIN_CONTENT_TYPE)) { - return new XWPFWordExtractor(pkg); - } - if(corePart.getContentType().equals(XSLFSlideShow.MAIN_CONTENT_TYPE)) { - return new XSLFPowerPointExtractor(pkg); - } - throw new IllegalArgumentException("No supported documents found in the OOXML package"); - } - - public static POITextExtractor createExtractor(POIFSFileSystem fs) throws IOException { - // Look for certain entries in the stream, to figure it - // out from - for(Iterator entries = fs.getRoot().getEntries(); entries.hasNext(); ) { - Entry entry = (Entry)entries.next(); - - if(entry.getName().equals("Workbook")) { - return new ExcelExtractor(fs); - } - if(entry.getName().equals("WordDocument")) { - return new WordExtractor(fs); - } - if(entry.getName().equals("PowerPoint Document")) { - return new PowerPointExtractor(fs); - } - if(entry.getName().equals("VisioDocument")) { - return new VisioTextExtractor(fs); - } - } - throw new IllegalArgumentException("No supported documents found in the OLE2 stream"); - } -} diff --git a/src/scratchpad/testcases/org/apache/poi/extractor/TestExtractorFactory.java b/src/scratchpad/testcases/org/apache/poi/extractor/TestExtractorFactory.java deleted file mode 100644 index 762eb92ec7..0000000000 --- a/src/scratchpad/testcases/org/apache/poi/extractor/TestExtractorFactory.java +++ /dev/null @@ -1,303 +0,0 @@ -/* ==================================================================== - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -==================================================================== */ -package org.apache.poi.extractor; - -import java.io.File; -import java.io.FileInputStream; -import java.io.IOException; - -import org.apache.poi.hdgf.extractor.VisioTextExtractor; -import org.apache.poi.hslf.extractor.PowerPointExtractor; -import org.apache.poi.hssf.extractor.ExcelExtractor; -import org.apache.poi.hwpf.extractor.WordExtractor; -import org.apache.poi.poifs.filesystem.POIFSFileSystem; -import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor; -import org.apache.poi.xssf.extractor.XSSFExcelExtractor; -import org.apache.poi.xwpf.extractor.XWPFWordExtractor; - -import junit.framework.TestCase; - -import org.openxml4j.exceptions.InvalidOperationException; -import org.openxml4j.opc.Package; - -/** - * Test that the extractor factory plays nicely - */ -public class TestExtractorFactory extends TestCase { - private String excel_dir; - private String word_dir; - private String powerpoint_dir; - private String visio_dir; - - private File txt; - - private File xls; - private File xlsx; - - private File doc; - private File docx; - - private File ppt; - private File pptx; - - private File vsd; - - protected void setUp() throws Exception { - super.setUp(); - - excel_dir = System.getProperty("HSSF.testdata.path"); - word_dir = System.getProperty("HWPF.testdata.path"); - powerpoint_dir = System.getProperty("HSLF.testdata.path"); - visio_dir = System.getProperty("HDGF.testdata.path"); - - txt = new File(powerpoint_dir, "SampleShow.txt"); - - xls = new File(excel_dir, "SampleSS.xls"); - xlsx = new File(excel_dir, "SampleSS.xlsx"); - - doc = new File(word_dir, "SampleDoc.doc"); - docx = new File(word_dir, "SampleDoc.docx"); - - ppt = new File(powerpoint_dir, "SampleShow.ppt"); - pptx = new File(powerpoint_dir, "SampleShow.pptx"); - - vsd = new File(visio_dir, "Test_Visio-Some_Random_Text.vsd"); - } - - public void testFile() throws Exception { - // Excel - assertTrue( - ExtractorFactory.createExtractor(xls) - instanceof ExcelExtractor - ); - assertTrue( - ExtractorFactory.createExtractor(xls).getText().length() > 200 - ); - - assertTrue( - ExtractorFactory.createExtractor(xlsx) - instanceof XSSFExcelExtractor - ); - assertTrue( - ExtractorFactory.createExtractor(xlsx).getText().length() > 200 - ); - - // Word - assertTrue( - ExtractorFactory.createExtractor(doc) - instanceof WordExtractor - ); - assertTrue( - ExtractorFactory.createExtractor(doc).getText().length() > 120 - ); - - assertTrue( - ExtractorFactory.createExtractor(docx) - instanceof XWPFWordExtractor - ); - assertTrue( - ExtractorFactory.createExtractor(docx).getText().length() > 120 - ); - - // PowerPoint - assertTrue( - ExtractorFactory.createExtractor(ppt) - instanceof PowerPointExtractor - ); - assertTrue( - ExtractorFactory.createExtractor(ppt).getText().length() > 120 - ); - - assertTrue( - ExtractorFactory.createExtractor(pptx) - instanceof XSLFPowerPointExtractor - ); - assertTrue( - ExtractorFactory.createExtractor(pptx).getText().length() > 120 - ); - - // Visio - assertTrue( - ExtractorFactory.createExtractor(vsd) - instanceof VisioTextExtractor - ); - assertTrue( - ExtractorFactory.createExtractor(vsd).getText().length() > 50 - ); - - // Text - try { - ExtractorFactory.createExtractor(txt); - fail(); - } catch(IllegalArgumentException e) { - // Good - } - } - - public void testInputStream() throws Exception { - // Excel - assertTrue( - ExtractorFactory.createExtractor(new FileInputStream(xls)) - instanceof ExcelExtractor - ); - assertTrue( - ExtractorFactory.createExtractor(new FileInputStream(xls)).getText().length() > 200 - ); - - assertTrue( - ExtractorFactory.createExtractor(new FileInputStream(xlsx)) - instanceof XSSFExcelExtractor - ); - assertTrue( - ExtractorFactory.createExtractor(new FileInputStream(xlsx)).getText().length() > 200 - ); - - // Word - assertTrue( - ExtractorFactory.createExtractor(new FileInputStream(doc)) - instanceof WordExtractor - ); - assertTrue( - ExtractorFactory.createExtractor(new FileInputStream(doc)).getText().length() > 120 - ); - - assertTrue( - ExtractorFactory.createExtractor(new FileInputStream(docx)) - instanceof XWPFWordExtractor - ); - assertTrue( - ExtractorFactory.createExtractor(new FileInputStream(docx)).getText().length() > 120 - ); - - // PowerPoint - assertTrue( - ExtractorFactory.createExtractor(new FileInputStream(ppt)) - instanceof PowerPointExtractor - ); - assertTrue( - ExtractorFactory.createExtractor(new FileInputStream(ppt)).getText().length() > 120 - ); - - assertTrue( - ExtractorFactory.createExtractor(new FileInputStream(pptx)) - instanceof XSLFPowerPointExtractor - ); - assertTrue( - ExtractorFactory.createExtractor(new FileInputStream(pptx)).getText().length() > 120 - ); - - // Visio - assertTrue( - ExtractorFactory.createExtractor(new FileInputStream(vsd)) - instanceof VisioTextExtractor - ); - assertTrue( - ExtractorFactory.createExtractor(new FileInputStream(vsd)).getText().length() > 50 - ); - - // Text - try { - ExtractorFactory.createExtractor(new FileInputStream(txt)); - fail(); - } catch(IllegalArgumentException e) { - // Good - } - } - - public void testPOIFS() throws Exception { - // Excel - assertTrue( - ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls))) - instanceof ExcelExtractor - ); - assertTrue( - ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls))).getText().length() > 200 - ); - - // Word - assertTrue( - ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(doc))) - instanceof WordExtractor - ); - assertTrue( - ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(doc))).getText().length() > 120 - ); - - // PowerPoint - assertTrue( - ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(ppt))) - instanceof PowerPointExtractor - ); - assertTrue( - ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(ppt))).getText().length() > 120 - ); - - // Visio - assertTrue( - ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(vsd))) - instanceof VisioTextExtractor - ); - assertTrue( - ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(vsd))).getText().length() > 50 - ); - - // Text - try { - ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(txt))); - fail(); - } catch(IOException e) { - // Good - } - } - - public void testPackage() throws Exception { - // Excel - assertTrue( - ExtractorFactory.createExtractor(Package.open(xlsx.toString())) - instanceof XSSFExcelExtractor - ); - assertTrue( - ExtractorFactory.createExtractor(Package.open(xlsx.toString())).getText().length() > 200 - ); - - // Word - assertTrue( - ExtractorFactory.createExtractor(Package.open(docx.toString())) - instanceof XWPFWordExtractor - ); - assertTrue( - ExtractorFactory.createExtractor(Package.open(docx.toString())).getText().length() > 120 - ); - - // PowerPoint - assertTrue( - ExtractorFactory.createExtractor(Package.open(pptx.toString())) - instanceof XSLFPowerPointExtractor - ); - assertTrue( - ExtractorFactory.createExtractor(Package.open(pptx.toString())).getText().length() > 120 - ); - - // Text - try { - ExtractorFactory.createExtractor(Package.open(txt.toString())); - fail(); - } catch(InvalidOperationException e) { - // Good - } - } -}