From: Nick Burch Date: Wed, 11 Mar 2015 16:30:02 +0000 (+0000) Subject: Fix inconsistent indents X-Git-Tag: REL_3_12_FINAL~89 X-Git-Url: https://source.dussan.org/?a=commitdiff_plain;h=b22b9cf0d52da7adc2f81ac401f970766723c122;p=poi.git Fix inconsistent indents git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1665933 13f79535-47bb-0310-9956-ffa450edef68 --- diff --git a/src/ooxml/testcases/org/apache/poi/extractor/TestExtractorFactory.java b/src/ooxml/testcases/org/apache/poi/extractor/TestExtractorFactory.java index 35198a80ea..b2edea6a3a 100644 --- a/src/ooxml/testcases/org/apache/poi/extractor/TestExtractorFactory.java +++ b/src/ooxml/testcases/org/apache/poi/extractor/TestExtractorFactory.java @@ -47,261 +47,260 @@ import org.apache.poi.xwpf.extractor.XWPFWordExtractor; * Test that the extractor factory plays nicely */ public class TestExtractorFactory extends TestCase { + private File txt; + + private File xls; + private File xlsx; + private File xltx; + private File xlsEmb; + + private File doc; + private File doc6; + private File doc95; + private File docx; + private File dotx; + private File docEmb; + private File docEmbOOXML; + + private File ppt; + private File pptx; + + private File msg; + private File msgEmb; + private File msgEmbMsg; + + private File vsd; + private File vsdx; + + private File pub; + + private File getFileAndCheck(POIDataSamples samples, String name) { + File file = samples.getFile(name); + + assertNotNull("Did not get a file for " + name, file); + assertTrue("Did not get a type file for " + name, file.isFile()); + assertTrue("File did not exist: " + name, file.exists()); + + return file; + } + @Override + protected void setUp() throws Exception { + super.setUp(); + + POIDataSamples ssTests = POIDataSamples.getSpreadSheetInstance(); + xls = getFileAndCheck(ssTests, "SampleSS.xls"); + xlsx = getFileAndCheck(ssTests, "SampleSS.xlsx"); + xltx = getFileAndCheck(ssTests, "test.xltx"); + xlsEmb = getFileAndCheck(ssTests, 
"excel_with_embeded.xls"); + + POIDataSamples wpTests = POIDataSamples.getDocumentInstance(); + doc = getFileAndCheck(wpTests, "SampleDoc.doc"); + doc6 = getFileAndCheck(wpTests, "Word6.doc"); + doc95 = getFileAndCheck(wpTests, "Word95.doc"); + docx = getFileAndCheck(wpTests, "SampleDoc.docx"); + dotx = getFileAndCheck(wpTests, "test.dotx"); + docEmb = getFileAndCheck(wpTests, "word_with_embeded.doc"); + docEmbOOXML = getFileAndCheck(wpTests, "word_with_embeded_ooxml.doc"); + + POIDataSamples slTests = POIDataSamples.getSlideShowInstance(); + ppt = getFileAndCheck(slTests, "SampleShow.ppt"); + pptx = getFileAndCheck(slTests, "SampleShow.pptx"); + txt = getFileAndCheck(slTests, "SampleShow.txt"); + + POIDataSamples dgTests = POIDataSamples.getDiagramInstance(); + vsd = getFileAndCheck(dgTests, "Test_Visio-Some_Random_Text.vsd"); + vsdx = getFileAndCheck(dgTests, "test.vsdx"); + + POIDataSamples pubTests = POIDataSamples.getPublisherInstance(); + pub = getFileAndCheck(pubTests, "Simple.pub"); + + POIDataSamples olTests = POIDataSamples.getHSMFInstance(); + msg = getFileAndCheck(olTests, "quick.msg"); + msgEmb = getFileAndCheck(olTests, "attachment_test_msg.msg"); + msgEmbMsg = getFileAndCheck(olTests, "attachment_msg_pdf.msg"); + } + + public void testFile() throws Exception { + // Excel + POITextExtractor xlsExtractor = ExtractorFactory.createExtractor(xls); + assertNotNull("Had empty extractor for " + xls, xlsExtractor); + assertTrue("Expected instanceof ExcelExtractor, but had: " + xlsExtractor.getClass(), + xlsExtractor + instanceof ExcelExtractor + ); + assertTrue( + xlsExtractor.getText().length() > 200 + ); + xlsExtractor.close(); + + POITextExtractor extractor = ExtractorFactory.createExtractor(xlsx); + assertTrue( + extractor + instanceof XSSFExcelExtractor + ); + extractor.close(); + + extractor = ExtractorFactory.createExtractor(xlsx); + assertTrue( + extractor.getText().length() > 200 + ); + extractor.close(); + + extractor = 
ExtractorFactory.createExtractor(xltx); + assertTrue( + extractor + instanceof XSSFExcelExtractor + ); + extractor.close(); + + extractor = ExtractorFactory.createExtractor(xltx); + assertTrue( + extractor.getText().contains("test") + ); + extractor.close(); + + + // Word + assertTrue( + ExtractorFactory.createExtractor(doc) + instanceof WordExtractor + ); + assertTrue( + ExtractorFactory.createExtractor(doc).getText().length() > 120 + ); + + assertTrue( + ExtractorFactory.createExtractor(doc6) + instanceof Word6Extractor + ); + assertTrue( + ExtractorFactory.createExtractor(doc6).getText().length() > 20 + ); + + assertTrue( + ExtractorFactory.createExtractor(doc95) + instanceof Word6Extractor + ); + assertTrue( + ExtractorFactory.createExtractor(doc95).getText().length() > 120 + ); + + extractor = ExtractorFactory.createExtractor(docx); + assertTrue( + extractor instanceof XWPFWordExtractor + ); + extractor.close(); + + extractor = ExtractorFactory.createExtractor(docx); + assertTrue( + extractor.getText().length() > 120 + ); + extractor.close(); + + extractor = ExtractorFactory.createExtractor(dotx); + assertTrue( + extractor instanceof XWPFWordExtractor + ); + extractor.close(); + + extractor = ExtractorFactory.createExtractor(dotx); + assertTrue( + extractor.getText().contains("Test") + ); + extractor.close(); + + // PowerPoint + assertTrue( + ExtractorFactory.createExtractor(ppt) + instanceof PowerPointExtractor + ); + assertTrue( + ExtractorFactory.createExtractor(ppt).getText().length() > 120 + ); + + extractor = ExtractorFactory.createExtractor(pptx); + assertTrue( + extractor + instanceof XSLFPowerPointExtractor + ); + extractor.close(); + + extractor = ExtractorFactory.createExtractor(pptx); + assertTrue( + extractor.getText().length() > 120 + ); + extractor.close(); + + // Visio - binary + assertTrue( + ExtractorFactory.createExtractor(vsd) + instanceof VisioTextExtractor + ); + assertTrue( + ExtractorFactory.createExtractor(vsd).getText().length() > 50 
+ ); + // Visio - vsdx + try { + ExtractorFactory.createExtractor(vsdx); + fail(); + } catch(IllegalArgumentException e) { + // Good + } + + // Publisher + assertTrue( + ExtractorFactory.createExtractor(pub) + instanceof PublisherTextExtractor + ); + assertTrue( + ExtractorFactory.createExtractor(pub).getText().length() > 50 + ); + + // Outlook msg + assertTrue( + ExtractorFactory.createExtractor(msg) + instanceof OutlookTextExtactor + ); + assertTrue( + ExtractorFactory.createExtractor(msg).getText().length() > 50 + ); + + // Text + try { + ExtractorFactory.createExtractor(txt); + fail(); + } catch(IllegalArgumentException e) { + // Good + } + } + + public void testInputStream() throws Exception { + // Excel + assertTrue( + ExtractorFactory.createExtractor(new FileInputStream(xls)) + instanceof ExcelExtractor + ); + assertTrue( + ExtractorFactory.createExtractor(new FileInputStream(xls)).getText().length() > 200 + ); + + assertTrue( + ExtractorFactory.createExtractor(new FileInputStream(xlsx)) + instanceof XSSFExcelExtractor + ); + assertTrue( + ExtractorFactory.createExtractor(new FileInputStream(xlsx)).getText().length() > 200 + ); + + // Word + assertTrue( + ExtractorFactory.createExtractor(new FileInputStream(doc)) + instanceof WordExtractor + ); + assertTrue( + ExtractorFactory.createExtractor(new FileInputStream(doc)).getText().length() > 120 + ); - private File txt; - - private File xls; - private File xlsx; - private File xltx; - private File xlsEmb; - - private File doc; - private File doc6; - private File doc95; - private File docx; - private File dotx; - private File docEmb; - private File docEmbOOXML; - - private File ppt; - private File pptx; - - private File msg; - private File msgEmb; - private File msgEmbMsg; - - private File vsd; - private File vsdx; - - private File pub; - - private File getFileAndCheck(POIDataSamples samples, String name) { - File file = samples.getFile(name); - - assertNotNull("Did not get a file for " + name, file); - 
assertTrue("Did not get a type file for " + name, file.isFile()); - assertTrue("File did not exist: " + name, file.exists()); - - return file; - } - @Override - protected void setUp() throws Exception { - super.setUp(); - - POIDataSamples ssTests = POIDataSamples.getSpreadSheetInstance(); - xls = getFileAndCheck(ssTests, "SampleSS.xls"); - xlsx = getFileAndCheck(ssTests, "SampleSS.xlsx"); - xltx = getFileAndCheck(ssTests, "test.xltx"); - xlsEmb = getFileAndCheck(ssTests, "excel_with_embeded.xls"); - - POIDataSamples wpTests = POIDataSamples.getDocumentInstance(); - doc = getFileAndCheck(wpTests, "SampleDoc.doc"); - doc6 = getFileAndCheck(wpTests, "Word6.doc"); - doc95 = getFileAndCheck(wpTests, "Word95.doc"); - docx = getFileAndCheck(wpTests, "SampleDoc.docx"); - dotx = getFileAndCheck(wpTests, "test.dotx"); - docEmb = getFileAndCheck(wpTests, "word_with_embeded.doc"); - docEmbOOXML = getFileAndCheck(wpTests, "word_with_embeded_ooxml.doc"); - - POIDataSamples slTests = POIDataSamples.getSlideShowInstance(); - ppt = getFileAndCheck(slTests, "SampleShow.ppt"); - pptx = getFileAndCheck(slTests, "SampleShow.pptx"); - txt = getFileAndCheck(slTests, "SampleShow.txt"); - - POIDataSamples dgTests = POIDataSamples.getDiagramInstance(); - vsd = getFileAndCheck(dgTests, "Test_Visio-Some_Random_Text.vsd"); - vsdx = getFileAndCheck(dgTests, "test.vsdx"); - - POIDataSamples pubTests = POIDataSamples.getPublisherInstance(); - pub = getFileAndCheck(pubTests, "Simple.pub"); - - POIDataSamples olTests = POIDataSamples.getHSMFInstance(); - msg = getFileAndCheck(olTests, "quick.msg"); - msgEmb = getFileAndCheck(olTests, "attachment_test_msg.msg"); - msgEmbMsg = getFileAndCheck(olTests, "attachment_msg_pdf.msg"); - } - - public void testFile() throws Exception { - // Excel - POITextExtractor xlsExtractor = ExtractorFactory.createExtractor(xls); - assertNotNull("Had empty extractor for " + xls, xlsExtractor); - assertTrue("Expected instanceof ExcelExtractor, but had: " + 
xlsExtractor.getClass(), - xlsExtractor - instanceof ExcelExtractor - ); - assertTrue( - xlsExtractor.getText().length() > 200 - ); - xlsExtractor.close(); - - POITextExtractor extractor = ExtractorFactory.createExtractor(xlsx); - assertTrue( - extractor - instanceof XSSFExcelExtractor - ); - extractor.close(); - - extractor = ExtractorFactory.createExtractor(xlsx); - assertTrue( - extractor.getText().length() > 200 - ); - extractor.close(); - - extractor = ExtractorFactory.createExtractor(xltx); - assertTrue( - extractor - instanceof XSSFExcelExtractor - ); - extractor.close(); - - extractor = ExtractorFactory.createExtractor(xltx); - assertTrue( - extractor.getText().contains("test") - ); - extractor.close(); - - - // Word - assertTrue( - ExtractorFactory.createExtractor(doc) - instanceof WordExtractor - ); - assertTrue( - ExtractorFactory.createExtractor(doc).getText().length() > 120 - ); - - assertTrue( - ExtractorFactory.createExtractor(doc6) - instanceof Word6Extractor - ); - assertTrue( - ExtractorFactory.createExtractor(doc6).getText().length() > 20 - ); - - assertTrue( - ExtractorFactory.createExtractor(doc95) - instanceof Word6Extractor - ); - assertTrue( - ExtractorFactory.createExtractor(doc95).getText().length() > 120 - ); - - extractor = ExtractorFactory.createExtractor(docx); - assertTrue( - extractor instanceof XWPFWordExtractor - ); - extractor.close(); - - extractor = ExtractorFactory.createExtractor(docx); - assertTrue( - extractor.getText().length() > 120 - ); - extractor.close(); - - extractor = ExtractorFactory.createExtractor(dotx); - assertTrue( - extractor instanceof XWPFWordExtractor - ); - extractor.close(); - - extractor = ExtractorFactory.createExtractor(dotx); - assertTrue( - extractor.getText().contains("Test") - ); - extractor.close(); - - // PowerPoint - assertTrue( - ExtractorFactory.createExtractor(ppt) - instanceof PowerPointExtractor - ); - assertTrue( - ExtractorFactory.createExtractor(ppt).getText().length() > 120 - ); - - 
extractor = ExtractorFactory.createExtractor(pptx); - assertTrue( - extractor - instanceof XSLFPowerPointExtractor - ); - extractor.close(); - - extractor = ExtractorFactory.createExtractor(pptx); - assertTrue( - extractor.getText().length() > 120 - ); - extractor.close(); - - // Visio - binary - assertTrue( - ExtractorFactory.createExtractor(vsd) - instanceof VisioTextExtractor - ); - assertTrue( - ExtractorFactory.createExtractor(vsd).getText().length() > 50 - ); - // Visio - vsdx - try { - ExtractorFactory.createExtractor(vsdx); - fail(); - } catch(IllegalArgumentException e) { - // Good - } - - // Publisher - assertTrue( - ExtractorFactory.createExtractor(pub) - instanceof PublisherTextExtractor - ); - assertTrue( - ExtractorFactory.createExtractor(pub).getText().length() > 50 - ); - - // Outlook msg - assertTrue( - ExtractorFactory.createExtractor(msg) - instanceof OutlookTextExtactor - ); - assertTrue( - ExtractorFactory.createExtractor(msg).getText().length() > 50 - ); - - // Text - try { - ExtractorFactory.createExtractor(txt); - fail(); - } catch(IllegalArgumentException e) { - // Good - } - } - - public void testInputStream() throws Exception { - // Excel - assertTrue( - ExtractorFactory.createExtractor(new FileInputStream(xls)) - instanceof ExcelExtractor - ); - assertTrue( - ExtractorFactory.createExtractor(new FileInputStream(xls)).getText().length() > 200 - ); - - assertTrue( - ExtractorFactory.createExtractor(new FileInputStream(xlsx)) - instanceof XSSFExcelExtractor - ); - assertTrue( - ExtractorFactory.createExtractor(new FileInputStream(xlsx)).getText().length() > 200 - ); - - // Word - assertTrue( - ExtractorFactory.createExtractor(new FileInputStream(doc)) - instanceof WordExtractor - ); - assertTrue( - ExtractorFactory.createExtractor(new FileInputStream(doc)).getText().length() > 120 - ); - assertTrue( ExtractorFactory.createExtractor(new FileInputStream(doc6)) instanceof Word6Extractor @@ -309,7 +308,7 @@ public class TestExtractorFactory 
extends TestCase { assertTrue( ExtractorFactory.createExtractor(new FileInputStream(doc6)).getText().length() > 20 ); - + assertTrue( ExtractorFactory.createExtractor(new FileInputStream(doc95)) instanceof Word6Extractor @@ -317,99 +316,99 @@ public class TestExtractorFactory extends TestCase { assertTrue( ExtractorFactory.createExtractor(new FileInputStream(doc95)).getText().length() > 120 ); - - assertTrue( - ExtractorFactory.createExtractor(new FileInputStream(docx)) - instanceof XWPFWordExtractor - ); - assertTrue( - ExtractorFactory.createExtractor(new FileInputStream(docx)).getText().length() > 120 - ); - - // PowerPoint - assertTrue( - ExtractorFactory.createExtractor(new FileInputStream(ppt)) - instanceof PowerPointExtractor - ); - assertTrue( - ExtractorFactory.createExtractor(new FileInputStream(ppt)).getText().length() > 120 - ); - - assertTrue( - ExtractorFactory.createExtractor(new FileInputStream(pptx)) - instanceof XSLFPowerPointExtractor - ); - assertTrue( - ExtractorFactory.createExtractor(new FileInputStream(pptx)).getText().length() > 120 - ); - - // Visio - assertTrue( - ExtractorFactory.createExtractor(new FileInputStream(vsd)) - instanceof VisioTextExtractor - ); - assertTrue( - ExtractorFactory.createExtractor(new FileInputStream(vsd)).getText().length() > 50 - ); - // Visio - vsdx - try { - ExtractorFactory.createExtractor(new FileInputStream(vsdx)); - fail(); - } catch(IllegalArgumentException e) { - // Good - } - - // Publisher - assertTrue( - ExtractorFactory.createExtractor(new FileInputStream(pub)) - instanceof PublisherTextExtractor - ); - assertTrue( - ExtractorFactory.createExtractor(new FileInputStream(pub)).getText().length() > 50 - ); - - // Outlook msg - assertTrue( - ExtractorFactory.createExtractor(new FileInputStream(msg)) - instanceof OutlookTextExtactor - ); - assertTrue( - ExtractorFactory.createExtractor(new FileInputStream(msg)).getText().length() > 50 - ); - - // Text - try { - FileInputStream stream = new 
FileInputStream(txt); - try { + + assertTrue( + ExtractorFactory.createExtractor(new FileInputStream(docx)) + instanceof XWPFWordExtractor + ); + assertTrue( + ExtractorFactory.createExtractor(new FileInputStream(docx)).getText().length() > 120 + ); + + // PowerPoint + assertTrue( + ExtractorFactory.createExtractor(new FileInputStream(ppt)) + instanceof PowerPointExtractor + ); + assertTrue( + ExtractorFactory.createExtractor(new FileInputStream(ppt)).getText().length() > 120 + ); + + assertTrue( + ExtractorFactory.createExtractor(new FileInputStream(pptx)) + instanceof XSLFPowerPointExtractor + ); + assertTrue( + ExtractorFactory.createExtractor(new FileInputStream(pptx)).getText().length() > 120 + ); + + // Visio + assertTrue( + ExtractorFactory.createExtractor(new FileInputStream(vsd)) + instanceof VisioTextExtractor + ); + assertTrue( + ExtractorFactory.createExtractor(new FileInputStream(vsd)).getText().length() > 50 + ); + // Visio - vsdx + try { + ExtractorFactory.createExtractor(new FileInputStream(vsdx)); + fail(); + } catch(IllegalArgumentException e) { + // Good + } + + // Publisher + assertTrue( + ExtractorFactory.createExtractor(new FileInputStream(pub)) + instanceof PublisherTextExtractor + ); + assertTrue( + ExtractorFactory.createExtractor(new FileInputStream(pub)).getText().length() > 50 + ); + + // Outlook msg + assertTrue( + ExtractorFactory.createExtractor(new FileInputStream(msg)) + instanceof OutlookTextExtactor + ); + assertTrue( + ExtractorFactory.createExtractor(new FileInputStream(msg)).getText().length() > 50 + ); + + // Text + try { + FileInputStream stream = new FileInputStream(txt); + try { ExtractorFactory.createExtractor(stream); - fail(); - } finally { - stream.close(); - } - } catch(IllegalArgumentException e) { - // Good - } - } - - public void testPOIFS() throws Exception { - // Excel - assertTrue( - ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls))) - instanceof ExcelExtractor - ); - assertTrue( - 
ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls))).getText().length() > 200 - ); - - // Word - assertTrue( - ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(doc))) - instanceof WordExtractor - ); - assertTrue( - ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(doc))).getText().length() > 120 - ); - + fail(); + } finally { + stream.close(); + } + } catch(IllegalArgumentException e) { + // Good + } + } + + public void testPOIFS() throws Exception { + // Excel + assertTrue( + ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls))) + instanceof ExcelExtractor + ); + assertTrue( + ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls))).getText().length() > 200 + ); + + // Word + assertTrue( + ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(doc))) + instanceof WordExtractor + ); + assertTrue( + ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(doc))).getText().length() > 120 + ); + assertTrue( ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(doc6))) instanceof Word6Extractor @@ -417,7 +416,7 @@ public class TestExtractorFactory extends TestCase { assertTrue( ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(doc6))).getText().length() > 20 ); - + assertTrue( ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(doc95))) instanceof Word6Extractor @@ -425,297 +424,297 @@ public class TestExtractorFactory extends TestCase { assertTrue( ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(doc95))).getText().length() > 120 ); - - // PowerPoint - assertTrue( - ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(ppt))) - instanceof PowerPointExtractor - ); - assertTrue( - ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(ppt))).getText().length() > 120 - ); - - // Visio - 
assertTrue( - ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(vsd))) - instanceof VisioTextExtractor - ); - assertTrue( - ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(vsd))).getText().length() > 50 - ); - - // Publisher - assertTrue( - ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(pub))) - instanceof PublisherTextExtractor - ); - assertTrue( - ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(pub))).getText().length() > 50 - ); - - // Outlook msg - assertTrue( - ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(msg))) - instanceof OutlookTextExtactor - ); - assertTrue( - ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(msg))).getText().length() > 50 - ); - - // Text - try { - ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(txt))); - fail(); - } catch(IOException e) { - // Good - } - } - - public void testPackage() throws Exception { - // Excel - POIXMLTextExtractor extractor = ExtractorFactory.createExtractor(OPCPackage.open(xlsx.toString(), PackageAccess.READ)); - assertTrue( - extractor - instanceof XSSFExcelExtractor - ); + + // PowerPoint + assertTrue( + ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(ppt))) + instanceof PowerPointExtractor + ); + assertTrue( + ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(ppt))).getText().length() > 120 + ); + + // Visio + assertTrue( + ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(vsd))) + instanceof VisioTextExtractor + ); + assertTrue( + ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(vsd))).getText().length() > 50 + ); + + // Publisher + assertTrue( + ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(pub))) + instanceof PublisherTextExtractor + ); + assertTrue( + ExtractorFactory.createExtractor(new POIFSFileSystem(new 
FileInputStream(pub))).getText().length() > 50 + ); + + // Outlook msg + assertTrue( + ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(msg))) + instanceof OutlookTextExtactor + ); + assertTrue( + ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(msg))).getText().length() > 50 + ); + + // Text + try { + ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(txt))); + fail(); + } catch(IOException e) { + // Good + } + } + + public void testPackage() throws Exception { + // Excel + POIXMLTextExtractor extractor = ExtractorFactory.createExtractor(OPCPackage.open(xlsx.toString(), PackageAccess.READ)); + assertTrue( + extractor + instanceof XSSFExcelExtractor + ); extractor.close(); - extractor = ExtractorFactory.createExtractor(OPCPackage.open(xlsx.toString())); + extractor = ExtractorFactory.createExtractor(OPCPackage.open(xlsx.toString())); assertTrue(extractor.getText().length() > 200); extractor.close(); - - // Word - extractor = ExtractorFactory.createExtractor(OPCPackage.open(docx.toString())); + + // Word + extractor = ExtractorFactory.createExtractor(OPCPackage.open(docx.toString())); assertTrue( - extractor - instanceof XWPFWordExtractor - ); + extractor + instanceof XWPFWordExtractor + ); + extractor.close(); + + extractor = ExtractorFactory.createExtractor(OPCPackage.open(docx.toString())); + assertTrue( + extractor.getText().length() > 120 + ); extractor.close(); - - extractor = ExtractorFactory.createExtractor(OPCPackage.open(docx.toString())); + + // PowerPoint + extractor = ExtractorFactory.createExtractor(OPCPackage.open(pptx.toString())); assertTrue( - extractor.getText().length() > 120 - ); + extractor + instanceof XSLFPowerPointExtractor + ); extractor.close(); - - // PowerPoint - extractor = ExtractorFactory.createExtractor(OPCPackage.open(pptx.toString())); + + extractor = ExtractorFactory.createExtractor(OPCPackage.open(pptx.toString())); assertTrue( - extractor - instanceof 
XSLFPowerPointExtractor - ); + extractor.getText().length() > 120 + ); extractor.close(); - extractor = ExtractorFactory.createExtractor(OPCPackage.open(pptx.toString())); + // Text + try { + ExtractorFactory.createExtractor(OPCPackage.open(txt.toString())); + fail(); + } catch(InvalidOperationException e) { + // Good + } + } + + public void testPreferEventBased() throws Exception { + assertFalse(ExtractorFactory.getPreferEventExtractor()); + assertFalse(ExtractorFactory.getThreadPrefersEventExtractors()); + assertNull(ExtractorFactory.getAllThreadsPreferEventExtractors()); + + ExtractorFactory.setThreadPrefersEventExtractors(true); + + assertTrue(ExtractorFactory.getPreferEventExtractor()); + assertTrue(ExtractorFactory.getThreadPrefersEventExtractors()); + assertNull(ExtractorFactory.getAllThreadsPreferEventExtractors()); + + ExtractorFactory.setAllThreadsPreferEventExtractors(false); + + assertFalse(ExtractorFactory.getPreferEventExtractor()); + assertTrue(ExtractorFactory.getThreadPrefersEventExtractors()); + assertEquals(Boolean.FALSE, ExtractorFactory.getAllThreadsPreferEventExtractors()); + + ExtractorFactory.setAllThreadsPreferEventExtractors(null); + + assertTrue(ExtractorFactory.getPreferEventExtractor()); + assertTrue(ExtractorFactory.getThreadPrefersEventExtractors()); + assertNull(ExtractorFactory.getAllThreadsPreferEventExtractors()); + + + // Check we get the right extractors now + POITextExtractor extractor = ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls))); assertTrue( - extractor.getText().length() > 120 - ); + extractor + instanceof EventBasedExcelExtractor + ); + extractor.close(); + extractor = ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls))); + assertTrue( + extractor.getText().length() > 200 + ); + extractor.close(); + + extractor = ExtractorFactory.createExtractor(OPCPackage.open(xlsx.toString(), PackageAccess.READ)); + assertTrue(extractor instanceof 
XSSFEventBasedExcelExtractor); + extractor.close(); + + extractor = ExtractorFactory.createExtractor(OPCPackage.open(xlsx.toString(), PackageAccess.READ)); + assertTrue( + extractor.getText().length() > 200 + ); + extractor.close(); + + + // Put back to normal + ExtractorFactory.setThreadPrefersEventExtractors(false); + assertFalse(ExtractorFactory.getPreferEventExtractor()); + assertFalse(ExtractorFactory.getThreadPrefersEventExtractors()); + assertNull(ExtractorFactory.getAllThreadsPreferEventExtractors()); + + // And back + extractor = ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls))); + assertTrue( + extractor + instanceof ExcelExtractor + ); + extractor.close(); + extractor = ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls))); + assertTrue( + extractor.getText().length() > 200 + ); + extractor.close(); + + extractor = ExtractorFactory.createExtractor(OPCPackage.open(xlsx.toString(), PackageAccess.READ)); + assertTrue( + extractor + instanceof XSSFExcelExtractor + ); + extractor.close(); + extractor = ExtractorFactory.createExtractor(OPCPackage.open(xlsx.toString())); + assertTrue( + extractor.getText().length() > 200 + ); extractor.close(); - - // Text - try { - ExtractorFactory.createExtractor(OPCPackage.open(txt.toString())); - fail(); - } catch(InvalidOperationException e) { - // Good - } - } - - public void testPreferEventBased() throws Exception { - assertFalse(ExtractorFactory.getPreferEventExtractor()); - assertFalse(ExtractorFactory.getThreadPrefersEventExtractors()); - assertNull(ExtractorFactory.getAllThreadsPreferEventExtractors()); - - ExtractorFactory.setThreadPrefersEventExtractors(true); - - assertTrue(ExtractorFactory.getPreferEventExtractor()); - assertTrue(ExtractorFactory.getThreadPrefersEventExtractors()); - assertNull(ExtractorFactory.getAllThreadsPreferEventExtractors()); - - ExtractorFactory.setAllThreadsPreferEventExtractors(false); - - 
assertFalse(ExtractorFactory.getPreferEventExtractor()); - assertTrue(ExtractorFactory.getThreadPrefersEventExtractors()); - assertEquals(Boolean.FALSE, ExtractorFactory.getAllThreadsPreferEventExtractors()); - - ExtractorFactory.setAllThreadsPreferEventExtractors(null); - - assertTrue(ExtractorFactory.getPreferEventExtractor()); - assertTrue(ExtractorFactory.getThreadPrefersEventExtractors()); - assertNull(ExtractorFactory.getAllThreadsPreferEventExtractors()); - - - // Check we get the right extractors now - POITextExtractor extractor = ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls))); - assertTrue( - extractor - instanceof EventBasedExcelExtractor - ); - extractor.close(); - extractor = ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls))); - assertTrue( - extractor.getText().length() > 200 - ); - extractor.close(); - - extractor = ExtractorFactory.createExtractor(OPCPackage.open(xlsx.toString(), PackageAccess.READ)); - assertTrue(extractor instanceof XSSFEventBasedExcelExtractor); - extractor.close(); - - extractor = ExtractorFactory.createExtractor(OPCPackage.open(xlsx.toString(), PackageAccess.READ)); - assertTrue( - extractor.getText().length() > 200 - ); - extractor.close(); - - - // Put back to normal - ExtractorFactory.setThreadPrefersEventExtractors(false); - assertFalse(ExtractorFactory.getPreferEventExtractor()); - assertFalse(ExtractorFactory.getThreadPrefersEventExtractors()); - assertNull(ExtractorFactory.getAllThreadsPreferEventExtractors()); - - // And back - extractor = ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls))); - assertTrue( - extractor - instanceof ExcelExtractor - ); - extractor.close(); - extractor = ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls))); - assertTrue( - extractor.getText().length() > 200 - ); - extractor.close(); - - extractor = ExtractorFactory.createExtractor(OPCPackage.open(xlsx.toString(), 
PackageAccess.READ)); - assertTrue( - extractor - instanceof XSSFExcelExtractor - ); - extractor.close(); - extractor = ExtractorFactory.createExtractor(OPCPackage.open(xlsx.toString())); - assertTrue( - extractor.getText().length() > 200 - ); - extractor.close(); - } - - /** - * Test embeded docs text extraction. For now, only - * does poifs embeded, but will do ooxml ones - * at some point. - */ - public void testEmbeded() throws Exception { - POIOLE2TextExtractor ext; - POITextExtractor[] embeds; - - // No embedings - ext = (POIOLE2TextExtractor) - ExtractorFactory.createExtractor(xls); - embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext); - assertEquals(0, embeds.length); - - // Excel - ext = (POIOLE2TextExtractor) - ExtractorFactory.createExtractor(xlsEmb); - embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext); - - assertEquals(6, embeds.length); - int numWord = 0, numXls = 0, numPpt = 0, numMsg = 0, numWordX; - for(int i=0; i<embeds.length; i++) { - assertTrue(embeds[i].getText().length() > 20); - - if(embeds[i] instanceof PowerPointExtractor) numPpt++; - else if(embeds[i] instanceof ExcelExtractor) numXls++; - else if(embeds[i] instanceof WordExtractor) numWord++; - else if(embeds[i] instanceof OutlookTextExtactor) numMsg++; - } - assertEquals(2, numPpt); - assertEquals(2, numXls); - assertEquals(2, numWord); - assertEquals(0, numMsg); - - // Word - ext = (POIOLE2TextExtractor) - ExtractorFactory.createExtractor(docEmb); - embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext); - - numWord = 0; numXls = 0; numPpt = 0; numMsg = 0; - assertEquals(4, embeds.length); - for(int i=0; i<embeds.length; i++) { - assertTrue(embeds[i].getText().length() > 20); - if(embeds[i] instanceof PowerPointExtractor) numPpt++; - else if(embeds[i] instanceof ExcelExtractor) numXls++; - else if(embeds[i] instanceof WordExtractor) numWord++; - else if(embeds[i] instanceof OutlookTextExtactor) numMsg++; - } - assertEquals(1, numPpt); - assertEquals(2, numXls); - assertEquals(1, numWord); - assertEquals(0, numMsg); - - // Word which contains an OOXML file - ext = (POIOLE2TextExtractor) -
ExtractorFactory.createExtractor(docEmbOOXML); - embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext); - - numWord = 0; numXls = 0; numPpt = 0; numMsg = 0; numWordX = 0; - assertEquals(3, embeds.length); - for(int i=0; i<embeds.length; i++) { - assertTrue(embeds[i].getText().length() > 20); - if(embeds[i] instanceof PowerPointExtractor) numPpt++; - else if(embeds[i] instanceof ExcelExtractor) numXls++; - else if(embeds[i] instanceof WordExtractor) numWord++; - else if(embeds[i] instanceof OutlookTextExtactor) numMsg++; - else if(embeds[i] instanceof XWPFWordExtractor) numWordX++; - } - assertEquals(1, numPpt); - assertEquals(1, numXls); - assertEquals(0, numWord); - assertEquals(1, numWordX); - assertEquals(0, numMsg); - - // Outlook - ext = (OutlookTextExtactor) - ExtractorFactory.createExtractor(msgEmb); - embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext); - - numWord = 0; numXls = 0; numPpt = 0; numMsg = 0; - assertEquals(1, embeds.length); - for(int i=0; i<embeds.length; i++) { - assertTrue(embeds[i].getText().length() > 20); - if(embeds[i] instanceof PowerPointExtractor) numPpt++; - else if(embeds[i] instanceof ExcelExtractor) numXls++; - else if(embeds[i] instanceof WordExtractor) numWord++; - else if(embeds[i] instanceof OutlookTextExtactor) numMsg++; - } - assertEquals(0, numPpt); - assertEquals(0, numXls); - assertEquals(1, numWord); - assertEquals(0, numMsg); - - // Outlook with another outlook file in it - ext = (OutlookTextExtactor) - ExtractorFactory.createExtractor(msgEmbMsg); - embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext); - - numWord = 0; numXls = 0; numPpt = 0; numMsg = 0; - assertEquals(1, embeds.length); - for(int i=0; i<embeds.length; i++) { - assertTrue(embeds[i].getText().length() > 20); - if(embeds[i] instanceof PowerPointExtractor) numPpt++; - else if(embeds[i] instanceof ExcelExtractor) numXls++; - else if(embeds[i] instanceof WordExtractor) numWord++; - else if(embeds[i] instanceof OutlookTextExtactor) numMsg++; - } - assertEquals(0, numPpt); - assertEquals(0, numXls); - assertEquals(0, numWord); - assertEquals(1, numMsg); - - - // TODO - PowerPoint - // TODO - Publisher - // TODO - Visio - } + } + +
/** + * Test embeded docs text extraction. For now, only + * does poifs embeded, but will do ooxml ones + * at some point. + */ + public void testEmbeded() throws Exception { + POIOLE2TextExtractor ext; + POITextExtractor[] embeds; + + // No embedings + ext = (POIOLE2TextExtractor) + ExtractorFactory.createExtractor(xls); + embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext); + assertEquals(0, embeds.length); + + // Excel + ext = (POIOLE2TextExtractor) + ExtractorFactory.createExtractor(xlsEmb); + embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext); + + assertEquals(6, embeds.length); + int numWord = 0, numXls = 0, numPpt = 0, numMsg = 0, numWordX; + for(int i=0; i 20); + + if(embeds[i] instanceof PowerPointExtractor) numPpt++; + else if(embeds[i] instanceof ExcelExtractor) numXls++; + else if(embeds[i] instanceof WordExtractor) numWord++; + else if(embeds[i] instanceof OutlookTextExtactor) numMsg++; + } + assertEquals(2, numPpt); + assertEquals(2, numXls); + assertEquals(2, numWord); + assertEquals(0, numMsg); + + // Word + ext = (POIOLE2TextExtractor) + ExtractorFactory.createExtractor(docEmb); + embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext); + + numWord = 0; numXls = 0; numPpt = 0; numMsg = 0; + assertEquals(4, embeds.length); + for(int i=0; i 20); + if(embeds[i] instanceof PowerPointExtractor) numPpt++; + else if(embeds[i] instanceof ExcelExtractor) numXls++; + else if(embeds[i] instanceof WordExtractor) numWord++; + else if(embeds[i] instanceof OutlookTextExtactor) numMsg++; + } + assertEquals(1, numPpt); + assertEquals(2, numXls); + assertEquals(1, numWord); + assertEquals(0, numMsg); + + // Word which contains an OOXML file + ext = (POIOLE2TextExtractor) + ExtractorFactory.createExtractor(docEmbOOXML); + embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext); + + numWord = 0; numXls = 0; numPpt = 0; numMsg = 0; numWordX = 0; + assertEquals(3, embeds.length); + for(int i=0; i 20); + if(embeds[i] instanceof 
PowerPointExtractor) numPpt++; + else if(embeds[i] instanceof ExcelExtractor) numXls++; + else if(embeds[i] instanceof WordExtractor) numWord++; + else if(embeds[i] instanceof OutlookTextExtactor) numMsg++; + else if(embeds[i] instanceof XWPFWordExtractor) numWordX++; + } + assertEquals(1, numPpt); + assertEquals(1, numXls); + assertEquals(0, numWord); + assertEquals(1, numWordX); + assertEquals(0, numMsg); + + // Outlook + ext = (OutlookTextExtactor) + ExtractorFactory.createExtractor(msgEmb); + embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext); + + numWord = 0; numXls = 0; numPpt = 0; numMsg = 0; + assertEquals(1, embeds.length); + for(int i=0; i 20); + if(embeds[i] instanceof PowerPointExtractor) numPpt++; + else if(embeds[i] instanceof ExcelExtractor) numXls++; + else if(embeds[i] instanceof WordExtractor) numWord++; + else if(embeds[i] instanceof OutlookTextExtactor) numMsg++; + } + assertEquals(0, numPpt); + assertEquals(0, numXls); + assertEquals(1, numWord); + assertEquals(0, numMsg); + + // Outlook with another outlook file in it + ext = (OutlookTextExtactor) + ExtractorFactory.createExtractor(msgEmbMsg); + embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext); + + numWord = 0; numXls = 0; numPpt = 0; numMsg = 0; + assertEquals(1, embeds.length); + for(int i=0; i 20); + if(embeds[i] instanceof PowerPointExtractor) numPpt++; + else if(embeds[i] instanceof ExcelExtractor) numXls++; + else if(embeds[i] instanceof WordExtractor) numWord++; + else if(embeds[i] instanceof OutlookTextExtactor) numMsg++; + } + assertEquals(0, numPpt); + assertEquals(0, numXls); + assertEquals(0, numWord); + assertEquals(1, numMsg); + + + // TODO - PowerPoint + // TODO - Publisher + // TODO - Visio + } }