From: Yegor Kozlov Date: Sat, 18 Jul 2009 16:49:56 +0000 (+0000) Subject: updated docs on extraction of embedded objects, misc changes in HSSF X-Git-Tag: REL_3_5-FINAL~75 X-Git-Url: https://source.dussan.org/?a=commitdiff_plain;h=e555c088ce23faadf28f818bfbb9e51cf85b1bbb;p=poi.git updated docs on extraction of embedded objects, misc changes in HSSF git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@795394 13f79535-47bb-0310-9956-ffa450edef68 --- diff --git a/src/documentation/content/xdocs/spreadsheet/quick-guide.xml b/src/documentation/content/xdocs/spreadsheet/quick-guide.xml index 859adf961b..d577d0974e 100644 --- a/src/documentation/content/xdocs/spreadsheet/quick-guide.xml +++ b/src/documentation/content/xdocs/spreadsheet/quick-guide.xml @@ -73,6 +73,7 @@
  • How to adjust column width to fit the contents
  • Hyperlinks
  • Data Validation
  • +
  • Embedded Objects
  • Features @@ -1659,5 +1660,84 @@ Examples: dvConstraint = DVConstraint.createFormulaListConstraint("'Sheet1'!$A$1:$A$3");
    + +
    Embedded Objects +

    It is possible to perform more detailed processing of an embedded Excel, Word or PowerPoint document, + or to work with any other type of embedded object.

    +

    HSSF:

    + + POIFSFileSystem fs = new POIFSFileSystem(new FileInputStream("excel_with_embeded.xls")); + HSSFWorkbook workbook = new HSSFWorkbook(fs); + for (HSSFObjectData obj : workbook.getAllEmbeddedObjects()) { + //the OLE2 Class Name of the object + String oleName = obj.getOLE2ClassName(); + if (oleName.equals("Worksheet")) { + DirectoryNode dn = (DirectoryNode) obj.getDirectory(); + HSSFWorkbook embeddedWorkbook = new HSSFWorkbook(dn, fs, false); + //System.out.println(entry.getName() + ": " + embeddedWorkbook.getNumberOfSheets()); + } else if (oleName.equals("Document")) { + DirectoryNode dn = (DirectoryNode) obj.getDirectory(); + HWPFDocument embeddedWordDocument = new HWPFDocument(dn, fs); + //System.out.println(entry.getName() + ": " + embeddedWordDocument.getRange().text()); + } else if (oleName.equals("Presentation")) { + DirectoryNode dn = (DirectoryNode) obj.getDirectory(); + SlideShow embeddedPowerPointDocument = new SlideShow(new HSLFSlideShow(dn, fs)); + //System.out.println(entry.getName() + ": " + embeddedPowerPointDocument.getSlides().length); + } else { + if(obj.hasDirectoryEntry()){ + // The DirectoryEntry is a DirectoryNode. Examine its entries to find out what it is + DirectoryNode dn = (DirectoryNode) obj.getDirectory(); + for (Iterator entries = dn.getEntries(); entries.hasNext();) { + Entry entry = (Entry) entries.next(); + //System.out.println(oleName + "." + entry.getName()); + } + } else { + // There is no DirectoryEntry + // Recover the object's data from the HSSFObjectData instance. + byte[] objectData = obj.getObjectData(); + } + } + } + +

    XSSF:

    + + XSSFWorkbook workbook = new XSSFWorkbook("excel_with_embeded.xlsx"); + for (PackagePart pPart : workbook.getAllEmbedds()) { + String contentType = pPart.getContentType(); + // Excel Workbook - binary (OLE2CDF) file format + if (contentType.equals("application/vnd.ms-excel")) { + HSSFWorkbook embeddedWorkbook = new HSSFWorkbook(pPart.getInputStream()); + } + // Excel Workbook - OpenXML file format + else if (contentType.equals("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet")) { + OPCPackage docPackage = OPCPackage.open(pPart.getInputStream()); + XSSFWorkbook embeddedWorkbook = new XSSFWorkbook(docPackage); + } + // Word Document - binary (OLE2CDF) file format + else if (contentType.equals("application/msword")) { + HWPFDocument document = new HWPFDocument(pPart.getInputStream()); + } + // Word Document - OpenXML file format + else if (contentType.equals("application/vnd.openxmlformats-officedocument.wordprocessingml.document")) { + OPCPackage docPackage = OPCPackage.open(pPart.getInputStream()); + XWPFDocument document = new XWPFDocument(docPackage); + } + // PowerPoint Document - binary file format + else if (contentType.equals("application/vnd.ms-powerpoint")) { + HSLFSlideShow slideShow = new HSLFSlideShow(pPart.getInputStream()); + } + // PowerPoint Document - OpenXML file format + else if (contentType.equals("application/vnd.openxmlformats-officedocument.presentationml.presentation")) { + OPCPackage docPackage = OPCPackage.open(pPart.getInputStream()); + XSLFSlideShow slideShow = new XSLFSlideShow(docPackage); + } + // Any other type of embedded object. + else { + System.out.println("Unknown Embedded Document: " + contentType); + InputStream inputStream = pPart.getInputStream(); + } + } + +
    diff --git a/src/documentation/content/xdocs/status.xml b/src/documentation/content/xdocs/status.xml index 73abd537df..2af0bd1e9f 100644 --- a/src/documentation/content/xdocs/status.xml +++ b/src/documentation/content/xdocs/status.xml @@ -33,6 +33,7 @@ + 47535 - fixed WordExtractor to tolerate files with empty footnote block 47517 - Fixed ExtractorFactory to support .xltx and .dotx files 45556 - Support for extraction of footnotes from docx files 45555 - Support for extraction of endnotes from docx files diff --git a/src/documentation/content/xdocs/text-extraction.xml b/src/documentation/content/xdocs/text-extraction.xml index d71a0bf10c..fa7474bc0f 100644 --- a/src/documentation/content/xdocs/text-extraction.xml +++ b/src/documentation/content/xdocs/text-extraction.xml @@ -102,6 +102,50 @@ org.apache.poi.hdgf.extractor.VisioTextExtractor, which will return text for your file.

    + +
    Embedded Objects +

    Extractors already exist for Excel, Word, PowerPoint and Visio; + if one of these objects is embedded into a worksheet, the ExtractorFactory class can be used to recover an extractor for it. +

    + + FileInputStream fis = new FileInputStream(inputFile); + POIFSFileSystem fileSystem = new POIFSFileSystem(fis); + // Firstly, get an extractor for the Workbook + POIOLE2TextExtractor oleTextExtractor = ExtractorFactory.createExtractor(fileSystem); + // Then an array of extractors for any Excel, Word, PowerPoint + // or Visio objects embedded into it. + POITextExtractor[] embeddedExtractors = ExtractorFactory.getEmbededDocsTextExtractors(oleTextExtractor); + for (POITextExtractor textExtractor : embeddedExtractors) { + // If the embedded object was an Excel spreadsheet. + if (textExtractor instanceof ExcelExtractor) { + ExcelExtractor excelExtractor = (ExcelExtractor) textExtractor; + System.out.println(excelExtractor.getText()); + } + // A Word Document + else if (textExtractor instanceof WordExtractor) { + WordExtractor wordExtractor = (WordExtractor) textExtractor; + String[] paragraphText = wordExtractor.getParagraphText(); + for (String paragraph : paragraphText) { + System.out.println(paragraph); + } + // Display the document's header and footer text + System.out.println("Footer text: " + wordExtractor.getFooterText()); + System.out.println("Header text: " + wordExtractor.getHeaderText()); + } + // PowerPoint Presentation. + else if (textExtractor instanceof PowerPointExtractor) { + PowerPointExtractor powerPointExtractor = (PowerPointExtractor) textExtractor; + System.out.println("Text: " + powerPointExtractor.getText()); + System.out.println("Notes: " + powerPointExtractor.getNotes()); + } + // Visio Drawing + else if (textExtractor instanceof VisioTextExtractor) { + VisioTextExtractor visioTextExtractor = (VisioTextExtractor) textExtractor; + System.out.println("Text: " + visioTextExtractor.getText()); + } + } + +