git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1709361 13f79535-47bb-0310-9956-ffa450edef68tags/REL_3_14_BETA1
// Visio - binary | // Visio - binary | ||||
HANDLERS.put(".vsd", new HDGFFileHandler()); | HANDLERS.put(".vsd", new HDGFFileHandler()); | ||||
// Visio - ooxml (currently unsupported) | |||||
// Visio - ooxml | |||||
HANDLERS.put(".vsdm", new XDGFFileHandler()); | HANDLERS.put(".vsdm", new XDGFFileHandler()); | ||||
HANDLERS.put(".vsdx", new XDGFFileHandler()); | HANDLERS.put(".vsdx", new XDGFFileHandler()); | ||||
HANDLERS.put(".vssm", new XDGFFileHandler()); | HANDLERS.put(".vssm", new XDGFFileHandler()); |
==================================================================== */ | ==================================================================== */ | ||||
package org.apache.poi.stress; | package org.apache.poi.stress; | ||||
import java.io.File; | |||||
import java.io.IOException; | |||||
import java.io.InputStream; | import java.io.InputStream; | ||||
import java.util.ArrayList; | |||||
import java.util.List; | |||||
import org.apache.poi.POIXMLDocument; | |||||
import org.apache.poi.openxml4j.exceptions.OpenXML4JException; | |||||
import org.apache.poi.openxml4j.opc.OPCPackage; | import org.apache.poi.openxml4j.opc.OPCPackage; | ||||
import org.apache.poi.openxml4j.opc.PackageAccess; | import org.apache.poi.openxml4j.opc.PackageAccess; | ||||
import org.apache.poi.openxml4j.opc.PackagePart; | |||||
import org.apache.poi.openxml4j.opc.PackageRelationshipTypes; | |||||
import org.apache.poi.util.PackageHelper; | |||||
import org.apache.poi.xdgf.usermodel.XmlVisioDocument; | |||||
import org.junit.Test; | import org.junit.Test; | ||||
public class XDGFFileHandler extends AbstractFileHandler { | public class XDGFFileHandler extends AbstractFileHandler { | ||||
// ignore password protected files | // ignore password protected files | ||||
if (POIXMLDocumentHandler.isEncrypted(stream)) return; | if (POIXMLDocumentHandler.isEncrypted(stream)) return; | ||||
TestXDGFXMLDocument doc = new TestXDGFXMLDocument(stream); | |||||
XmlVisioDocument doc = new XmlVisioDocument(stream); | |||||
new POIXMLDocumentHandler().handlePOIXMLDocument(doc); | new POIXMLDocumentHandler().handlePOIXMLDocument(doc); | ||||
} | } | ||||
@Override | |||||
public void handleExtracting(File file) throws Exception { | |||||
// TODO: extraction/actual operations not supported yet | |||||
} | |||||
// a test-case to test this locally without executing the full TestAllFiles | // a test-case to test this locally without executing the full TestAllFiles | ||||
@Test | @Test | ||||
public void test() throws Exception { | public void test() throws Exception { | ||||
OPCPackage pkg = OPCPackage.open("test-data/diagram/test.vsdx", PackageAccess.READ); | OPCPackage pkg = OPCPackage.open("test-data/diagram/test.vsdx", PackageAccess.READ); | ||||
try { | try { | ||||
TestXDGFXMLDocument doc = new TestXDGFXMLDocument(pkg); | |||||
XmlVisioDocument doc = new XmlVisioDocument(pkg); | |||||
new POIXMLDocumentHandler().handlePOIXMLDocument(doc); | new POIXMLDocumentHandler().handlePOIXMLDocument(doc); | ||||
} finally { | } finally { | ||||
pkg.close(); | pkg.close(); | ||||
} | } | ||||
} | } | ||||
// TODO: Get rid of this when full visio ooxml support is added | |||||
private final static class TestXDGFXMLDocument extends POIXMLDocument { | |||||
public TestXDGFXMLDocument(OPCPackage pkg) { | |||||
super(pkg, PackageRelationshipTypes.VISIO_CORE_DOCUMENT); | |||||
} | |||||
public TestXDGFXMLDocument(InputStream is) throws IOException { | |||||
this(PackageHelper.open(is)); | |||||
} | |||||
public List<PackagePart> getAllEmbedds() throws OpenXML4JException { | |||||
return new ArrayList<PackagePart>(); | |||||
} | |||||
} | |||||
} | } |
import org.apache.poi.poifs.filesystem.OPOIFSFileSystem; | import org.apache.poi.poifs.filesystem.OPOIFSFileSystem; | ||||
import org.apache.poi.poifs.filesystem.OfficeXmlFileException; | import org.apache.poi.poifs.filesystem.OfficeXmlFileException; | ||||
import org.apache.poi.poifs.filesystem.POIFSFileSystem; | import org.apache.poi.poifs.filesystem.POIFSFileSystem; | ||||
import org.apache.poi.xdgf.extractor.XDGFVisioExtractor; | |||||
import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor; | import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor; | ||||
import org.apache.poi.xslf.usermodel.XSLFRelation; | import org.apache.poi.xslf.usermodel.XSLFRelation; | ||||
import org.apache.poi.xslf.usermodel.XSLFSlideShow; | import org.apache.poi.xslf.usermodel.XSLFSlideShow; | ||||
} | } | ||||
if (core.size() == 0) { | if (core.size() == 0) { | ||||
// Could it be a visio one? | // Could it be a visio one? | ||||
PackageRelationshipCollection visio = | |||||
pkg.getRelationshipsByType(VISIO_DOCUMENT_REL); | |||||
if (visio.size() == 1) { | |||||
throw new IllegalArgumentException("Text extraction not supported for Visio OOXML files"); | |||||
} | |||||
core = pkg.getRelationshipsByType(VISIO_DOCUMENT_REL); | |||||
if (core.size() == 1) | |||||
return new XDGFVisioExtractor(pkg); | |||||
} | } | ||||
// Should just be a single core document, complain if not | // Should just be a single core document, complain if not |
package org.apache.poi.xdgf.extractor; | |||||
import java.io.IOException; | |||||
import org.apache.poi.POIXMLDocument; | |||||
import org.apache.poi.POIXMLTextExtractor; | |||||
import org.apache.poi.openxml4j.opc.OPCPackage; | |||||
import org.apache.poi.xdgf.usermodel.XDGFPage; | |||||
import org.apache.poi.xdgf.usermodel.XmlVisioDocument; | |||||
import org.apache.poi.xdgf.usermodel.shape.ShapeTextVisitor; | |||||
/** | |||||
* Helper class to extract text from an OOXML Visio File | |||||
*/ | |||||
public class XDGFVisioExtractor extends POIXMLTextExtractor { | |||||
protected final XmlVisioDocument document; | |||||
public XDGFVisioExtractor(XmlVisioDocument document) { | |||||
super(document); | |||||
this.document = document; | |||||
} | |||||
public XDGFVisioExtractor(OPCPackage openPackage) throws IOException { | |||||
this(new XmlVisioDocument(openPackage)); | |||||
} | |||||
public String getText() { | |||||
ShapeTextVisitor visitor = new ShapeTextVisitor(); | |||||
for (XDGFPage page: document.getPages()) { | |||||
page.getContent().visitShapes(visitor); | |||||
} | |||||
return visitor.getText().toString(); | |||||
} | |||||
public static void main(String [] args) throws IOException { | |||||
if (args.length < 1) { | |||||
System.err.println("Use:"); | |||||
System.err.println(" XDGFVisioExtractor <filename.vsdx>"); | |||||
System.exit(1); | |||||
} | |||||
POIXMLTextExtractor extractor = | |||||
new XDGFVisioExtractor(POIXMLDocument.openPackage( | |||||
args[0] | |||||
)); | |||||
System.out.println(extractor.getText()); | |||||
extractor.close(); | |||||
} | |||||
} |
/** | /** | ||||
* Represents the root document: /visio/document.xml | * Represents the root document: /visio/document.xml | ||||
* | |||||
* You're probably actually looking for {@link XmlVisioDocument}; this
* class only contains metadata about the root document in the OOXML package.
*/ | */ | ||||
public class XDGFDocument { | public class XDGFDocument { | ||||
import java.io.IOException; | import java.io.IOException; | ||||
import java.io.InputStream; | import java.io.InputStream; | ||||
import java.util.ArrayList; | |||||
import java.util.Collection; | import java.util.Collection; | ||||
import java.util.List; | import java.util.List; | ||||
_pages.onDocumentRead(); | _pages.onDocumentRead(); | ||||
} | } | ||||
/** | |||||
* Not currently implemented | |||||
*/ | |||||
@Override | @Override | ||||
public List<PackagePart> getAllEmbedds() throws OpenXML4JException { | public List<PackagePart> getAllEmbedds() throws OpenXML4JException { | ||||
throw new UnsupportedOperationException("Not implemented"); | |||||
return new ArrayList<PackagePart>(); | |||||
} | } | ||||
// | // | ||||
// Useful public API goes here | // Useful public API goes here | ||||
// | // | ||||
/** | |||||
* @return pages ordered by page number | |||||
*/ | |||||
public Collection<XDGFPage> getPages() { | public Collection<XDGFPage> getPages() { | ||||
return _pages.getPageList(); | return _pages.getPageList(); | ||||
} | } |
package org.apache.poi.xdgf.usermodel.shape; | |||||
import java.awt.geom.AffineTransform; | |||||
import org.apache.poi.xdgf.usermodel.XDGFShape; | |||||
/** | |||||
* Only visits text nodes, accumulates text content into a string | |||||
* | |||||
* The text is returned in arbitrary order, with no regards to | |||||
* the location of the text on the page. This may change in the | |||||
* future. | |||||
*/ | |||||
public class ShapeTextVisitor extends ShapeVisitor { | |||||
protected StringBuilder text = new StringBuilder(); | |||||
public static class TextAcceptor implements ShapeVisitorAcceptor { | |||||
public boolean accept(XDGFShape shape) { | |||||
return shape.hasText(); | |||||
} | |||||
} | |||||
protected ShapeVisitorAcceptor getAcceptor() { | |||||
return new TextAcceptor(); | |||||
} | |||||
public void visit(XDGFShape shape, AffineTransform globalTransform, | |||||
int level) { | |||||
text.append(shape.getText().getTextContent().trim()); | |||||
text.append('\n'); | |||||
} | |||||
/** | |||||
* Call this after visitation has completed | |||||
*/ | |||||
public String getText() { | |||||
return text.toString(); | |||||
} | |||||
} |
import org.apache.poi.openxml4j.opc.OPCPackage; | import org.apache.poi.openxml4j.opc.OPCPackage; | ||||
import org.apache.poi.openxml4j.opc.PackageAccess; | import org.apache.poi.openxml4j.opc.PackageAccess; | ||||
import org.apache.poi.poifs.filesystem.POIFSFileSystem; | import org.apache.poi.poifs.filesystem.POIFSFileSystem; | ||||
import org.apache.poi.xdgf.extractor.XDGFVisioExtractor; | |||||
import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor; | import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor; | ||||
import org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor; | import org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor; | ||||
import org.apache.poi.xssf.extractor.XSSFExcelExtractor; | import org.apache.poi.xssf.extractor.XSSFExcelExtractor; | ||||
ExtractorFactory.createExtractor(vsd).getText().length() > 50 | ExtractorFactory.createExtractor(vsd).getText().length() > 50 | ||||
); | ); | ||||
// Visio - vsdx | // Visio - vsdx | ||||
try { | |||||
ExtractorFactory.createExtractor(vsdx); | |||||
fail(); | |||||
} catch(IllegalArgumentException e) { | |||||
// Good | |||||
} | |||||
assertTrue( | |||||
ExtractorFactory.createExtractor(vsdx) | |||||
instanceof XDGFVisioExtractor | |||||
); | |||||
assertTrue( | |||||
ExtractorFactory.createExtractor(vsdx).getText().length() > 20 | |||||
); | |||||
// Publisher | // Publisher | ||||
assertTrue( | assertTrue( | ||||
ExtractorFactory.createExtractor(new FileInputStream(vsd)).getText().length() > 50 | ExtractorFactory.createExtractor(new FileInputStream(vsd)).getText().length() > 50 | ||||
); | ); | ||||
// Visio - vsdx | // Visio - vsdx | ||||
try { | |||||
ExtractorFactory.createExtractor(new FileInputStream(vsdx)); | |||||
fail(); | |||||
} catch(IllegalArgumentException e) { | |||||
// Good | |||||
} | |||||
assertTrue( | |||||
ExtractorFactory.createExtractor(new FileInputStream(vsdx)) | |||||
instanceof XDGFVisioExtractor | |||||
); | |||||
assertTrue( | |||||
ExtractorFactory.createExtractor(new FileInputStream(vsdx)).getText().length() > 20 | |||||
); | |||||
// Publisher | // Publisher | ||||
assertTrue( | assertTrue( | ||||
ExtractorFactory.createExtractor(new FileInputStream(pub)) | ExtractorFactory.createExtractor(new FileInputStream(pub)) | ||||
extractor.getText().length() > 120 | extractor.getText().length() > 120 | ||||
); | ); | ||||
extractor.close(); | extractor.close(); | ||||
// Visio | |||||
assertTrue( | |||||
ExtractorFactory.createExtractor(OPCPackage.open(vsdx.toString())) | |||||
instanceof XDGFVisioExtractor | |||||
); | |||||
assertTrue( | |||||
extractor.getText().length() > 20 | |||||
); | |||||
// Text | // Text | ||||
try { | try { |
package org.apache.poi.xdgf.extractor; | |||||
import java.io.IOException; | |||||
import org.apache.poi.POIDataSamples; | |||||
import org.apache.poi.openxml4j.opc.OPCPackage; | |||||
import org.apache.poi.xdgf.usermodel.XmlVisioDocument; | |||||
import junit.framework.TestCase; | |||||
public class TestXDGFVisioExtractor extends TestCase { | |||||
private POIDataSamples diagrams; | |||||
private OPCPackage pkg; | |||||
private XmlVisioDocument xml; | |||||
protected void setUp() throws Exception { | |||||
diagrams = POIDataSamples.getDiagramInstance(); | |||||
pkg = OPCPackage.open(diagrams.openResourceAsStream("test_text_extraction.vsdx")); | |||||
xml = new XmlVisioDocument(pkg); | |||||
} | |||||
public void testGetSimpleText() throws IOException { | |||||
new XDGFVisioExtractor(xml).close(); | |||||
new XDGFVisioExtractor(pkg).close(); | |||||
XDGFVisioExtractor extractor = new XDGFVisioExtractor(xml); | |||||
extractor.getText(); | |||||
String text = extractor.getText(); | |||||
assertTrue(text.length() > 0); | |||||
assertEquals("Text here\nText there\nText, text, everywhere!\nRouter here\n", | |||||
text); | |||||
extractor.close(); | |||||
} | |||||
} |