diff options
author | Dustin Spicuzza <virtuald@apache.org> | 2015-10-19 06:26:57 +0000 |
---|---|---|
committer | Dustin Spicuzza <virtuald@apache.org> | 2015-10-19 06:26:57 +0000 |
commit | bc6ee96e1a409cfeae97c6cd2805b2ef9c420ac7 (patch) | |
tree | c4928538118e474ba2895a91b0742563bc5b8de7 | |
parent | 9716fd9a06c1005f8e9383fcdf8c6d6cb6620aa4 (diff) | |
download | poi-bc6ee96e1a409cfeae97c6cd2805b2ef9c420ac7.tar.gz poi-bc6ee96e1a409cfeae97c6cd2805b2ef9c420ac7.zip |
Add Visio OOXML text extractor + tests
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1709361 13f79535-47bb-0310-9956-ffa450edef68
10 files changed, 176 insertions, 51 deletions
diff --git a/src/integrationtest/org/apache/poi/TestAllFiles.java b/src/integrationtest/org/apache/poi/TestAllFiles.java index d453da27f9..6231065e7f 100644 --- a/src/integrationtest/org/apache/poi/TestAllFiles.java +++ b/src/integrationtest/org/apache/poi/TestAllFiles.java @@ -105,7 +105,7 @@ public class TestAllFiles { // Visio - binary HANDLERS.put(".vsd", new HDGFFileHandler()); - // Visio - ooxml (currently unsupported) + // Visio - ooxml HANDLERS.put(".vsdm", new XDGFFileHandler()); HANDLERS.put(".vsdx", new XDGFFileHandler()); HANDLERS.put(".vssm", new XDGFFileHandler()); diff --git a/src/integrationtest/org/apache/poi/stress/XDGFFileHandler.java b/src/integrationtest/org/apache/poi/stress/XDGFFileHandler.java index 4c4fd6088b..9b7d03f8a5 100644 --- a/src/integrationtest/org/apache/poi/stress/XDGFFileHandler.java +++ b/src/integrationtest/org/apache/poi/stress/XDGFFileHandler.java @@ -16,19 +16,11 @@ ==================================================================== */ package org.apache.poi.stress; -import java.io.File; -import java.io.IOException; import java.io.InputStream; -import java.util.ArrayList; -import java.util.List; -import org.apache.poi.POIXMLDocument; -import org.apache.poi.openxml4j.exceptions.OpenXML4JException; import org.apache.poi.openxml4j.opc.OPCPackage; import org.apache.poi.openxml4j.opc.PackageAccess; -import org.apache.poi.openxml4j.opc.PackagePart; -import org.apache.poi.openxml4j.opc.PackageRelationshipTypes; -import org.apache.poi.util.PackageHelper; +import org.apache.poi.xdgf.usermodel.XmlVisioDocument; import org.junit.Test; public class XDGFFileHandler extends AbstractFileHandler { @@ -37,39 +29,19 @@ public class XDGFFileHandler extends AbstractFileHandler { // ignore password protected files if (POIXMLDocumentHandler.isEncrypted(stream)) return; - TestXDGFXMLDocument doc = new TestXDGFXMLDocument(stream); + XmlVisioDocument doc = new XmlVisioDocument(stream); new POIXMLDocumentHandler().handlePOIXMLDocument(doc); } - - 
@Override - public void handleExtracting(File file) throws Exception { - // TODO: extraction/actual operations not supported yet - } - + // a test-case to test this locally without executing the full TestAllFiles @Test public void test() throws Exception { OPCPackage pkg = OPCPackage.open("test-data/diagram/test.vsdx", PackageAccess.READ); try { - TestXDGFXMLDocument doc = new TestXDGFXMLDocument(pkg); + XmlVisioDocument doc = new XmlVisioDocument(pkg); new POIXMLDocumentHandler().handlePOIXMLDocument(doc); } finally { pkg.close(); } } - - // TODO: Get rid of this when full visio ooxml support is added - private final static class TestXDGFXMLDocument extends POIXMLDocument { - public TestXDGFXMLDocument(OPCPackage pkg) { - super(pkg, PackageRelationshipTypes.VISIO_CORE_DOCUMENT); - } - - public TestXDGFXMLDocument(InputStream is) throws IOException { - this(PackageHelper.open(is)); - } - - public List<PackagePart> getAllEmbedds() throws OpenXML4JException { - return new ArrayList<PackagePart>(); - } - } }
\ No newline at end of file diff --git a/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java b/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java index 59ebeb10d0..906c026375 100644 --- a/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java +++ b/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java @@ -55,6 +55,7 @@ import org.apache.poi.poifs.filesystem.NotOLE2FileException; import org.apache.poi.poifs.filesystem.OPOIFSFileSystem; import org.apache.poi.poifs.filesystem.OfficeXmlFileException; import org.apache.poi.poifs.filesystem.POIFSFileSystem; +import org.apache.poi.xdgf.extractor.XDGFVisioExtractor; import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor; import org.apache.poi.xslf.usermodel.XSLFRelation; import org.apache.poi.xslf.usermodel.XSLFSlideShow; @@ -172,11 +173,9 @@ public class ExtractorFactory { } if (core.size() == 0) { // Could it be a visio one? - PackageRelationshipCollection visio = - pkg.getRelationshipsByType(VISIO_DOCUMENT_REL); - if (visio.size() == 1) { - throw new IllegalArgumentException("Text extraction not supported for Visio OOXML files"); - } + core = pkg.getRelationshipsByType(VISIO_DOCUMENT_REL); + if (core.size() == 1) + return new XDGFVisioExtractor(pkg); } // Should just be a single core document, complain if not diff --git a/src/ooxml/java/org/apache/poi/xdgf/extractor/XDGFVisioExtractor.java b/src/ooxml/java/org/apache/poi/xdgf/extractor/XDGFVisioExtractor.java new file mode 100644 index 0000000000..c49c2121dc --- /dev/null +++ b/src/ooxml/java/org/apache/poi/xdgf/extractor/XDGFVisioExtractor.java @@ -0,0 +1,51 @@ +package org.apache.poi.xdgf.extractor; + +import java.io.IOException; + +import org.apache.poi.POIXMLDocument; +import org.apache.poi.POIXMLTextExtractor; +import org.apache.poi.openxml4j.opc.OPCPackage; +import org.apache.poi.xdgf.usermodel.XDGFPage; +import org.apache.poi.xdgf.usermodel.XmlVisioDocument; +import org.apache.poi.xdgf.usermodel.shape.ShapeTextVisitor; + 
+/** + * Helper class to extract text from an OOXML Visio File + */ +public class XDGFVisioExtractor extends POIXMLTextExtractor { + + protected final XmlVisioDocument document; + + public XDGFVisioExtractor(XmlVisioDocument document) { + super(document); + this.document = document; + } + + public XDGFVisioExtractor(OPCPackage openPackage) throws IOException { + this(new XmlVisioDocument(openPackage)); + } + + public String getText() { + ShapeTextVisitor visitor = new ShapeTextVisitor(); + + for (XDGFPage page: document.getPages()) { + page.getContent().visitShapes(visitor); + } + + return visitor.getText().toString(); + } + + public static void main(String [] args) throws IOException { + if (args.length < 1) { + System.err.println("Use:"); + System.err.println(" XDGFVisioExtractor <filename.vsdx>"); + System.exit(1); + } + POIXMLTextExtractor extractor = + new XDGFVisioExtractor(POIXMLDocument.openPackage( + args[0] + )); + System.out.println(extractor.getText()); + extractor.close(); + } +} diff --git a/src/ooxml/java/org/apache/poi/xdgf/usermodel/XDGFDocument.java b/src/ooxml/java/org/apache/poi/xdgf/usermodel/XDGFDocument.java index dc42fa192d..ba460cc13c 100644 --- a/src/ooxml/java/org/apache/poi/xdgf/usermodel/XDGFDocument.java +++ b/src/ooxml/java/org/apache/poi/xdgf/usermodel/XDGFDocument.java @@ -29,6 +29,9 @@ import com.microsoft.schemas.office.visio.x2012.main.VisioDocumentType; /** * Represents the root document: /visio/document.xml + * + * You're probably actually looking for {@link XmlVisioDocument}, this + * only contains metadata about the root document in the OOXML package. 
*/ public class XDGFDocument { diff --git a/src/ooxml/java/org/apache/poi/xdgf/usermodel/XmlVisioDocument.java b/src/ooxml/java/org/apache/poi/xdgf/usermodel/XmlVisioDocument.java index e5589c7aaa..8794874040 100644 --- a/src/ooxml/java/org/apache/poi/xdgf/usermodel/XmlVisioDocument.java +++ b/src/ooxml/java/org/apache/poi/xdgf/usermodel/XmlVisioDocument.java @@ -19,6 +19,7 @@ package org.apache.poi.xdgf.usermodel; import java.io.IOException; import java.io.InputStream; +import java.util.ArrayList; import java.util.Collection; import java.util.List; @@ -107,15 +108,21 @@ public class XmlVisioDocument extends POIXMLDocument { _pages.onDocumentRead(); } + /** + * Not currently implemented + */ @Override public List<PackagePart> getAllEmbedds() throws OpenXML4JException { - throw new UnsupportedOperationException("Not implemented"); + return new ArrayList<PackagePart>(); } // // Useful public API goes here // + /** + * @return pages ordered by page number + */ public Collection<XDGFPage> getPages() { return _pages.getPageList(); } diff --git a/src/ooxml/java/org/apache/poi/xdgf/usermodel/shape/ShapeTextVisitor.java b/src/ooxml/java/org/apache/poi/xdgf/usermodel/shape/ShapeTextVisitor.java new file mode 100644 index 0000000000..4589bc8ad7 --- /dev/null +++ b/src/ooxml/java/org/apache/poi/xdgf/usermodel/shape/ShapeTextVisitor.java @@ -0,0 +1,41 @@ +package org.apache.poi.xdgf.usermodel.shape; + +import java.awt.geom.AffineTransform; + +import org.apache.poi.xdgf.usermodel.XDGFShape; + +/** + * Only visits text nodes, accumulates text content into a string + * + * The text is returned in arbitrary order, with no regards to + * the location of the text on the page. This may change in the + * future. 
+ */ +public class ShapeTextVisitor extends ShapeVisitor { + + protected StringBuilder text = new StringBuilder(); + + public static class TextAcceptor implements ShapeVisitorAcceptor { + public boolean accept(XDGFShape shape) { + return shape.hasText(); + } + } + + protected ShapeVisitorAcceptor getAcceptor() { + return new TextAcceptor(); + } + + public void visit(XDGFShape shape, AffineTransform globalTransform, + int level) { + text.append(shape.getText().getTextContent().trim()); + text.append('\n'); + } + + /** + * Call this after visitation has completed + */ + public String getText() { + return text.toString(); + } + +} diff --git a/src/ooxml/testcases/org/apache/poi/extractor/TestExtractorFactory.java b/src/ooxml/testcases/org/apache/poi/extractor/TestExtractorFactory.java index 3d2d2a5b15..fba530757a 100644 --- a/src/ooxml/testcases/org/apache/poi/extractor/TestExtractorFactory.java +++ b/src/ooxml/testcases/org/apache/poi/extractor/TestExtractorFactory.java @@ -44,6 +44,7 @@ import org.apache.poi.openxml4j.exceptions.InvalidOperationException; import org.apache.poi.openxml4j.opc.OPCPackage; import org.apache.poi.openxml4j.opc.PackageAccess; import org.apache.poi.poifs.filesystem.POIFSFileSystem; +import org.apache.poi.xdgf.extractor.XDGFVisioExtractor; import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor; import org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor; import org.apache.poi.xssf.extractor.XSSFExcelExtractor; @@ -271,12 +272,13 @@ public class TestExtractorFactory { ExtractorFactory.createExtractor(vsd).getText().length() > 50 ); // Visio - vsdx - try { - ExtractorFactory.createExtractor(vsdx); - fail(); - } catch(IllegalArgumentException e) { - // Good - } + assertTrue( + ExtractorFactory.createExtractor(vsdx) + instanceof XDGFVisioExtractor + ); + assertTrue( + ExtractorFactory.createExtractor(vsdx).getText().length() > 20 + ); // Publisher assertTrue( @@ -391,13 +393,15 @@ public class TestExtractorFactory { 
ExtractorFactory.createExtractor(new FileInputStream(vsd)).getText().length() > 50 ); // Visio - vsdx - try { - ExtractorFactory.createExtractor(new FileInputStream(vsdx)); - fail(); - } catch(IllegalArgumentException e) { - // Good - } + assertTrue( + ExtractorFactory.createExtractor(new FileInputStream(vsdx)) + instanceof XDGFVisioExtractor + ); + assertTrue( + ExtractorFactory.createExtractor(new FileInputStream(vsdx)).getText().length() > 20 + ); + // Publisher assertTrue( ExtractorFactory.createExtractor(new FileInputStream(pub)) @@ -551,6 +555,15 @@ public class TestExtractorFactory { extractor.getText().length() > 120 ); extractor.close(); + + // Visio + assertTrue( + ExtractorFactory.createExtractor(OPCPackage.open(vsdx.toString())) + instanceof XDGFVisioExtractor + ); + assertTrue( + extractor.getText().length() > 20 + ); // Text try { diff --git a/src/ooxml/testcases/org/apache/poi/xdgf/extractor/TestXDGFVisioExtractor.java b/src/ooxml/testcases/org/apache/poi/xdgf/extractor/TestXDGFVisioExtractor.java new file mode 100644 index 0000000000..4c7459518e --- /dev/null +++ b/src/ooxml/testcases/org/apache/poi/xdgf/extractor/TestXDGFVisioExtractor.java @@ -0,0 +1,39 @@ +package org.apache.poi.xdgf.extractor; + +import java.io.IOException; + +import org.apache.poi.POIDataSamples; +import org.apache.poi.openxml4j.opc.OPCPackage; +import org.apache.poi.xdgf.usermodel.XmlVisioDocument; + +import junit.framework.TestCase; + +public class TestXDGFVisioExtractor extends TestCase { + + private POIDataSamples diagrams; + private OPCPackage pkg; + private XmlVisioDocument xml; + + protected void setUp() throws Exception { + diagrams = POIDataSamples.getDiagramInstance(); + + pkg = OPCPackage.open(diagrams.openResourceAsStream("test_text_extraction.vsdx")); + xml = new XmlVisioDocument(pkg); + } + + public void testGetSimpleText() throws IOException { + new XDGFVisioExtractor(xml).close(); + new XDGFVisioExtractor(pkg).close(); + + XDGFVisioExtractor extractor = new 
XDGFVisioExtractor(xml); + extractor.getText(); + + String text = extractor.getText(); + assertTrue(text.length() > 0); + + assertEquals("Text here\nText there\nText, text, everywhere!\nRouter here\n", + text); + + extractor.close(); + } +} diff --git a/test-data/diagram/test_text_extraction.vsdx b/test-data/diagram/test_text_extraction.vsdx Binary files differ new file mode 100644 index 0000000000..39b6401772 --- /dev/null +++ b/test-data/diagram/test_text_extraction.vsdx |