aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Dustin Spicuzza <virtuald@apache.org> 2015-10-19 06:26:57 +0000
committer Dustin Spicuzza <virtuald@apache.org> 2015-10-19 06:26:57 +0000
commit bc6ee96e1a409cfeae97c6cd2805b2ef9c420ac7 (patch)
tree c4928538118e474ba2895a91b0742563bc5b8de7
parent 9716fd9a06c1005f8e9383fcdf8c6d6cb6620aa4 (diff)
download poi-bc6ee96e1a409cfeae97c6cd2805b2ef9c420ac7.tar.gz
poi-bc6ee96e1a409cfeae97c6cd2805b2ef9c420ac7.zip
Add Visio OOXML text extractor + tests
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1709361 13f79535-47bb-0310-9956-ffa450edef68
-rw-r--r-- src/integrationtest/org/apache/poi/TestAllFiles.java | 2
-rw-r--r-- src/integrationtest/org/apache/poi/stress/XDGFFileHandler.java | 36
-rw-r--r-- src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java | 9
-rw-r--r-- src/ooxml/java/org/apache/poi/xdgf/extractor/XDGFVisioExtractor.java | 51
-rw-r--r-- src/ooxml/java/org/apache/poi/xdgf/usermodel/XDGFDocument.java | 3
-rw-r--r-- src/ooxml/java/org/apache/poi/xdgf/usermodel/XmlVisioDocument.java | 9
-rw-r--r-- src/ooxml/java/org/apache/poi/xdgf/usermodel/shape/ShapeTextVisitor.java | 41
-rw-r--r-- src/ooxml/testcases/org/apache/poi/extractor/TestExtractorFactory.java | 37
-rw-r--r-- src/ooxml/testcases/org/apache/poi/xdgf/extractor/TestXDGFVisioExtractor.java | 39
-rw-r--r-- test-data/diagram/test_text_extraction.vsdx | bin 0 -> 22343 bytes
10 files changed, 176 insertions, 51 deletions
diff --git a/src/integrationtest/org/apache/poi/TestAllFiles.java b/src/integrationtest/org/apache/poi/TestAllFiles.java
index d453da27f9..6231065e7f 100644
--- a/src/integrationtest/org/apache/poi/TestAllFiles.java
+++ b/src/integrationtest/org/apache/poi/TestAllFiles.java
@@ -105,7 +105,7 @@ public class TestAllFiles {
// Visio - binary
HANDLERS.put(".vsd", new HDGFFileHandler());
- // Visio - ooxml (currently unsupported)
+ // Visio - ooxml
HANDLERS.put(".vsdm", new XDGFFileHandler());
HANDLERS.put(".vsdx", new XDGFFileHandler());
HANDLERS.put(".vssm", new XDGFFileHandler());
diff --git a/src/integrationtest/org/apache/poi/stress/XDGFFileHandler.java b/src/integrationtest/org/apache/poi/stress/XDGFFileHandler.java
index 4c4fd6088b..9b7d03f8a5 100644
--- a/src/integrationtest/org/apache/poi/stress/XDGFFileHandler.java
+++ b/src/integrationtest/org/apache/poi/stress/XDGFFileHandler.java
@@ -16,19 +16,11 @@
==================================================================== */
package org.apache.poi.stress;
-import java.io.File;
-import java.io.IOException;
import java.io.InputStream;
-import java.util.ArrayList;
-import java.util.List;
-import org.apache.poi.POIXMLDocument;
-import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.openxml4j.opc.PackageAccess;
-import org.apache.poi.openxml4j.opc.PackagePart;
-import org.apache.poi.openxml4j.opc.PackageRelationshipTypes;
-import org.apache.poi.util.PackageHelper;
+import org.apache.poi.xdgf.usermodel.XmlVisioDocument;
import org.junit.Test;
public class XDGFFileHandler extends AbstractFileHandler {
@@ -37,39 +29,19 @@ public class XDGFFileHandler extends AbstractFileHandler {
// ignore password protected files
if (POIXMLDocumentHandler.isEncrypted(stream)) return;
- TestXDGFXMLDocument doc = new TestXDGFXMLDocument(stream);
+ XmlVisioDocument doc = new XmlVisioDocument(stream);
new POIXMLDocumentHandler().handlePOIXMLDocument(doc);
}
-
- @Override
- public void handleExtracting(File file) throws Exception {
- // TODO: extraction/actual operations not supported yet
- }
-
+
// a test-case to test this locally without executing the full TestAllFiles
@Test
public void test() throws Exception {
OPCPackage pkg = OPCPackage.open("test-data/diagram/test.vsdx", PackageAccess.READ);
try {
- TestXDGFXMLDocument doc = new TestXDGFXMLDocument(pkg);
+ XmlVisioDocument doc = new XmlVisioDocument(pkg);
new POIXMLDocumentHandler().handlePOIXMLDocument(doc);
} finally {
pkg.close();
}
}
-
- // TODO: Get rid of this when full visio ooxml support is added
- private final static class TestXDGFXMLDocument extends POIXMLDocument {
- public TestXDGFXMLDocument(OPCPackage pkg) {
- super(pkg, PackageRelationshipTypes.VISIO_CORE_DOCUMENT);
- }
-
- public TestXDGFXMLDocument(InputStream is) throws IOException {
- this(PackageHelper.open(is));
- }
-
- public List<PackagePart> getAllEmbedds() throws OpenXML4JException {
- return new ArrayList<PackagePart>();
- }
- }
} \ No newline at end of file
diff --git a/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java b/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java
index 59ebeb10d0..906c026375 100644
--- a/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java
+++ b/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java
@@ -55,6 +55,7 @@ import org.apache.poi.poifs.filesystem.NotOLE2FileException;
import org.apache.poi.poifs.filesystem.OPOIFSFileSystem;
import org.apache.poi.poifs.filesystem.OfficeXmlFileException;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
+import org.apache.poi.xdgf.extractor.XDGFVisioExtractor;
import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor;
import org.apache.poi.xslf.usermodel.XSLFRelation;
import org.apache.poi.xslf.usermodel.XSLFSlideShow;
@@ -172,11 +173,9 @@ public class ExtractorFactory {
}
if (core.size() == 0) {
// Could it be a visio one?
- PackageRelationshipCollection visio =
- pkg.getRelationshipsByType(VISIO_DOCUMENT_REL);
- if (visio.size() == 1) {
- throw new IllegalArgumentException("Text extraction not supported for Visio OOXML files");
- }
+ core = pkg.getRelationshipsByType(VISIO_DOCUMENT_REL);
+ if (core.size() == 1)
+ return new XDGFVisioExtractor(pkg);
}
// Should just be a single core document, complain if not
diff --git a/src/ooxml/java/org/apache/poi/xdgf/extractor/XDGFVisioExtractor.java b/src/ooxml/java/org/apache/poi/xdgf/extractor/XDGFVisioExtractor.java
new file mode 100644
index 0000000000..c49c2121dc
--- /dev/null
+++ b/src/ooxml/java/org/apache/poi/xdgf/extractor/XDGFVisioExtractor.java
@@ -0,0 +1,93 @@
+/* ====================================================================
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+==================================================================== */
+package org.apache.poi.xdgf.extractor;
+
+import java.io.IOException;
+
+import org.apache.poi.POIXMLDocument;
+import org.apache.poi.POIXMLTextExtractor;
+import org.apache.poi.openxml4j.opc.OPCPackage;
+import org.apache.poi.xdgf.usermodel.XDGFPage;
+import org.apache.poi.xdgf.usermodel.XmlVisioDocument;
+import org.apache.poi.xdgf.usermodel.shape.ShapeTextVisitor;
+
+/**
+ * Helper class to extract text from an OOXML Visio File
+ */
+public class XDGFVisioExtractor extends POIXMLTextExtractor {
+
+    /** Document whose pages are walked by {@link #getText()}. */
+    protected final XmlVisioDocument document;
+
+    /**
+     * @param document already-constructed Visio OOXML document to extract from
+     */
+    public XDGFVisioExtractor(XmlVisioDocument document) {
+        super(document);
+        this.document = document;
+    }
+
+    /**
+     * Wraps the package in a new {@link XmlVisioDocument} and extracts from it.
+     *
+     * @param openPackage package containing the Visio document
+     * @throws IOException if the {@link XmlVisioDocument} cannot be created
+     */
+    public XDGFVisioExtractor(OPCPackage openPackage) throws IOException {
+        this(new XmlVisioDocument(openPackage));
+    }
+
+    /**
+     * Visits the shapes of every page with a single {@link ShapeTextVisitor}
+     * and returns the text it accumulated.
+     *
+     * @return the collected text of all pages
+     */
+    @Override
+    public String getText() {
+        ShapeTextVisitor visitor = new ShapeTextVisitor();
+
+        for (XDGFPage page : document.getPages()) {
+            page.getContent().visitShapes(visitor);
+        }
+
+        // ShapeTextVisitor.getText() already returns a String
+        return visitor.getText();
+    }
+
+    /**
+     * Command-line entry point: prints the text of the given file.
+     *
+     * @param args expects the path of the file to read as first element
+     * @throws IOException propagated from opening, reading or closing the file
+     */
+    public static void main(String [] args) throws IOException {
+        if (args.length < 1) {
+            System.err.println("Use:");
+            System.err.println(" XDGFVisioExtractor <filename.vsdx>");
+            System.exit(1);
+        }
+        POIXMLTextExtractor extractor =
+                new XDGFVisioExtractor(POIXMLDocument.openPackage(args[0]));
+        try {
+            System.out.println(extractor.getText());
+        } finally {
+            // close the extractor even when extraction fails
+            extractor.close();
+        }
+    }
+}
diff --git a/src/ooxml/java/org/apache/poi/xdgf/usermodel/XDGFDocument.java b/src/ooxml/java/org/apache/poi/xdgf/usermodel/XDGFDocument.java
index dc42fa192d..ba460cc13c 100644
--- a/src/ooxml/java/org/apache/poi/xdgf/usermodel/XDGFDocument.java
+++ b/src/ooxml/java/org/apache/poi/xdgf/usermodel/XDGFDocument.java
@@ -29,6 +29,9 @@ import com.microsoft.schemas.office.visio.x2012.main.VisioDocumentType;
/**
* Represents the root document: /visio/document.xml
+ *
+ * You're probably looking for {@link XmlVisioDocument}; this class
+ * only contains metadata about the root document in the OOXML package.
*/
public class XDGFDocument {
diff --git a/src/ooxml/java/org/apache/poi/xdgf/usermodel/XmlVisioDocument.java b/src/ooxml/java/org/apache/poi/xdgf/usermodel/XmlVisioDocument.java
index e5589c7aaa..8794874040 100644
--- a/src/ooxml/java/org/apache/poi/xdgf/usermodel/XmlVisioDocument.java
+++ b/src/ooxml/java/org/apache/poi/xdgf/usermodel/XmlVisioDocument.java
@@ -19,6 +19,7 @@ package org.apache.poi.xdgf.usermodel;
import java.io.IOException;
import java.io.InputStream;
+import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
@@ -107,15 +108,21 @@ public class XmlVisioDocument extends POIXMLDocument {
_pages.onDocumentRead();
}
+ /**
+ * Not currently implemented; always returns an empty list
+ */
@Override
public List<PackagePart> getAllEmbedds() throws OpenXML4JException {
- throw new UnsupportedOperationException("Not implemented");
+ return new ArrayList<PackagePart>();
}
//
// Useful public API goes here
//
+ /**
+ * @return pages ordered by page number
+ */
public Collection<XDGFPage> getPages() {
return _pages.getPageList();
}
diff --git a/src/ooxml/java/org/apache/poi/xdgf/usermodel/shape/ShapeTextVisitor.java b/src/ooxml/java/org/apache/poi/xdgf/usermodel/shape/ShapeTextVisitor.java
new file mode 100644
index 0000000000..4589bc8ad7
--- /dev/null
+++ b/src/ooxml/java/org/apache/poi/xdgf/usermodel/shape/ShapeTextVisitor.java
@@ -0,0 +1,71 @@
+/* ====================================================================
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+==================================================================== */
+package org.apache.poi.xdgf.usermodel.shape;
+
+import java.awt.geom.AffineTransform;
+
+import org.apache.poi.xdgf.usermodel.XDGFShape;
+
+/**
+ * Only visits shapes that have text, accumulating their text content
+ * into a single string.
+ *
+ * The text is returned in arbitrary order, with no regard to
+ * the location of the text on the page. This may change in the
+ * future.
+ */
+public class ShapeTextVisitor extends ShapeVisitor {
+
+    /** Text collected so far; each visited shape appends one entry. */
+    protected StringBuilder text = new StringBuilder();
+
+    /** Accepts only shapes for which {@link XDGFShape#hasText()} is true. */
+    public static class TextAcceptor implements ShapeVisitorAcceptor {
+        public boolean accept(XDGFShape shape) {
+            return shape.hasText();
+        }
+    }
+
+    /**
+     * @return a new {@link TextAcceptor}, restricting visits to shapes with text
+     */
+    @Override
+    protected ShapeVisitorAcceptor getAcceptor() {
+        return new TextAcceptor();
+    }
+
+    /**
+     * Appends the trimmed text content of the shape followed by a newline.
+     * The transform and level arguments are not used.
+     */
+    @Override
+    public void visit(XDGFShape shape, AffineTransform globalTransform,
+            int level) {
+        text.append(shape.getText().getTextContent().trim());
+        text.append('\n');
+    }
+
+    /**
+     * Call this after visitation has completed
+     *
+     * @return all text accumulated from the visited shapes
+     */
+    public String getText() {
+        return text.toString();
+    }
+
+}
diff --git a/src/ooxml/testcases/org/apache/poi/extractor/TestExtractorFactory.java b/src/ooxml/testcases/org/apache/poi/extractor/TestExtractorFactory.java
index 3d2d2a5b15..fba530757a 100644
--- a/src/ooxml/testcases/org/apache/poi/extractor/TestExtractorFactory.java
+++ b/src/ooxml/testcases/org/apache/poi/extractor/TestExtractorFactory.java
@@ -44,6 +44,7 @@ import org.apache.poi.openxml4j.exceptions.InvalidOperationException;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.openxml4j.opc.PackageAccess;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
+import org.apache.poi.xdgf.extractor.XDGFVisioExtractor;
import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor;
import org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor;
import org.apache.poi.xssf.extractor.XSSFExcelExtractor;
@@ -271,12 +272,13 @@ public class TestExtractorFactory {
ExtractorFactory.createExtractor(vsd).getText().length() > 50
);
// Visio - vsdx
- try {
- ExtractorFactory.createExtractor(vsdx);
- fail();
- } catch(IllegalArgumentException e) {
- // Good
- }
+ assertTrue(
+ ExtractorFactory.createExtractor(vsdx)
+ instanceof XDGFVisioExtractor
+ );
+ assertTrue(
+ ExtractorFactory.createExtractor(vsdx).getText().length() > 20
+ );
// Publisher
assertTrue(
@@ -391,13 +393,15 @@ public class TestExtractorFactory {
ExtractorFactory.createExtractor(new FileInputStream(vsd)).getText().length() > 50
);
// Visio - vsdx
- try {
- ExtractorFactory.createExtractor(new FileInputStream(vsdx));
- fail();
- } catch(IllegalArgumentException e) {
- // Good
- }
+ assertTrue(
+ ExtractorFactory.createExtractor(new FileInputStream(vsdx))
+ instanceof XDGFVisioExtractor
+ );
+ assertTrue(
+ ExtractorFactory.createExtractor(new FileInputStream(vsdx)).getText().length() > 20
+ );
+
// Publisher
assertTrue(
ExtractorFactory.createExtractor(new FileInputStream(pub))
@@ -551,6 +555,15 @@ public class TestExtractorFactory {
extractor.getText().length() > 120
);
extractor.close();
+
+        // Visio - keep the new extractor so its own text is checked, then close it
+        extractor = ExtractorFactory.createExtractor(OPCPackage.open(vsdx.toString()));
+        assertTrue(extractor instanceof XDGFVisioExtractor);
+        assertTrue(
+                extractor.getText().length() > 20
+        );
+        extractor.close();
+
// Text
try {
diff --git a/src/ooxml/testcases/org/apache/poi/xdgf/extractor/TestXDGFVisioExtractor.java b/src/ooxml/testcases/org/apache/poi/xdgf/extractor/TestXDGFVisioExtractor.java
new file mode 100644
index 0000000000..4c7459518e
--- /dev/null
+++ b/src/ooxml/testcases/org/apache/poi/xdgf/extractor/TestXDGFVisioExtractor.java
@@ -0,0 +1,65 @@
+/* ====================================================================
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+==================================================================== */
+package org.apache.poi.xdgf.extractor;
+
+import java.io.IOException;
+
+import org.apache.poi.POIDataSamples;
+import org.apache.poi.openxml4j.opc.OPCPackage;
+import org.apache.poi.xdgf.usermodel.XmlVisioDocument;
+
+import junit.framework.TestCase;
+
+/**
+ * Tests text extraction from the test_text_extraction.vsdx sample.
+ */
+public class TestXDGFVisioExtractor extends TestCase {
+
+    private POIDataSamples diagrams;
+    private OPCPackage pkg;
+    private XmlVisioDocument xml;
+
+    /** Opens the sample as both a package and a document before each test. */
+    @Override
+    protected void setUp() throws Exception {
+        diagrams = POIDataSamples.getDiagramInstance();
+
+        pkg = OPCPackage.open(diagrams.openResourceAsStream("test_text_extraction.vsdx"));
+        xml = new XmlVisioDocument(pkg);
+    }
+
+    /**
+     * Checks both constructors and the exact text returned for the sample.
+     */
+    public void testGetSimpleText() throws IOException {
+        // both constructors must be usable
+        new XDGFVisioExtractor(xml).close();
+        new XDGFVisioExtractor(pkg).close();
+
+        XDGFVisioExtractor extractor = new XDGFVisioExtractor(xml);
+        // result of the first call is discarded; the second call is the one checked
+        extractor.getText();
+
+        String text = extractor.getText();
+        assertTrue(text.length() > 0);
+
+        assertEquals("Text here\nText there\nText, text, everywhere!\nRouter here\n",
+                text);
+
+        extractor.close();
+    }
+}
diff --git a/test-data/diagram/test_text_extraction.vsdx b/test-data/diagram/test_text_extraction.vsdx
new file mode 100644
index 0000000000..39b6401772
--- /dev/null
+++ b/test-data/diagram/test_text_extraction.vsdx
Binary files differ