Browse Source

Add Visio OOXML text extractor + tests

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1709361 13f79535-47bb-0310-9956-ffa450edef68
tags/REL_3_14_BETA1
Dustin Spicuzza 8 years ago
parent
commit
bc6ee96e1a

+ 1
- 1
src/integrationtest/org/apache/poi/TestAllFiles.java View File

@@ -105,7 +105,7 @@ public class TestAllFiles {
// Visio - binary
HANDLERS.put(".vsd", new HDGFFileHandler());
// Visio - ooxml (currently unsupported)
// Visio - ooxml
HANDLERS.put(".vsdm", new XDGFFileHandler());
HANDLERS.put(".vsdx", new XDGFFileHandler());
HANDLERS.put(".vssm", new XDGFFileHandler());

+ 4
- 32
src/integrationtest/org/apache/poi/stress/XDGFFileHandler.java View File

@@ -16,19 +16,11 @@
==================================================================== */
package org.apache.poi.stress;

import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;

import org.apache.poi.POIXMLDocument;
import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.openxml4j.opc.PackageAccess;
import org.apache.poi.openxml4j.opc.PackagePart;
import org.apache.poi.openxml4j.opc.PackageRelationshipTypes;
import org.apache.poi.util.PackageHelper;
import org.apache.poi.xdgf.usermodel.XmlVisioDocument;
import org.junit.Test;

public class XDGFFileHandler extends AbstractFileHandler {
@@ -37,39 +29,19 @@ public class XDGFFileHandler extends AbstractFileHandler {
// ignore password protected files
if (POIXMLDocumentHandler.isEncrypted(stream)) return;

TestXDGFXMLDocument doc = new TestXDGFXMLDocument(stream);
XmlVisioDocument doc = new XmlVisioDocument(stream);
new POIXMLDocumentHandler().handlePOIXMLDocument(doc);
}

@Override
public void handleExtracting(File file) throws Exception {
// TODO: extraction/actual operations not supported yet
}

// a test-case to test this locally without executing the full TestAllFiles
@Test
public void test() throws Exception {
OPCPackage pkg = OPCPackage.open("test-data/diagram/test.vsdx", PackageAccess.READ);
try {
TestXDGFXMLDocument doc = new TestXDGFXMLDocument(pkg);
XmlVisioDocument doc = new XmlVisioDocument(pkg);
new POIXMLDocumentHandler().handlePOIXMLDocument(doc);
} finally {
pkg.close();
}
}

// TODO: Get rid of this when full visio ooxml support is added
private final static class TestXDGFXMLDocument extends POIXMLDocument {
public TestXDGFXMLDocument(OPCPackage pkg) {
super(pkg, PackageRelationshipTypes.VISIO_CORE_DOCUMENT);
}

public TestXDGFXMLDocument(InputStream is) throws IOException {
this(PackageHelper.open(is));
}

public List<PackagePart> getAllEmbedds() throws OpenXML4JException {
return new ArrayList<PackagePart>();
}
}
}

+ 4
- 5
src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java View File

@@ -55,6 +55,7 @@ import org.apache.poi.poifs.filesystem.NotOLE2FileException;
import org.apache.poi.poifs.filesystem.OPOIFSFileSystem;
import org.apache.poi.poifs.filesystem.OfficeXmlFileException;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.xdgf.extractor.XDGFVisioExtractor;
import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor;
import org.apache.poi.xslf.usermodel.XSLFRelation;
import org.apache.poi.xslf.usermodel.XSLFSlideShow;
@@ -172,11 +173,9 @@ public class ExtractorFactory {
}
if (core.size() == 0) {
// Could it be a visio one?
PackageRelationshipCollection visio =
pkg.getRelationshipsByType(VISIO_DOCUMENT_REL);
if (visio.size() == 1) {
throw new IllegalArgumentException("Text extraction not supported for Visio OOXML files");
}
core = pkg.getRelationshipsByType(VISIO_DOCUMENT_REL);
if (core.size() == 1)
return new XDGFVisioExtractor(pkg);
}
// Should just be a single core document, complain if not

+ 51
- 0
src/ooxml/java/org/apache/poi/xdgf/extractor/XDGFVisioExtractor.java View File

@@ -0,0 +1,51 @@
package org.apache.poi.xdgf.extractor;

import java.io.IOException;

import org.apache.poi.POIXMLDocument;
import org.apache.poi.POIXMLTextExtractor;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.xdgf.usermodel.XDGFPage;
import org.apache.poi.xdgf.usermodel.XmlVisioDocument;
import org.apache.poi.xdgf.usermodel.shape.ShapeTextVisitor;

/**
* Helper class to extract text from an OOXML Visio File
*/
public class XDGFVisioExtractor extends POIXMLTextExtractor {

protected final XmlVisioDocument document;
public XDGFVisioExtractor(XmlVisioDocument document) {
super(document);
this.document = document;
}

public XDGFVisioExtractor(OPCPackage openPackage) throws IOException {
this(new XmlVisioDocument(openPackage));
}

public String getText() {
ShapeTextVisitor visitor = new ShapeTextVisitor();
for (XDGFPage page: document.getPages()) {
page.getContent().visitShapes(visitor);
}
return visitor.getText().toString();
}
public static void main(String [] args) throws IOException {
if (args.length < 1) {
System.err.println("Use:");
System.err.println(" XDGFVisioExtractor <filename.vsdx>");
System.exit(1);
}
POIXMLTextExtractor extractor =
new XDGFVisioExtractor(POIXMLDocument.openPackage(
args[0]
));
System.out.println(extractor.getText());
extractor.close();
}
}

+ 3
- 0
src/ooxml/java/org/apache/poi/xdgf/usermodel/XDGFDocument.java View File

@@ -29,6 +29,9 @@ import com.microsoft.schemas.office.visio.x2012.main.VisioDocumentType;

/**
* Represents the root document: /visio/document.xml
*
* You're probably actually looking for {@link XmlVisioDocument}; this
* class only contains metadata about the root document in the OOXML package.
*/
public class XDGFDocument {


+ 8
- 1
src/ooxml/java/org/apache/poi/xdgf/usermodel/XmlVisioDocument.java View File

@@ -19,6 +19,7 @@ package org.apache.poi.xdgf.usermodel;

import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;

@@ -107,15 +108,21 @@ public class XmlVisioDocument extends POIXMLDocument {
_pages.onDocumentRead();
}

/**
* Not currently implemented
*/
@Override
public List<PackagePart> getAllEmbedds() throws OpenXML4JException {
throw new UnsupportedOperationException("Not implemented");
return new ArrayList<PackagePart>();
}

//
// Useful public API goes here
//
/**
 * Exposes the page list held by this document's page collection.
 *
 * @return pages ordered by page number
 */
public Collection<XDGFPage> getPages() {
    final Collection<XDGFPage> pageList = _pages.getPageList();
    return pageList;
}

+ 41
- 0
src/ooxml/java/org/apache/poi/xdgf/usermodel/shape/ShapeTextVisitor.java View File

@@ -0,0 +1,41 @@
package org.apache.poi.xdgf.usermodel.shape;

import java.awt.geom.AffineTransform;

import org.apache.poi.xdgf.usermodel.XDGFShape;

/**
 * Visitor that is only applied to shapes that have text and that
 * accumulates their text content into a single string, one line
 * per visited shape.
 *
 * The text is returned in arbitrary order, with no regard to
 * the location of the text on the page. This may change in the
 * future.
 */
public class ShapeTextVisitor extends ShapeVisitor {

    /** Buffer receiving the trimmed text of each visited shape plus a newline. */
    protected StringBuilder text = new StringBuilder();

    /**
     * Acceptor that lets a shape through exactly when the shape
     * reports that it has text.
     */
    public static class TextAcceptor implements ShapeVisitorAcceptor {
        public boolean accept(XDGFShape shape) {
            return shape.hasText();
        }
    }

    /**
     * @return a new {@link TextAcceptor} on every call
     */
    protected ShapeVisitorAcceptor getAcceptor() {
        final ShapeVisitorAcceptor textOnly = new TextAcceptor();
        return textOnly;
    }

    /**
     * Appends the shape's trimmed text content, followed by a newline,
     * to the buffer. The transform and level arguments are not used.
     */
    public void visit(XDGFShape shape, AffineTransform globalTransform,
            int level) {
        text.append(shape.getText().getTextContent().trim()).append('\n');
    }

    /**
     * Call this after visitation has completed.
     *
     * @return everything accumulated in the buffer so far
     */
    public String getText() {
        final String collected = text.toString();
        return collected;
    }

}

+ 25
- 12
src/ooxml/testcases/org/apache/poi/extractor/TestExtractorFactory.java View File

@@ -44,6 +44,7 @@ import org.apache.poi.openxml4j.exceptions.InvalidOperationException;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.openxml4j.opc.PackageAccess;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.xdgf.extractor.XDGFVisioExtractor;
import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor;
import org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor;
import org.apache.poi.xssf.extractor.XSSFExcelExtractor;
@@ -271,12 +272,13 @@ public class TestExtractorFactory {
ExtractorFactory.createExtractor(vsd).getText().length() > 50
);
// Visio - vsdx
try {
ExtractorFactory.createExtractor(vsdx);
fail();
} catch(IllegalArgumentException e) {
// Good
}
assertTrue(
ExtractorFactory.createExtractor(vsdx)
instanceof XDGFVisioExtractor
);
assertTrue(
ExtractorFactory.createExtractor(vsdx).getText().length() > 20
);

// Publisher
assertTrue(
@@ -391,13 +393,15 @@ public class TestExtractorFactory {
ExtractorFactory.createExtractor(new FileInputStream(vsd)).getText().length() > 50
);
// Visio - vsdx
try {
ExtractorFactory.createExtractor(new FileInputStream(vsdx));
fail();
} catch(IllegalArgumentException e) {
// Good
}
assertTrue(
ExtractorFactory.createExtractor(new FileInputStream(vsdx))
instanceof XDGFVisioExtractor
);
assertTrue(
ExtractorFactory.createExtractor(new FileInputStream(vsdx)).getText().length() > 20
);

// Publisher
assertTrue(
ExtractorFactory.createExtractor(new FileInputStream(pub))
@@ -551,6 +555,15 @@ public class TestExtractorFactory {
extractor.getText().length() > 120
);
extractor.close();
// Visio
assertTrue(
        ExtractorFactory.createExtractor(OPCPackage.open(vsdx.toString()))
        instanceof XDGFVisioExtractor
);
// Check the text of a freshly created Visio extractor; the local
// 'extractor' variable was closed above and belongs to the previous format.
assertTrue(
        ExtractorFactory.createExtractor(OPCPackage.open(vsdx.toString())).getText().length() > 20
);

// Text
try {

+ 39
- 0
src/ooxml/testcases/org/apache/poi/xdgf/extractor/TestXDGFVisioExtractor.java View File

@@ -0,0 +1,39 @@
package org.apache.poi.xdgf.extractor;

import java.io.IOException;

import org.apache.poi.POIDataSamples;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.xdgf.usermodel.XmlVisioDocument;

import junit.framework.TestCase;

/**
 * Exercises {@link XDGFVisioExtractor} against the
 * {@code test_text_extraction.vsdx} sample diagram.
 */
public class TestXDGFVisioExtractor extends TestCase {

    /** Name of the sample diagram opened in {@link #setUp()}. */
    private static final String SAMPLE_FILE = "test_text_extraction.vsdx";

    /** Exact text the extractor is expected to produce for the sample. */
    private static final String EXPECTED_TEXT =
            "Text here\nText there\nText, text, everywhere!\nRouter here\n";

    private POIDataSamples diagrams;
    private OPCPackage pkg;
    private XmlVisioDocument xml;

    /**
     * Opens the sample diagram as a package and wraps it in a document
     * before each test.
     */
    protected void setUp() throws Exception {
        diagrams = POIDataSamples.getDiagramInstance();
        pkg = OPCPackage.open(diagrams.openResourceAsStream(SAMPLE_FILE));
        xml = new XmlVisioDocument(pkg);
    }

    /**
     * Constructs and closes an extractor through each constructor, then
     * checks the extracted text of a third extractor.
     */
    public void testGetSimpleText() throws IOException {
        // construct via the document and via the package, closing each immediately
        XDGFVisioExtractor fromDocument = new XDGFVisioExtractor(xml);
        fromDocument.close();
        XDGFVisioExtractor fromPackage = new XDGFVisioExtractor(pkg);
        fromPackage.close();

        XDGFVisioExtractor extractor = new XDGFVisioExtractor(xml);
        // getText() is called twice; the first result is discarded
        extractor.getText();
        String extracted = extractor.getText();

        assertTrue(extracted.length() > 0);
        assertEquals(EXPECTED_TEXT, extracted);

        extractor.close();
    }
}

BIN
test-data/diagram/test_text_extraction.vsdx View File


Loading…
Cancel
Save