git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1709361 13f79535-47bb-0310-9956-ffa450edef68tags/REL_3_14_BETA1
@@ -105,7 +105,7 @@ public class TestAllFiles { | |||
// Visio - binary | |||
HANDLERS.put(".vsd", new HDGFFileHandler()); | |||
// Visio - ooxml (currently unsupported) | |||
// Visio - ooxml | |||
HANDLERS.put(".vsdm", new XDGFFileHandler()); | |||
HANDLERS.put(".vsdx", new XDGFFileHandler()); | |||
HANDLERS.put(".vssm", new XDGFFileHandler()); |
@@ -16,19 +16,11 @@ | |||
==================================================================== */ | |||
package org.apache.poi.stress; | |||
import java.io.File; | |||
import java.io.IOException; | |||
import java.io.InputStream; | |||
import java.util.ArrayList; | |||
import java.util.List; | |||
import org.apache.poi.POIXMLDocument; | |||
import org.apache.poi.openxml4j.exceptions.OpenXML4JException; | |||
import org.apache.poi.openxml4j.opc.OPCPackage; | |||
import org.apache.poi.openxml4j.opc.PackageAccess; | |||
import org.apache.poi.openxml4j.opc.PackagePart; | |||
import org.apache.poi.openxml4j.opc.PackageRelationshipTypes; | |||
import org.apache.poi.util.PackageHelper; | |||
import org.apache.poi.xdgf.usermodel.XmlVisioDocument; | |||
import org.junit.Test; | |||
public class XDGFFileHandler extends AbstractFileHandler { | |||
@@ -37,39 +29,19 @@ public class XDGFFileHandler extends AbstractFileHandler { | |||
// ignore password protected files | |||
if (POIXMLDocumentHandler.isEncrypted(stream)) return; | |||
TestXDGFXMLDocument doc = new TestXDGFXMLDocument(stream); | |||
XmlVisioDocument doc = new XmlVisioDocument(stream); | |||
new POIXMLDocumentHandler().handlePOIXMLDocument(doc); | |||
} | |||
@Override | |||
public void handleExtracting(File file) throws Exception { | |||
// TODO: extraction/actual operations not supported yet | |||
} | |||
// a test-case to test this locally without executing the full TestAllFiles | |||
@Test | |||
public void test() throws Exception { | |||
OPCPackage pkg = OPCPackage.open("test-data/diagram/test.vsdx", PackageAccess.READ); | |||
try { | |||
TestXDGFXMLDocument doc = new TestXDGFXMLDocument(pkg); | |||
XmlVisioDocument doc = new XmlVisioDocument(pkg); | |||
new POIXMLDocumentHandler().handlePOIXMLDocument(doc); | |||
} finally { | |||
pkg.close(); | |||
} | |||
} | |||
// TODO: Get rid of this when full visio ooxml support is added | |||
private final static class TestXDGFXMLDocument extends POIXMLDocument { | |||
public TestXDGFXMLDocument(OPCPackage pkg) { | |||
super(pkg, PackageRelationshipTypes.VISIO_CORE_DOCUMENT); | |||
} | |||
public TestXDGFXMLDocument(InputStream is) throws IOException { | |||
this(PackageHelper.open(is)); | |||
} | |||
public List<PackagePart> getAllEmbedds() throws OpenXML4JException { | |||
return new ArrayList<PackagePart>(); | |||
} | |||
} | |||
} |
@@ -55,6 +55,7 @@ import org.apache.poi.poifs.filesystem.NotOLE2FileException; | |||
import org.apache.poi.poifs.filesystem.OPOIFSFileSystem; | |||
import org.apache.poi.poifs.filesystem.OfficeXmlFileException; | |||
import org.apache.poi.poifs.filesystem.POIFSFileSystem; | |||
import org.apache.poi.xdgf.extractor.XDGFVisioExtractor; | |||
import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor; | |||
import org.apache.poi.xslf.usermodel.XSLFRelation; | |||
import org.apache.poi.xslf.usermodel.XSLFSlideShow; | |||
@@ -172,11 +173,9 @@ public class ExtractorFactory { | |||
} | |||
if (core.size() == 0) { | |||
// Could it be a visio one? | |||
PackageRelationshipCollection visio = | |||
pkg.getRelationshipsByType(VISIO_DOCUMENT_REL); | |||
if (visio.size() == 1) { | |||
throw new IllegalArgumentException("Text extraction not supported for Visio OOXML files"); | |||
} | |||
core = pkg.getRelationshipsByType(VISIO_DOCUMENT_REL); | |||
if (core.size() == 1) | |||
return new XDGFVisioExtractor(pkg); | |||
} | |||
// Should just be a single core document, complain if not |
@@ -0,0 +1,51 @@ | |||
package org.apache.poi.xdgf.extractor; | |||
import java.io.IOException; | |||
import org.apache.poi.POIXMLDocument; | |||
import org.apache.poi.POIXMLTextExtractor; | |||
import org.apache.poi.openxml4j.opc.OPCPackage; | |||
import org.apache.poi.xdgf.usermodel.XDGFPage; | |||
import org.apache.poi.xdgf.usermodel.XmlVisioDocument; | |||
import org.apache.poi.xdgf.usermodel.shape.ShapeTextVisitor; | |||
/** | |||
* Helper class to extract text from an OOXML Visio File | |||
*/ | |||
public class XDGFVisioExtractor extends POIXMLTextExtractor { | |||
protected final XmlVisioDocument document; | |||
public XDGFVisioExtractor(XmlVisioDocument document) { | |||
super(document); | |||
this.document = document; | |||
} | |||
public XDGFVisioExtractor(OPCPackage openPackage) throws IOException { | |||
this(new XmlVisioDocument(openPackage)); | |||
} | |||
public String getText() { | |||
ShapeTextVisitor visitor = new ShapeTextVisitor(); | |||
for (XDGFPage page: document.getPages()) { | |||
page.getContent().visitShapes(visitor); | |||
} | |||
return visitor.getText().toString(); | |||
} | |||
public static void main(String [] args) throws IOException { | |||
if (args.length < 1) { | |||
System.err.println("Use:"); | |||
System.err.println(" XDGFVisioExtractor <filename.vsdx>"); | |||
System.exit(1); | |||
} | |||
POIXMLTextExtractor extractor = | |||
new XDGFVisioExtractor(POIXMLDocument.openPackage( | |||
args[0] | |||
)); | |||
System.out.println(extractor.getText()); | |||
extractor.close(); | |||
} | |||
} |
@@ -29,6 +29,9 @@ import com.microsoft.schemas.office.visio.x2012.main.VisioDocumentType; | |||
/** | |||
* Represents the root document: /visio/document.xml | |||
* | |||
* You're probably actually looking for {@link XmlVisioDocument}, this | |||
* only contains metadata about the root document in the OOXML package. | |||
*/ | |||
public class XDGFDocument { | |||
@@ -19,6 +19,7 @@ package org.apache.poi.xdgf.usermodel; | |||
import java.io.IOException; | |||
import java.io.InputStream; | |||
import java.util.ArrayList; | |||
import java.util.Collection; | |||
import java.util.List; | |||
@@ -107,15 +108,21 @@ public class XmlVisioDocument extends POIXMLDocument { | |||
_pages.onDocumentRead(); | |||
} | |||
/** | |||
* Not currently implemented | |||
*/ | |||
@Override | |||
public List<PackagePart> getAllEmbedds() throws OpenXML4JException { | |||
throw new UnsupportedOperationException("Not implemented"); | |||
return new ArrayList<PackagePart>(); | |||
} | |||
// | |||
// Useful public API goes here | |||
// | |||
/** | |||
* @return pages ordered by page number | |||
*/ | |||
public Collection<XDGFPage> getPages() { | |||
return _pages.getPageList(); | |||
} |
@@ -0,0 +1,41 @@ | |||
package org.apache.poi.xdgf.usermodel.shape; | |||
import java.awt.geom.AffineTransform; | |||
import org.apache.poi.xdgf.usermodel.XDGFShape; | |||
/** | |||
* Only visits text nodes, accumulates text content into a string | |||
* | |||
* The text is returned in arbitrary order, with no regards to | |||
* the location of the text on the page. This may change in the | |||
* future. | |||
*/ | |||
public class ShapeTextVisitor extends ShapeVisitor { | |||
protected StringBuilder text = new StringBuilder(); | |||
public static class TextAcceptor implements ShapeVisitorAcceptor { | |||
public boolean accept(XDGFShape shape) { | |||
return shape.hasText(); | |||
} | |||
} | |||
protected ShapeVisitorAcceptor getAcceptor() { | |||
return new TextAcceptor(); | |||
} | |||
public void visit(XDGFShape shape, AffineTransform globalTransform, | |||
int level) { | |||
text.append(shape.getText().getTextContent().trim()); | |||
text.append('\n'); | |||
} | |||
/** | |||
* Call this after visitation has completed | |||
*/ | |||
public String getText() { | |||
return text.toString(); | |||
} | |||
} |
@@ -44,6 +44,7 @@ import org.apache.poi.openxml4j.exceptions.InvalidOperationException; | |||
import org.apache.poi.openxml4j.opc.OPCPackage; | |||
import org.apache.poi.openxml4j.opc.PackageAccess; | |||
import org.apache.poi.poifs.filesystem.POIFSFileSystem; | |||
import org.apache.poi.xdgf.extractor.XDGFVisioExtractor; | |||
import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor; | |||
import org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor; | |||
import org.apache.poi.xssf.extractor.XSSFExcelExtractor; | |||
@@ -271,12 +272,13 @@ public class TestExtractorFactory { | |||
ExtractorFactory.createExtractor(vsd).getText().length() > 50 | |||
); | |||
// Visio - vsdx | |||
try { | |||
ExtractorFactory.createExtractor(vsdx); | |||
fail(); | |||
} catch(IllegalArgumentException e) { | |||
// Good | |||
} | |||
assertTrue( | |||
ExtractorFactory.createExtractor(vsdx) | |||
instanceof XDGFVisioExtractor | |||
); | |||
assertTrue( | |||
ExtractorFactory.createExtractor(vsdx).getText().length() > 20 | |||
); | |||
// Publisher | |||
assertTrue( | |||
@@ -391,13 +393,15 @@ public class TestExtractorFactory { | |||
ExtractorFactory.createExtractor(new FileInputStream(vsd)).getText().length() > 50 | |||
); | |||
// Visio - vsdx | |||
try { | |||
ExtractorFactory.createExtractor(new FileInputStream(vsdx)); | |||
fail(); | |||
} catch(IllegalArgumentException e) { | |||
// Good | |||
} | |||
assertTrue( | |||
ExtractorFactory.createExtractor(new FileInputStream(vsdx)) | |||
instanceof XDGFVisioExtractor | |||
); | |||
assertTrue( | |||
ExtractorFactory.createExtractor(new FileInputStream(vsdx)).getText().length() > 20 | |||
); | |||
// Publisher | |||
assertTrue( | |||
ExtractorFactory.createExtractor(new FileInputStream(pub)) | |||
@@ -551,6 +555,15 @@ public class TestExtractorFactory { | |||
extractor.getText().length() > 120 | |||
); | |||
extractor.close(); | |||
// Visio
// Bind the freshly created extractor to 'extractor' before asserting on
// it: the previous extractor held by that variable has already been
// closed, so reading text from it would not test the Visio file at all.
extractor = ExtractorFactory.createExtractor(OPCPackage.open(vsdx.toString()));
assertTrue(
        extractor
        instanceof XDGFVisioExtractor
);
assertTrue(
        extractor.getText().length() > 20
);
extractor.close();
// Text | |||
try { |
@@ -0,0 +1,39 @@ | |||
package org.apache.poi.xdgf.extractor; | |||
import java.io.IOException; | |||
import org.apache.poi.POIDataSamples; | |||
import org.apache.poi.openxml4j.opc.OPCPackage; | |||
import org.apache.poi.xdgf.usermodel.XmlVisioDocument; | |||
import junit.framework.TestCase; | |||
public class TestXDGFVisioExtractor extends TestCase { | |||
private POIDataSamples diagrams; | |||
private OPCPackage pkg; | |||
private XmlVisioDocument xml; | |||
protected void setUp() throws Exception { | |||
diagrams = POIDataSamples.getDiagramInstance(); | |||
pkg = OPCPackage.open(diagrams.openResourceAsStream("test_text_extraction.vsdx")); | |||
xml = new XmlVisioDocument(pkg); | |||
} | |||
public void testGetSimpleText() throws IOException { | |||
new XDGFVisioExtractor(xml).close(); | |||
new XDGFVisioExtractor(pkg).close(); | |||
XDGFVisioExtractor extractor = new XDGFVisioExtractor(xml); | |||
extractor.getText(); | |||
String text = extractor.getText(); | |||
assertTrue(text.length() > 0); | |||
assertEquals("Text here\nText there\nText, text, everywhere!\nRouter here\n", | |||
text); | |||
extractor.close(); | |||
} | |||
} |