git-svn-id: https://svn.apache.org/repos/asf/poi/branches/ooxml@646298 13f79535-47bb-0310-9956-ffa450edef68tags/REL_3_5_BETA2
@@ -32,9 +32,6 @@ import org.openxml4j.opc.PackageRelationship; | |||
import org.openxml4j.opc.PackageRelationshipCollection; | |||
import org.openxml4j.opc.PackageRelationshipTypes; | |||
import org.openxml4j.opc.PackagingURIHelper; | |||
import org.openxml4j.opc.internal.PackagePropertiesPart; | |||
import org.openxmlformats.schemas.officeDocument.x2006.extendedProperties.CTProperties; | |||
import org.openxmlformats.schemas.officeDocument.x2006.extendedProperties.PropertiesDocument; | |||
public abstract class POIXMLDocument { | |||
@@ -48,6 +45,12 @@ public abstract class POIXMLDocument { | |||
/** The OPC core Package Part */ | |||
private PackagePart corePart; | |||
/** | |||
* The properties of the OPC package, opened as needed | |||
*/ | |||
private POIXMLProperties properties; | |||
protected POIXMLDocument() {} | |||
protected POIXMLDocument(Package pkg) throws IOException { | |||
@@ -178,28 +181,13 @@ public abstract class POIXMLDocument { | |||
} | |||
/** | |||
* Get the core document properties (core ooxml properties). | |||
* TODO: Replace with nice usermodel wrapper | |||
* @deprecated To be replaced with a proper user-model style view of the properties | |||
* Get the document properties. This gives you access to the | |||
* core ooxml properties, and the extended ooxml properties. | |||
*/ | |||
public PackagePropertiesPart getCoreProperties() throws OpenXML4JException, IOException { | |||
PackagePart propsPart = getSinglePartByRelationType(CORE_PROPERTIES_REL_TYPE); | |||
if(propsPart == null) { | |||
return null; | |||
public POIXMLProperties getProperties() throws OpenXML4JException, IOException, XmlException { | |||
if(properties == null) { | |||
properties = new POIXMLProperties(pkg); | |||
} | |||
return (PackagePropertiesPart)propsPart; | |||
} | |||
/** | |||
* Get the extended document properties (extended ooxml properties) | |||
* TODO: Replace with nice usermodel wrapper | |||
* @deprecated To be replaced with a proper user-model style view of the properties | |||
*/ | |||
public CTProperties getExtendedProperties() throws OpenXML4JException, XmlException, IOException { | |||
PackagePart propsPart = getSinglePartByRelationType(EXTENDED_PROPERTIES_REL_TYPE); | |||
PropertiesDocument props = PropertiesDocument.Factory.parse( | |||
propsPart.getInputStream()); | |||
return props.getProperties(); | |||
return properties; | |||
} | |||
} |
@@ -0,0 +1,124 @@ | |||
/* ==================================================================== | |||
Licensed to the Apache Software Foundation (ASF) under one or more | |||
contributor license agreements. See the NOTICE file distributed with | |||
this work for additional information regarding copyright ownership. | |||
The ASF licenses this file to You under the Apache License, Version 2.0 | |||
(the "License"); you may not use this file except in compliance with | |||
the License. You may obtain a copy of the License at | |||
http://www.apache.org/licenses/LICENSE-2.0 | |||
Unless required by applicable law or agreed to in writing, software | |||
distributed under the License is distributed on an "AS IS" BASIS, | |||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
See the License for the specific language governing permissions and | |||
limitations under the License. | |||
==================================================================== */ | |||
package org.apache.poi; | |||
import java.io.IOException; | |||
import org.apache.xmlbeans.XmlException; | |||
import org.openxml4j.exceptions.OpenXML4JException; | |||
import org.openxml4j.opc.Package; | |||
import org.openxml4j.opc.PackageRelationshipCollection; | |||
import org.openxml4j.opc.internal.PackagePropertiesPart; | |||
import org.openxmlformats.schemas.officeDocument.x2006.extendedProperties.CTProperties; | |||
import org.openxmlformats.schemas.officeDocument.x2006.extendedProperties.PropertiesDocument; | |||
/** | |||
* Wrapper around the two different kinds of OOXML properties | |||
* a document can have | |||
*/ | |||
public class POIXMLProperties { | |||
private Package pkg; | |||
private CoreProperties core; | |||
private ExtendedProperties ext; | |||
public POIXMLProperties(Package docPackage) throws IOException, OpenXML4JException, XmlException { | |||
this.pkg = docPackage; | |||
// Core properties | |||
PackageRelationshipCollection coreRel = | |||
pkg.getRelationshipsByType(POIXMLDocument.CORE_PROPERTIES_REL_TYPE); | |||
if(coreRel.size() == 1) { | |||
core = new CoreProperties( (PackagePropertiesPart) | |||
pkg.getPart(coreRel.getRelationship(0)) ); | |||
} else { | |||
throw new IllegalArgumentException("A document must always have core properties defined!"); | |||
} | |||
// Extended properties | |||
PackageRelationshipCollection extRel = | |||
pkg.getRelationshipsByType(POIXMLDocument.EXTENDED_PROPERTIES_REL_TYPE); | |||
if(extRel.size() == 1) { | |||
PropertiesDocument props = PropertiesDocument.Factory.parse( | |||
pkg.getPart( extRel.getRelationship(0) ).getInputStream() | |||
); | |||
ext = new ExtendedProperties(props); | |||
} else { | |||
ext = new ExtendedProperties(PropertiesDocument.Factory.newInstance()); | |||
} | |||
} | |||
/** | |||
* Returns the core document properties | |||
*/ | |||
public CoreProperties getCoreProperties() { | |||
return core; | |||
} | |||
/** | |||
* Returns the extended document properties | |||
*/ | |||
public ExtendedProperties getExtendedProperties() { | |||
return ext; | |||
} | |||
/** | |||
* Writes out the ooxml properties into the supplied, | |||
* new Package | |||
*/ | |||
public void write(Package pkg) { | |||
// TODO | |||
} | |||
/** | |||
* The core document properties | |||
*/ | |||
public class CoreProperties { | |||
private PackagePropertiesPart part; | |||
private CoreProperties(PackagePropertiesPart part) { | |||
this.part = part; | |||
} | |||
public void setTitle(String title) { | |||
part.setTitleProperty(title); | |||
} | |||
public String getTitle() { | |||
return part.getTitleProperty().getValue(); | |||
} | |||
public PackagePropertiesPart getUnderlyingProperties() { | |||
return part; | |||
} | |||
} | |||
/** | |||
* Extended document properties | |||
*/ | |||
public class ExtendedProperties { | |||
private PropertiesDocument props; | |||
private ExtendedProperties(PropertiesDocument props) { | |||
this.props = props; | |||
if(props.getProperties() == null) { | |||
props.addNewProperties(); | |||
} | |||
} | |||
public CTProperties getUnderlyingProperties() { | |||
return props.getProperties(); | |||
} | |||
} | |||
} |
@@ -16,6 +16,12 @@ | |||
==================================================================== */ | |||
package org.apache.poi; | |||
import java.io.IOException; | |||
import org.apache.poi.POIXMLProperties.*; | |||
import org.apache.xmlbeans.XmlException; | |||
import org.openxml4j.exceptions.OpenXML4JException; | |||
public abstract class POIXMLTextExtractor extends POITextExtractor { | |||
/** The POIXMLDocument that's open */ | |||
protected POIXMLDocument document; | |||
@@ -28,4 +34,17 @@ public abstract class POIXMLTextExtractor extends POITextExtractor { | |||
this.document = document; | |||
} | |||
/** | |||
* Returns the core document properties | |||
*/ | |||
public CoreProperties getCoreProperties() throws IOException, OpenXML4JException, XmlException { | |||
return document.getProperties().getCoreProperties(); | |||
} | |||
/** | |||
* Returns the extended document properties | |||
*/ | |||
public ExtendedProperties getExtendedProperties() throws IOException, OpenXML4JException, XmlException { | |||
return document.getProperties().getExtendedProperties(); | |||
} | |||
} |
@@ -24,6 +24,7 @@ import org.openxml4j.exceptions.InvalidFormatException; | |||
import org.openxml4j.exceptions.OpenXML4JException; | |||
import org.openxml4j.opc.Package; | |||
import org.openxml4j.opc.PackagePart; | |||
import org.openxml4j.opc.PackageRelationshipCollection; | |||
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTBody; | |||
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTDocument1; | |||
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTStyles; | |||
@@ -47,6 +48,7 @@ public class XWPFDocument extends POIXMLDocument { | |||
public static final String HEADER_CONTENT_TYPE = "application/vnd.openxmlformats-officedocument.wordprocessingml.header+xml"; | |||
public static final String STYLES_CONTENT_TYPE = "application/vnd.openxmlformats-officedocument.wordprocessingml.styles+xml"; | |||
public static final String STYLES_RELATION_TYPE = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/styles"; | |||
public static final String HYPERLINK_RELATION_TYPE = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/hyperlink"; | |||
private DocumentDocument wordDoc; | |||
@@ -89,4 +91,18 @@ public class XWPFDocument extends POIXMLDocument { | |||
StylesDocument.Factory.parse(parts[0].getInputStream()); | |||
return sd.getStyles(); | |||
} | |||
/** | |||
* Returns all the hyperlink relations for the file. | |||
* You'll generally want to get the target to get | |||
* the destination of the hyperlink | |||
*/ | |||
public PackageRelationshipCollection getHyperlinks() { | |||
try { | |||
return getCorePart().getRelationshipsByType(HYPERLINK_RELATION_TYPE); | |||
} catch(InvalidFormatException e) { | |||
// Should never happen | |||
throw new IllegalStateException(e); | |||
} | |||
} | |||
} |
@@ -16,7 +16,6 @@ | |||
==================================================================== */ | |||
package org.apache.poi.xwpf.extractor; | |||
import java.io.File; | |||
import java.io.IOException; | |||
import org.apache.poi.POIXMLDocument; | |||
@@ -25,7 +24,9 @@ import org.apache.poi.xwpf.XWPFDocument; | |||
import org.apache.xmlbeans.XmlException; | |||
import org.openxml4j.exceptions.OpenXML4JException; | |||
import org.openxml4j.opc.Package; | |||
import org.openxml4j.opc.PackageRelationship; | |||
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTBody; | |||
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTHyperlink; | |||
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTP; | |||
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTR; | |||
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTText; | |||
@@ -35,6 +36,7 @@ import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTText; | |||
*/ | |||
public class XWPFWordExtractor extends POIXMLTextExtractor { | |||
private XWPFDocument document; | |||
private boolean fetchHyperlinks = false; | |||
public XWPFWordExtractor(Package container) throws XmlException, OpenXML4JException, IOException { | |||
this(new XWPFDocument(container)); | |||
@@ -56,6 +58,15 @@ public class XWPFWordExtractor extends POIXMLTextExtractor { | |||
)); | |||
System.out.println(extractor.getText()); | |||
} | |||
/** | |||
* Should we also fetch the hyperlinks, when fetching | |||
* the text content? Default is to only output the | |||
* hyperlink label, and not the contents | |||
*/ | |||
public void setFetchHyperlinks(boolean fetch) { | |||
fetchHyperlinks = fetch; | |||
} | |||
public String getText() { | |||
CTBody body = document.getDocumentBody(); | |||
@@ -64,9 +75,10 @@ public class XWPFWordExtractor extends POIXMLTextExtractor { | |||
// Loop over paragraphs | |||
CTP[] ps = body.getPArray(); | |||
for (int i = 0; i < ps.length; i++) { | |||
// Loop over ranges | |||
// Loop over ranges and hyperlinks | |||
// TODO - properly intersperse ranges and hyperlinks | |||
CTR[] rs = ps[i].getRArray(); | |||
for (int j = 0; j < rs.length; j++) { | |||
for(int j = 0; j < rs.length; j++) { | |||
// Loop over text runs | |||
CTText[] texts = rs[j].getTArray(); | |||
for (int k = 0; k < texts.length; k++) { | |||
@@ -75,6 +87,26 @@ public class XWPFWordExtractor extends POIXMLTextExtractor { | |||
); | |||
} | |||
} | |||
CTHyperlink[] hls = ps[i].getHyperlinkArray(); | |||
for(CTHyperlink hl : hls) { | |||
for(CTR r : hl.getRArray()) { | |||
for(CTText txt : r.getTArray()) { | |||
text.append(txt.getStringValue()); | |||
} | |||
} | |||
if(fetchHyperlinks) { | |||
String id = hl.getId(); | |||
if(id != null) { | |||
PackageRelationship hlRel = | |||
document.getHyperlinks().getRelationshipByID(id); | |||
if(hlRel != null) { | |||
text.append(" <" + hlRel.getTargetURI().toString() + ">"); | |||
} | |||
} | |||
} | |||
} | |||
// New line after each paragraph. | |||
text.append("\n"); | |||
} |
@@ -46,7 +46,7 @@ public class TestXSLFSlideShow extends TestCase { | |||
if(part.getContentType().equals(XSLFSlideShow.MAIN_CONTENT_TYPE)) { | |||
found = true; | |||
} | |||
System.out.println(part); | |||
//System.out.println(part); | |||
} | |||
assertTrue(found); | |||
} | |||
@@ -110,14 +110,14 @@ public class TestXSLFSlideShow extends TestCase { | |||
public void testMetadataBasics() throws Exception { | |||
XSLFSlideShow xml = new XSLFSlideShow(sampleFile); | |||
assertNotNull(xml.getCoreProperties()); | |||
assertNotNull(xml.getExtendedProperties()); | |||
assertNotNull(xml.getProperties().getCoreProperties()); | |||
assertNotNull(xml.getProperties().getExtendedProperties()); | |||
assertEquals("Microsoft Office PowerPoint", xml.getExtendedProperties().getApplication()); | |||
assertEquals(0, xml.getExtendedProperties().getCharacters()); | |||
assertEquals(0, xml.getExtendedProperties().getLines()); | |||
assertEquals("Microsoft Office PowerPoint", xml.getProperties().getExtendedProperties().getUnderlyingProperties().getApplication()); | |||
assertEquals(0, xml.getProperties().getExtendedProperties().getUnderlyingProperties().getCharacters()); | |||
assertEquals(0, xml.getProperties().getExtendedProperties().getUnderlyingProperties().getLines()); | |||
assertEquals(null, xml.getCoreProperties().getTitleProperty().getValue()); | |||
assertEquals(null, xml.getCoreProperties().getSubjectProperty().getValue()); | |||
assertEquals(null, xml.getProperties().getCoreProperties().getTitle()); | |||
assertEquals(null, xml.getProperties().getCoreProperties().getUnderlyingProperties().getSubjectProperty().getValue()); | |||
} | |||
} |
@@ -92,29 +92,29 @@ public class TestXWPFDocument extends TestCase { | |||
XWPFDocument xml = new XWPFDocument( | |||
POIXMLDocument.openPackage(sampleFile.toString()) | |||
); | |||
assertNotNull(xml.getCoreProperties()); | |||
assertNotNull(xml.getExtendedProperties()); | |||
assertNotNull(xml.getProperties().getCoreProperties()); | |||
assertNotNull(xml.getProperties().getExtendedProperties()); | |||
assertEquals("Microsoft Office Word", xml.getExtendedProperties().getApplication()); | |||
assertEquals(1315, xml.getExtendedProperties().getCharacters()); | |||
assertEquals(10, xml.getExtendedProperties().getLines()); | |||
assertEquals("Microsoft Office Word", xml.getProperties().getExtendedProperties().getUnderlyingProperties().getApplication()); | |||
assertEquals(1315, xml.getProperties().getExtendedProperties().getUnderlyingProperties().getCharacters()); | |||
assertEquals(10, xml.getProperties().getExtendedProperties().getUnderlyingProperties().getLines()); | |||
assertEquals(null, xml.getCoreProperties().getTitleProperty().getValue()); | |||
assertEquals(null, xml.getCoreProperties().getSubjectProperty().getValue()); | |||
assertEquals(null, xml.getProperties().getCoreProperties().getTitle()); | |||
assertEquals(null, xml.getProperties().getCoreProperties().getUnderlyingProperties().getSubjectProperty().getValue()); | |||
} | |||
public void testMetadataComplex() throws Exception { | |||
XWPFDocument xml = new XWPFDocument( | |||
POIXMLDocument.openPackage(complexFile.toString()) | |||
); | |||
assertNotNull(xml.getCoreProperties()); | |||
assertNotNull(xml.getExtendedProperties()); | |||
assertNotNull(xml.getProperties().getCoreProperties()); | |||
assertNotNull(xml.getProperties().getExtendedProperties()); | |||
assertEquals("Microsoft Office Outlook", xml.getExtendedProperties().getApplication()); | |||
assertEquals(5184, xml.getExtendedProperties().getCharacters()); | |||
assertEquals(0, xml.getExtendedProperties().getLines()); | |||
assertEquals("Microsoft Office Outlook", xml.getProperties().getExtendedProperties().getUnderlyingProperties().getApplication()); | |||
assertEquals(5184, xml.getProperties().getExtendedProperties().getUnderlyingProperties().getCharacters()); | |||
assertEquals(0, xml.getProperties().getExtendedProperties().getUnderlyingProperties().getLines()); | |||
assertEquals(" ", xml.getCoreProperties().getTitleProperty().getValue()); | |||
assertEquals(" ", xml.getCoreProperties().getSubjectProperty().getValue()); | |||
assertEquals(" ", xml.getProperties().getCoreProperties().getTitle()); | |||
assertEquals(" ", xml.getProperties().getCoreProperties().getUnderlyingProperties().getSubjectProperty().getValue()); | |||
} | |||
} |
@@ -37,6 +37,12 @@ public class TestXWPFWordExtractor extends TestCase { | |||
*/ | |||
private XWPFDocument xmlB; | |||
private File fileB; | |||
/** | |||
* File with hyperlinks | |||
*/ | |||
private XWPFDocument xmlC; | |||
private File fileC; | |||
protected void setUp() throws Exception { | |||
super.setUp(); | |||
@@ -49,11 +55,17 @@ public class TestXWPFWordExtractor extends TestCase { | |||
System.getProperty("HWPF.testdata.path") + | |||
File.separator + "IllustrativeCases.docx" | |||
); | |||
fileC = new File( | |||
System.getProperty("HWPF.testdata.path") + | |||
File.separator + "TestDocument.docx" | |||
); | |||
assertTrue(fileA.exists()); | |||
assertTrue(fileB.exists()); | |||
assertTrue(fileC.exists()); | |||
xmlA = new XWPFDocument(POIXMLDocument.openPackage(fileA.toString())); | |||
xmlB = new XWPFDocument(POIXMLDocument.openPackage(fileB.toString())); | |||
xmlC = new XWPFDocument(POIXMLDocument.openPackage(fileC.toString())); | |||
} | |||
/** | |||
@@ -117,4 +129,32 @@ public class TestXWPFWordExtractor extends TestCase { | |||
} | |||
assertEquals(79, ps); | |||
} | |||
public void testGetWithHyperlinks() throws Exception { | |||
XWPFWordExtractor extractor = | |||
new XWPFWordExtractor(xmlC); | |||
extractor.getText(); | |||
extractor.setFetchHyperlinks(true); | |||
extractor.getText(); | |||
// Now check contents | |||
// TODO - fix once correctly handling contents | |||
extractor.setFetchHyperlinks(false); | |||
assertEquals( | |||
// "This is a test document\nThis bit is in bold and italic\n" + | |||
// "Back to normal\nWe have a hyperlink here, and another.\n", | |||
"This is a test document\nThis bit is in bold and italic\n" + | |||
"Back to normal\nWe have a here, and .hyperlinkanother\n", | |||
extractor.getText() | |||
); | |||
extractor.setFetchHyperlinks(true); | |||
assertEquals( | |||
// "This is a test document\nThis bit is in bold and italic\n" + | |||
// "Back to normal\nWe have a hyperlink here, and another.\n", | |||
"This is a test document\nThis bit is in bold and italic\n" + | |||
"Back to normal\nWe have a here, and .hyperlink <http://poi.apache.org/>another\n", | |||
extractor.getText() | |||
); | |||
} | |||
} |