git-svn-id: https://svn.apache.org/repos/asf/poi/branches/ooxml@646298 13f79535-47bb-0310-9956-ffa450edef68tags/REL_3_5_BETA2
import org.openxml4j.opc.PackageRelationshipCollection; | import org.openxml4j.opc.PackageRelationshipCollection; | ||||
import org.openxml4j.opc.PackageRelationshipTypes; | import org.openxml4j.opc.PackageRelationshipTypes; | ||||
import org.openxml4j.opc.PackagingURIHelper; | import org.openxml4j.opc.PackagingURIHelper; | ||||
import org.openxml4j.opc.internal.PackagePropertiesPart; | |||||
import org.openxmlformats.schemas.officeDocument.x2006.extendedProperties.CTProperties; | |||||
import org.openxmlformats.schemas.officeDocument.x2006.extendedProperties.PropertiesDocument; | |||||
public abstract class POIXMLDocument { | public abstract class POIXMLDocument { | ||||
/** The OPC core Package Part */ | /** The OPC core Package Part */ | ||||
private PackagePart corePart; | private PackagePart corePart; | ||||
/** | |||||
* The properties of the OPC package, opened as needed | |||||
*/ | |||||
private POIXMLProperties properties; | |||||
protected POIXMLDocument() {} | protected POIXMLDocument() {} | ||||
protected POIXMLDocument(Package pkg) throws IOException { | protected POIXMLDocument(Package pkg) throws IOException { | ||||
} | } | ||||
/** | /** | ||||
* Get the core document properties (core ooxml properties). | |||||
* TODO: Replace with nice usermodel wrapper | |||||
* @deprecated To be replaced with a proper user-model style view of the properties | |||||
* Get the document properties. This gives you access to the | |||||
* core ooxml properties, and the extended ooxml properties. | |||||
*/ | */ | ||||
public PackagePropertiesPart getCoreProperties() throws OpenXML4JException, IOException { | |||||
PackagePart propsPart = getSinglePartByRelationType(CORE_PROPERTIES_REL_TYPE); | |||||
if(propsPart == null) { | |||||
return null; | |||||
public POIXMLProperties getProperties() throws OpenXML4JException, IOException, XmlException { | |||||
if(properties == null) { | |||||
properties = new POIXMLProperties(pkg); | |||||
} | } | ||||
return (PackagePropertiesPart)propsPart; | |||||
} | |||||
/** | |||||
* Get the extended document properties (extended ooxml properties) | |||||
* TODO: Replace with nice usermodel wrapper | |||||
* @deprecated To be replaced with a proper user-model style view of the properties | |||||
*/ | |||||
public CTProperties getExtendedProperties() throws OpenXML4JException, XmlException, IOException { | |||||
PackagePart propsPart = getSinglePartByRelationType(EXTENDED_PROPERTIES_REL_TYPE); | |||||
PropertiesDocument props = PropertiesDocument.Factory.parse( | |||||
propsPart.getInputStream()); | |||||
return props.getProperties(); | |||||
return properties; | |||||
} | } | ||||
} | } |
/* ==================================================================== | |||||
Licensed to the Apache Software Foundation (ASF) under one or more | |||||
contributor license agreements. See the NOTICE file distributed with | |||||
this work for additional information regarding copyright ownership. | |||||
The ASF licenses this file to You under the Apache License, Version 2.0 | |||||
(the "License"); you may not use this file except in compliance with | |||||
the License. You may obtain a copy of the License at | |||||
http://www.apache.org/licenses/LICENSE-2.0 | |||||
Unless required by applicable law or agreed to in writing, software | |||||
distributed under the License is distributed on an "AS IS" BASIS, | |||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
See the License for the specific language governing permissions and | |||||
limitations under the License. | |||||
==================================================================== */ | |||||
package org.apache.poi; | |||||
import java.io.IOException; | |||||
import org.apache.xmlbeans.XmlException; | |||||
import org.openxml4j.exceptions.OpenXML4JException; | |||||
import org.openxml4j.opc.Package; | |||||
import org.openxml4j.opc.PackageRelationshipCollection; | |||||
import org.openxml4j.opc.internal.PackagePropertiesPart; | |||||
import org.openxmlformats.schemas.officeDocument.x2006.extendedProperties.CTProperties; | |||||
import org.openxmlformats.schemas.officeDocument.x2006.extendedProperties.PropertiesDocument; | |||||
/** | |||||
* Wrapper around the two different kinds of OOXML properties | |||||
* a document can have | |||||
*/ | |||||
public class POIXMLProperties { | |||||
private Package pkg; | |||||
private CoreProperties core; | |||||
private ExtendedProperties ext; | |||||
public POIXMLProperties(Package docPackage) throws IOException, OpenXML4JException, XmlException { | |||||
this.pkg = docPackage; | |||||
// Core properties | |||||
PackageRelationshipCollection coreRel = | |||||
pkg.getRelationshipsByType(POIXMLDocument.CORE_PROPERTIES_REL_TYPE); | |||||
if(coreRel.size() == 1) { | |||||
core = new CoreProperties( (PackagePropertiesPart) | |||||
pkg.getPart(coreRel.getRelationship(0)) ); | |||||
} else { | |||||
throw new IllegalArgumentException("A document must always have core properties defined!"); | |||||
} | |||||
// Extended properties | |||||
PackageRelationshipCollection extRel = | |||||
pkg.getRelationshipsByType(POIXMLDocument.EXTENDED_PROPERTIES_REL_TYPE); | |||||
if(extRel.size() == 1) { | |||||
PropertiesDocument props = PropertiesDocument.Factory.parse( | |||||
pkg.getPart( extRel.getRelationship(0) ).getInputStream() | |||||
); | |||||
ext = new ExtendedProperties(props); | |||||
} else { | |||||
ext = new ExtendedProperties(PropertiesDocument.Factory.newInstance()); | |||||
} | |||||
} | |||||
/** | |||||
* Returns the core document properties | |||||
*/ | |||||
public CoreProperties getCoreProperties() { | |||||
return core; | |||||
} | |||||
/** | |||||
* Returns the extended document properties | |||||
*/ | |||||
public ExtendedProperties getExtendedProperties() { | |||||
return ext; | |||||
} | |||||
/** | |||||
* Writes out the ooxml properties into the supplied, | |||||
* new Package | |||||
*/ | |||||
public void write(Package pkg) { | |||||
// TODO | |||||
} | |||||
/** | |||||
* The core document properties | |||||
*/ | |||||
public class CoreProperties { | |||||
private PackagePropertiesPart part; | |||||
private CoreProperties(PackagePropertiesPart part) { | |||||
this.part = part; | |||||
} | |||||
public void setTitle(String title) { | |||||
part.setTitleProperty(title); | |||||
} | |||||
public String getTitle() { | |||||
return part.getTitleProperty().getValue(); | |||||
} | |||||
public PackagePropertiesPart getUnderlyingProperties() { | |||||
return part; | |||||
} | |||||
} | |||||
/** | |||||
* Extended document properties | |||||
*/ | |||||
public class ExtendedProperties { | |||||
private PropertiesDocument props; | |||||
private ExtendedProperties(PropertiesDocument props) { | |||||
this.props = props; | |||||
if(props.getProperties() == null) { | |||||
props.addNewProperties(); | |||||
} | |||||
} | |||||
public CTProperties getUnderlyingProperties() { | |||||
return props.getProperties(); | |||||
} | |||||
} | |||||
} |
==================================================================== */ | ==================================================================== */ | ||||
package org.apache.poi; | package org.apache.poi; | ||||
import java.io.IOException; | |||||
import org.apache.poi.POIXMLProperties.*; | |||||
import org.apache.xmlbeans.XmlException; | |||||
import org.openxml4j.exceptions.OpenXML4JException; | |||||
public abstract class POIXMLTextExtractor extends POITextExtractor { | public abstract class POIXMLTextExtractor extends POITextExtractor { | ||||
/** The POIXMLDocument that's open */ | /** The POIXMLDocument that's open */ | ||||
protected POIXMLDocument document; | protected POIXMLDocument document; | ||||
this.document = document; | this.document = document; | ||||
} | } | ||||
/** | |||||
* Returns the core document properties | |||||
*/ | |||||
public CoreProperties getCoreProperties() throws IOException, OpenXML4JException, XmlException { | |||||
return document.getProperties().getCoreProperties(); | |||||
} | |||||
/** | |||||
* Returns the extended document properties | |||||
*/ | |||||
public ExtendedProperties getExtendedProperties() throws IOException, OpenXML4JException, XmlException { | |||||
return document.getProperties().getExtendedProperties(); | |||||
} | |||||
} | } |
import org.openxml4j.exceptions.OpenXML4JException; | import org.openxml4j.exceptions.OpenXML4JException; | ||||
import org.openxml4j.opc.Package; | import org.openxml4j.opc.Package; | ||||
import org.openxml4j.opc.PackagePart; | import org.openxml4j.opc.PackagePart; | ||||
import org.openxml4j.opc.PackageRelationshipCollection; | |||||
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTBody; | import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTBody; | ||||
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTDocument1; | import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTDocument1; | ||||
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTStyles; | import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTStyles; | ||||
public static final String HEADER_CONTENT_TYPE = "application/vnd.openxmlformats-officedocument.wordprocessingml.header+xml"; | public static final String HEADER_CONTENT_TYPE = "application/vnd.openxmlformats-officedocument.wordprocessingml.header+xml"; | ||||
public static final String STYLES_CONTENT_TYPE = "application/vnd.openxmlformats-officedocument.wordprocessingml.styles+xml"; | public static final String STYLES_CONTENT_TYPE = "application/vnd.openxmlformats-officedocument.wordprocessingml.styles+xml"; | ||||
public static final String STYLES_RELATION_TYPE = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/styles"; | public static final String STYLES_RELATION_TYPE = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/styles"; | ||||
public static final String HYPERLINK_RELATION_TYPE = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/hyperlink"; | |||||
private DocumentDocument wordDoc; | private DocumentDocument wordDoc; | ||||
StylesDocument.Factory.parse(parts[0].getInputStream()); | StylesDocument.Factory.parse(parts[0].getInputStream()); | ||||
return sd.getStyles(); | return sd.getStyles(); | ||||
} | } | ||||
/** | |||||
* Returns all the hyperlink relations for the file. | |||||
* You'll generally want to get the target to get | |||||
* the destination of the hyperlink | |||||
*/ | |||||
public PackageRelationshipCollection getHyperlinks() { | |||||
try { | |||||
return getCorePart().getRelationshipsByType(HYPERLINK_RELATION_TYPE); | |||||
} catch(InvalidFormatException e) { | |||||
// Should never happen | |||||
throw new IllegalStateException(e); | |||||
} | |||||
} | |||||
} | } |
==================================================================== */ | ==================================================================== */ | ||||
package org.apache.poi.xwpf.extractor; | package org.apache.poi.xwpf.extractor; | ||||
import java.io.File; | |||||
import java.io.IOException; | import java.io.IOException; | ||||
import org.apache.poi.POIXMLDocument; | import org.apache.poi.POIXMLDocument; | ||||
import org.apache.xmlbeans.XmlException; | import org.apache.xmlbeans.XmlException; | ||||
import org.openxml4j.exceptions.OpenXML4JException; | import org.openxml4j.exceptions.OpenXML4JException; | ||||
import org.openxml4j.opc.Package; | import org.openxml4j.opc.Package; | ||||
import org.openxml4j.opc.PackageRelationship; | |||||
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTBody; | import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTBody; | ||||
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTHyperlink; | |||||
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTP; | import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTP; | ||||
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTR; | import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTR; | ||||
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTText; | import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTText; | ||||
*/ | */ | ||||
public class XWPFWordExtractor extends POIXMLTextExtractor { | public class XWPFWordExtractor extends POIXMLTextExtractor { | ||||
private XWPFDocument document; | private XWPFDocument document; | ||||
private boolean fetchHyperlinks = false; | |||||
public XWPFWordExtractor(Package container) throws XmlException, OpenXML4JException, IOException { | public XWPFWordExtractor(Package container) throws XmlException, OpenXML4JException, IOException { | ||||
this(new XWPFDocument(container)); | this(new XWPFDocument(container)); | ||||
)); | )); | ||||
System.out.println(extractor.getText()); | System.out.println(extractor.getText()); | ||||
} | } | ||||
/** | |||||
* Should we also fetch the hyperlinks, when fetching | |||||
* the text content? Default is to only output the | |||||
* hyperlink label, and not the contents | |||||
*/ | |||||
public void setFetchHyperlinks(boolean fetch) { | |||||
fetchHyperlinks = fetch; | |||||
} | |||||
public String getText() { | public String getText() { | ||||
CTBody body = document.getDocumentBody(); | CTBody body = document.getDocumentBody(); | ||||
// Loop over paragraphs | // Loop over paragraphs | ||||
CTP[] ps = body.getPArray(); | CTP[] ps = body.getPArray(); | ||||
for (int i = 0; i < ps.length; i++) { | for (int i = 0; i < ps.length; i++) { | ||||
// Loop over ranges | |||||
// Loop over ranges and hyperlinks | |||||
// TODO - properly intersperse ranges and hyperlinks | |||||
CTR[] rs = ps[i].getRArray(); | CTR[] rs = ps[i].getRArray(); | ||||
for (int j = 0; j < rs.length; j++) { | |||||
for(int j = 0; j < rs.length; j++) { | |||||
// Loop over text runs | // Loop over text runs | ||||
CTText[] texts = rs[j].getTArray(); | CTText[] texts = rs[j].getTArray(); | ||||
for (int k = 0; k < texts.length; k++) { | for (int k = 0; k < texts.length; k++) { | ||||
); | ); | ||||
} | } | ||||
} | } | ||||
CTHyperlink[] hls = ps[i].getHyperlinkArray(); | |||||
for(CTHyperlink hl : hls) { | |||||
for(CTR r : hl.getRArray()) { | |||||
for(CTText txt : r.getTArray()) { | |||||
text.append(txt.getStringValue()); | |||||
} | |||||
} | |||||
if(fetchHyperlinks) { | |||||
String id = hl.getId(); | |||||
if(id != null) { | |||||
PackageRelationship hlRel = | |||||
document.getHyperlinks().getRelationshipByID(id); | |||||
if(hlRel != null) { | |||||
text.append(" <" + hlRel.getTargetURI().toString() + ">"); | |||||
} | |||||
} | |||||
} | |||||
} | |||||
// New line after each paragraph. | // New line after each paragraph. | ||||
text.append("\n"); | text.append("\n"); | ||||
} | } |
if(part.getContentType().equals(XSLFSlideShow.MAIN_CONTENT_TYPE)) { | if(part.getContentType().equals(XSLFSlideShow.MAIN_CONTENT_TYPE)) { | ||||
found = true; | found = true; | ||||
} | } | ||||
System.out.println(part); | |||||
//System.out.println(part); | |||||
} | } | ||||
assertTrue(found); | assertTrue(found); | ||||
} | } | ||||
public void testMetadataBasics() throws Exception { | public void testMetadataBasics() throws Exception { | ||||
XSLFSlideShow xml = new XSLFSlideShow(sampleFile); | XSLFSlideShow xml = new XSLFSlideShow(sampleFile); | ||||
assertNotNull(xml.getCoreProperties()); | |||||
assertNotNull(xml.getExtendedProperties()); | |||||
assertNotNull(xml.getProperties().getCoreProperties()); | |||||
assertNotNull(xml.getProperties().getExtendedProperties()); | |||||
assertEquals("Microsoft Office PowerPoint", xml.getExtendedProperties().getApplication()); | |||||
assertEquals(0, xml.getExtendedProperties().getCharacters()); | |||||
assertEquals(0, xml.getExtendedProperties().getLines()); | |||||
assertEquals("Microsoft Office PowerPoint", xml.getProperties().getExtendedProperties().getUnderlyingProperties().getApplication()); | |||||
assertEquals(0, xml.getProperties().getExtendedProperties().getUnderlyingProperties().getCharacters()); | |||||
assertEquals(0, xml.getProperties().getExtendedProperties().getUnderlyingProperties().getLines()); | |||||
assertEquals(null, xml.getCoreProperties().getTitleProperty().getValue()); | |||||
assertEquals(null, xml.getCoreProperties().getSubjectProperty().getValue()); | |||||
assertEquals(null, xml.getProperties().getCoreProperties().getTitle()); | |||||
assertEquals(null, xml.getProperties().getCoreProperties().getUnderlyingProperties().getSubjectProperty().getValue()); | |||||
} | } | ||||
} | } |
XWPFDocument xml = new XWPFDocument( | XWPFDocument xml = new XWPFDocument( | ||||
POIXMLDocument.openPackage(sampleFile.toString()) | POIXMLDocument.openPackage(sampleFile.toString()) | ||||
); | ); | ||||
assertNotNull(xml.getCoreProperties()); | |||||
assertNotNull(xml.getExtendedProperties()); | |||||
assertNotNull(xml.getProperties().getCoreProperties()); | |||||
assertNotNull(xml.getProperties().getExtendedProperties()); | |||||
assertEquals("Microsoft Office Word", xml.getExtendedProperties().getApplication()); | |||||
assertEquals(1315, xml.getExtendedProperties().getCharacters()); | |||||
assertEquals(10, xml.getExtendedProperties().getLines()); | |||||
assertEquals("Microsoft Office Word", xml.getProperties().getExtendedProperties().getUnderlyingProperties().getApplication()); | |||||
assertEquals(1315, xml.getProperties().getExtendedProperties().getUnderlyingProperties().getCharacters()); | |||||
assertEquals(10, xml.getProperties().getExtendedProperties().getUnderlyingProperties().getLines()); | |||||
assertEquals(null, xml.getCoreProperties().getTitleProperty().getValue()); | |||||
assertEquals(null, xml.getCoreProperties().getSubjectProperty().getValue()); | |||||
assertEquals(null, xml.getProperties().getCoreProperties().getTitle()); | |||||
assertEquals(null, xml.getProperties().getCoreProperties().getUnderlyingProperties().getSubjectProperty().getValue()); | |||||
} | } | ||||
public void testMetadataComplex() throws Exception { | public void testMetadataComplex() throws Exception { | ||||
XWPFDocument xml = new XWPFDocument( | XWPFDocument xml = new XWPFDocument( | ||||
POIXMLDocument.openPackage(complexFile.toString()) | POIXMLDocument.openPackage(complexFile.toString()) | ||||
); | ); | ||||
assertNotNull(xml.getCoreProperties()); | |||||
assertNotNull(xml.getExtendedProperties()); | |||||
assertNotNull(xml.getProperties().getCoreProperties()); | |||||
assertNotNull(xml.getProperties().getExtendedProperties()); | |||||
assertEquals("Microsoft Office Outlook", xml.getExtendedProperties().getApplication()); | |||||
assertEquals(5184, xml.getExtendedProperties().getCharacters()); | |||||
assertEquals(0, xml.getExtendedProperties().getLines()); | |||||
assertEquals("Microsoft Office Outlook", xml.getProperties().getExtendedProperties().getUnderlyingProperties().getApplication()); | |||||
assertEquals(5184, xml.getProperties().getExtendedProperties().getUnderlyingProperties().getCharacters()); | |||||
assertEquals(0, xml.getProperties().getExtendedProperties().getUnderlyingProperties().getLines()); | |||||
assertEquals(" ", xml.getCoreProperties().getTitleProperty().getValue()); | |||||
assertEquals(" ", xml.getCoreProperties().getSubjectProperty().getValue()); | |||||
assertEquals(" ", xml.getProperties().getCoreProperties().getTitle()); | |||||
assertEquals(" ", xml.getProperties().getCoreProperties().getUnderlyingProperties().getSubjectProperty().getValue()); | |||||
} | } | ||||
} | } |
*/ | */ | ||||
private XWPFDocument xmlB; | private XWPFDocument xmlB; | ||||
private File fileB; | private File fileB; | ||||
/** | |||||
* File with hyperlinks | |||||
*/ | |||||
private XWPFDocument xmlC; | |||||
private File fileC; | |||||
protected void setUp() throws Exception { | protected void setUp() throws Exception { | ||||
super.setUp(); | super.setUp(); | ||||
System.getProperty("HWPF.testdata.path") + | System.getProperty("HWPF.testdata.path") + | ||||
File.separator + "IllustrativeCases.docx" | File.separator + "IllustrativeCases.docx" | ||||
); | ); | ||||
fileC = new File( | |||||
System.getProperty("HWPF.testdata.path") + | |||||
File.separator + "TestDocument.docx" | |||||
); | |||||
assertTrue(fileA.exists()); | assertTrue(fileA.exists()); | ||||
assertTrue(fileB.exists()); | assertTrue(fileB.exists()); | ||||
assertTrue(fileC.exists()); | |||||
xmlA = new XWPFDocument(POIXMLDocument.openPackage(fileA.toString())); | xmlA = new XWPFDocument(POIXMLDocument.openPackage(fileA.toString())); | ||||
xmlB = new XWPFDocument(POIXMLDocument.openPackage(fileB.toString())); | xmlB = new XWPFDocument(POIXMLDocument.openPackage(fileB.toString())); | ||||
xmlC = new XWPFDocument(POIXMLDocument.openPackage(fileC.toString())); | |||||
} | } | ||||
/** | /** | ||||
} | } | ||||
assertEquals(79, ps); | assertEquals(79, ps); | ||||
} | } | ||||
public void testGetWithHyperlinks() throws Exception { | |||||
XWPFWordExtractor extractor = | |||||
new XWPFWordExtractor(xmlC); | |||||
extractor.getText(); | |||||
extractor.setFetchHyperlinks(true); | |||||
extractor.getText(); | |||||
// Now check contents | |||||
// TODO - fix once correctly handling contents | |||||
extractor.setFetchHyperlinks(false); | |||||
assertEquals( | |||||
// "This is a test document\nThis bit is in bold and italic\n" + | |||||
// "Back to normal\nWe have a hyperlink here, and another.\n", | |||||
"This is a test document\nThis bit is in bold and italic\n" + | |||||
"Back to normal\nWe have a here, and .hyperlinkanother\n", | |||||
extractor.getText() | |||||
); | |||||
extractor.setFetchHyperlinks(true); | |||||
assertEquals( | |||||
// "This is a test document\nThis bit is in bold and italic\n" + | |||||
// "Back to normal\nWe have a hyperlink here, and another.\n", | |||||
"This is a test document\nThis bit is in bold and italic\n" + | |||||
"Back to normal\nWe have a here, and .hyperlink <http://poi.apache.org/>another\n", | |||||
extractor.getText() | |||||
); | |||||
} | |||||
} | } |