From: Nick Burch Date: Sat, 8 Mar 2008 18:09:07 +0000 (+0000) Subject: Start to rename hslf and hwpf in ooxml to xslf and xwpf X-Git-Tag: REL_3_5_BETA2~197 X-Git-Url: https://source.dussan.org/?a=commitdiff_plain;h=f8686cb8b622c5047db943704958efd508603171;p=poi.git Start to rename hslf and hwpf in ooxml to xslf and xwpf git-svn-id: https://svn.apache.org/repos/asf/poi/branches/ooxml@635031 13f79535-47bb-0310-9956-ffa450edef68 --- diff --git a/src/ooxml/java/org/apache/poi/hslf/HSLFXML.java b/src/ooxml/java/org/apache/poi/hslf/HSLFXML.java deleted file mode 100644 index 568cb80aa0..0000000000 --- a/src/ooxml/java/org/apache/poi/hslf/HSLFXML.java +++ /dev/null @@ -1,148 +0,0 @@ -/* ==================================================================== - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-==================================================================== */ -package org.apache.poi.hslf; - -import java.io.IOException; - -import org.apache.poi.hxf.HXFDocument; -import org.apache.xmlbeans.XmlException; -import org.openxml4j.exceptions.InvalidFormatException; -import org.openxml4j.exceptions.OpenXML4JException; -import org.openxml4j.opc.Package; -import org.openxml4j.opc.PackagePart; -import org.openxml4j.opc.PackageRelationshipCollection; -import org.openxmlformats.schemas.presentationml.x2006.main.CTNotesSlide; -import org.openxmlformats.schemas.presentationml.x2006.main.CTPresentation; -import org.openxmlformats.schemas.presentationml.x2006.main.CTSlide; -import org.openxmlformats.schemas.presentationml.x2006.main.CTSlideIdList; -import org.openxmlformats.schemas.presentationml.x2006.main.CTSlideIdListEntry; -import org.openxmlformats.schemas.presentationml.x2006.main.CTSlideMaster; -import org.openxmlformats.schemas.presentationml.x2006.main.CTSlideMasterIdList; -import org.openxmlformats.schemas.presentationml.x2006.main.CTSlideMasterIdListEntry; -import org.openxmlformats.schemas.presentationml.x2006.main.NotesDocument; -import org.openxmlformats.schemas.presentationml.x2006.main.PresentationDocument; -import org.openxmlformats.schemas.presentationml.x2006.main.SldDocument; -import org.openxmlformats.schemas.presentationml.x2006.main.SldMasterDocument; - -/** - * Experimental class to do low level processing - * of pptx files. 
- * - * If you are using these low level classes, then you - * will almost certainly need to refer to the OOXML - * specifications from - * http://www.ecma-international.org/publications/standards/Ecma-376.htm - * - * WARNING - APIs expected to change rapidly - */ -public class HSLFXML extends HXFDocument { - public static final String MAIN_CONTENT_TYPE = "application/vnd.openxmlformats-officedocument.presentationml.presentation.main+xml"; - public static final String NOTES_CONTENT_TYPE = "application/vnd.openxmlformats-officedocument.presentationml.notesSlide+xml"; - public static final String SLIDE_CONTENT_TYPE = "application/vnd.openxmlformats-officedocument.presentationml.slide+xml"; - public static final String SLIDE_LAYOUT_RELATION_TYPE = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/slideLayout"; - public static final String NOTES_RELATION_TYPE = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/notesSlide"; - - private PresentationDocument presentationDoc; - - public HSLFXML(Package container) throws OpenXML4JException, IOException, XmlException { - super(container, MAIN_CONTENT_TYPE); - - presentationDoc = - PresentationDocument.Factory.parse(basePart.getInputStream()); - } - - /** - * Returns the low level presentation base object - */ - public CTPresentation getPresentation() { - return presentationDoc.getPresentation(); - } - - /** - * Returns the references from the presentation to its - * slides. - * You'll need these to figure out the slide ordering, - * and to get at the actual slides themselves - */ - public CTSlideIdList getSlideReferences() { - return getPresentation().getSldIdLst(); - } - /** - * Returns the references from the presentation to its - * slide masters. 
- * You'll need these to get at the actual slide - * masters themselves - */ - public CTSlideMasterIdList getSlideMasterReferences() { - return getPresentation().getSldMasterIdLst(); - } - - /** - * Returns the low level slide master object from - * the supplied slide master reference - */ - public CTSlideMaster getSlideMaster(CTSlideMasterIdListEntry master) throws IOException, XmlException { - PackagePart masterPart = - getRelatedPackagePart(master.getId2()); - SldMasterDocument masterDoc = - SldMasterDocument.Factory.parse(masterPart.getInputStream()); - return masterDoc.getSldMaster(); - } - - /** - * Returns the low level slide object from - * the supplied slide reference - */ - public CTSlide getSlide(CTSlideIdListEntry slide) throws IOException, XmlException { - PackagePart slidePart = - getRelatedPackagePart(slide.getId2()); - SldDocument slideDoc = - SldDocument.Factory.parse(slidePart.getInputStream()); - return slideDoc.getSld(); - } - - /** - * Returns the low level notes object for the given - * slide, as found from the supplied slide reference - */ - public CTNotesSlide getNotes(CTSlideIdListEntry slide) throws IOException, XmlException { - PackagePart slidePart = - getRelatedPackagePart(slide.getId2()); - - PackageRelationshipCollection notes; - try { - notes = slidePart.getRelationshipsByType(NOTES_RELATION_TYPE); - } catch(InvalidFormatException e) { - throw new IllegalStateException(e); - } - - if(notes.size() == 0) { - // No notes for this slide - return null; - } - if(notes.size() > 1) { - throw new IllegalStateException("Expecting 0 or 1 notes for a slide, but found " + notes.size()); - } - - PackagePart notesPart = - getPackagePart(notes.getRelationship(0)); - NotesDocument notesDoc = - NotesDocument.Factory.parse(notesPart.getInputStream()); - - return notesDoc.getNotes(); - } -} diff --git a/src/ooxml/java/org/apache/poi/hslf/extractor/HXFPowerPointExtractor.java b/src/ooxml/java/org/apache/poi/hslf/extractor/HXFPowerPointExtractor.java 
deleted file mode 100644 index 1d4b1a2bd4..0000000000 --- a/src/ooxml/java/org/apache/poi/hslf/extractor/HXFPowerPointExtractor.java +++ /dev/null @@ -1,139 +0,0 @@ -/* ==================================================================== - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-==================================================================== */ -package org.apache.poi.hslf.extractor; - -import java.io.File; -import java.io.IOException; - -import org.apache.poi.POIXMLTextExtractor; -import org.apache.poi.hslf.HSLFXML; -import org.apache.poi.hslf.usermodel.HSLFXMLSlideShow; -import org.apache.poi.hxf.HXFDocument; -import org.apache.xmlbeans.XmlException; -import org.openxml4j.exceptions.OpenXML4JException; -import org.openxml4j.opc.Package; -import org.openxmlformats.schemas.drawingml.x2006.main.CTRegularTextRun; -import org.openxmlformats.schemas.drawingml.x2006.main.CTTextBody; -import org.openxmlformats.schemas.drawingml.x2006.main.CTTextParagraph; -import org.openxmlformats.schemas.presentationml.x2006.main.CTGroupShape; -import org.openxmlformats.schemas.presentationml.x2006.main.CTNotesSlide; -import org.openxmlformats.schemas.presentationml.x2006.main.CTShape; -import org.openxmlformats.schemas.presentationml.x2006.main.CTSlide; -import org.openxmlformats.schemas.presentationml.x2006.main.CTSlideIdListEntry; - -public class HXFPowerPointExtractor extends POIXMLTextExtractor { - private HSLFXMLSlideShow slideshow; - private boolean slidesByDefault = true; - private boolean notesByDefault = false; - - public HXFPowerPointExtractor(Package container) throws XmlException, OpenXML4JException, IOException { - this(new HSLFXMLSlideShow( - new HSLFXML(container) - )); - } - public HXFPowerPointExtractor(HSLFXMLSlideShow slideshow) { - super(slideshow); - this.slideshow = slideshow; - } - - public static void main(String[] args) throws Exception { - if(args.length < 1) { - System.err.println("Use:"); - System.err.println(" HXFPowerPointExtractor "); - System.exit(1); - } - POIXMLTextExtractor extractor = - new HXFPowerPointExtractor(HXFDocument.openPackage( - new File(args[0]) - )); - System.out.println(extractor.getText()); - } - - /** - * Should a call to getText() return slide text? 
- * Default is yes - */ - public void setSlidesByDefault(boolean slidesByDefault) { - this.slidesByDefault = slidesByDefault; - } - /** - * Should a call to getText() return notes text? - * Default is no - */ - public void setNotesByDefault(boolean notesByDefault) { - this.notesByDefault = notesByDefault; - } - - /** - * Gets the slide text, but not the notes text - */ - public String getText() { - return getText(slidesByDefault, notesByDefault); - } - - /** - * Gets the requested text from the file - * @param slideText Should we retrieve text from slides? - * @param notesText Should we retrieve text from notes? - */ - public String getText(boolean slideText, boolean notesText) { - StringBuffer text = new StringBuffer(); - - CTSlideIdListEntry[] slideRefs = - slideshow._getHSLFXML().getSlideReferences().getSldIdArray(); - for (int i = 0; i < slideRefs.length; i++) { - try { - CTSlide slide = - slideshow._getHSLFXML().getSlide(slideRefs[i]); - CTNotesSlide notes = - slideshow._getHSLFXML().getNotes(slideRefs[i]); - - if(slideText) { - extractText(slide.getCSld().getSpTree(), text); - } - if(notesText && notes != null) { - extractText(notes.getCSld().getSpTree(), text); - } - } catch(Exception e) { - throw new RuntimeException(e); - } - } - - return text.toString(); - } - - private void extractText(CTGroupShape gs, StringBuffer text) { - CTShape[] shapes = gs.getSpArray(); - for (int i = 0; i < shapes.length; i++) { - CTTextBody textBody = - shapes[i].getTxBody(); - if(textBody != null) { - CTTextParagraph[] paras = - textBody.getPArray(); - for (int j = 0; j < paras.length; j++) { - CTRegularTextRun[] textRuns = - paras[j].getRArray(); - for (int k = 0; k < textRuns.length; k++) { - text.append( textRuns[k].getT() ); - } - // End each paragraph with a new line - text.append("\n"); - } - } - } - } -} diff --git a/src/ooxml/java/org/apache/poi/hslf/usermodel/HSLFXMLSlideShow.java b/src/ooxml/java/org/apache/poi/hslf/usermodel/HSLFXMLSlideShow.java deleted file mode 
100644 index b8a5fcde35..0000000000 --- a/src/ooxml/java/org/apache/poi/hslf/usermodel/HSLFXMLSlideShow.java +++ /dev/null @@ -1,39 +0,0 @@ -/* ==================================================================== - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -==================================================================== */ -package org.apache.poi.hslf.usermodel; - -import org.apache.poi.POIXMLDocument; -import org.apache.poi.hslf.HSLFXML; - -/** - * High level representation of a ooxml slideshow. - * This is the first object most users will construct whether - * they are reading or writing a slideshow. It is also the - * top level object for creating new slides/etc. 
- */ -public class HSLFXMLSlideShow extends POIXMLDocument { - private org.apache.poi.hslf.HSLFXML hslfXML; - - public HSLFXMLSlideShow(HSLFXML xml) { - super(xml); - this.hslfXML = xml; - } - - public HSLFXML _getHSLFXML() { - return hslfXML; - } -} diff --git a/src/ooxml/java/org/apache/poi/hwpf/HWPFXML.java b/src/ooxml/java/org/apache/poi/hwpf/HWPFXML.java deleted file mode 100644 index 66bba7ee1b..0000000000 --- a/src/ooxml/java/org/apache/poi/hwpf/HWPFXML.java +++ /dev/null @@ -1,92 +0,0 @@ -/* ==================================================================== - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-==================================================================== */ -package org.apache.poi.hwpf; - -import java.io.IOException; - -import org.apache.poi.hxf.HXFDocument; -import org.apache.xmlbeans.XmlException; -import org.openxml4j.exceptions.InvalidFormatException; -import org.openxml4j.exceptions.OpenXML4JException; -import org.openxml4j.opc.Package; -import org.openxml4j.opc.PackagePart; -import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTBody; -import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTDocument1; -import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTStyles; -import org.openxmlformats.schemas.wordprocessingml.x2006.main.DocumentDocument; -import org.openxmlformats.schemas.wordprocessingml.x2006.main.StylesDocument; - -/** - * Experimental class to do low level processing - * of docx files. - * - * If you are using these low level classes, then you - * will almost certainly need to refer to the OOXML - * specifications from - * http://www.ecma-international.org/publications/standards/Ecma-376.htm - * - * WARNING - APIs expected to change rapidly - */ -public class HWPFXML extends HXFDocument { - public static final String MAIN_CONTENT_TYPE = "application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml"; - public static final String FOOTER_CONTENT_TYPE = "application/vnd.openxmlformats-officedocument.wordprocessingml.footer+xml"; - public static final String HEADER_CONTENT_TYPE = "application/vnd.openxmlformats-officedocument.wordprocessingml.header+xml"; - public static final String STYLES_CONTENT_TYPE = "application/vnd.openxmlformats-officedocument.wordprocessingml.styles+xml"; - public static final String STYLES_RELATION_TYPE = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/styles"; - - private DocumentDocument wordDoc; - - public HWPFXML(Package container) throws OpenXML4JException, IOException, XmlException { - super(container, MAIN_CONTENT_TYPE); - - wordDoc = - 
DocumentDocument.Factory.parse(basePart.getInputStream()); - } - - /** - * Returns the low level document base object - */ - public CTDocument1 getDocument() { - return wordDoc.getDocument(); - } - - /** - * Returns the low level body of the document - */ - public CTBody getDocumentBody() { - return getDocument().getBody(); - } - - /** - * Returns the styles object used - */ - public CTStyles getStyle() throws XmlException, IOException { - PackagePart[] parts; - try { - parts = getRelatedByType(STYLES_RELATION_TYPE); - } catch(InvalidFormatException e) { - throw new IllegalStateException(e); - } - if(parts.length != 1) { - throw new IllegalStateException("Expecting one Styles document part, but found " + parts.length); - } - - StylesDocument sd = - StylesDocument.Factory.parse(parts[0].getInputStream()); - return sd.getStyles(); - } -} diff --git a/src/ooxml/java/org/apache/poi/hwpf/extractor/HXFWordExtractor.java b/src/ooxml/java/org/apache/poi/hwpf/extractor/HXFWordExtractor.java deleted file mode 100644 index a4427e49ec..0000000000 --- a/src/ooxml/java/org/apache/poi/hwpf/extractor/HXFWordExtractor.java +++ /dev/null @@ -1,87 +0,0 @@ -/* ==================================================================== - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- See the License for the specific language governing permissions and - limitations under the License. -==================================================================== */ -package org.apache.poi.hwpf.extractor; - -import java.io.File; -import java.io.IOException; - -import org.apache.poi.POIXMLTextExtractor; -import org.apache.poi.hwpf.HWPFXML; -import org.apache.poi.hwpf.usermodel.HWPFXMLDocument; -import org.apache.poi.hxf.HXFDocument; -import org.apache.xmlbeans.XmlException; -import org.openxml4j.exceptions.OpenXML4JException; -import org.openxml4j.opc.Package; -import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTBody; -import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTP; -import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTR; -import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTText; - -/** - * Helper class to extract text from an OOXML Word file - */ -public class HXFWordExtractor extends POIXMLTextExtractor { - private HWPFXMLDocument document; - - public HXFWordExtractor(Package container) throws XmlException, OpenXML4JException, IOException { - this(new HWPFXMLDocument( - new HWPFXML(container) - )); - } - public HXFWordExtractor(HWPFXMLDocument document) { - super(document); - this.document = document; - } - - public static void main(String[] args) throws Exception { - if(args.length < 1) { - System.err.println("Use:"); - System.err.println(" HXFWordExtractor "); - System.exit(1); - } - POIXMLTextExtractor extractor = - new HXFWordExtractor(HXFDocument.openPackage( - new File(args[0]) - )); - System.out.println(extractor.getText()); - } - - public String getText() { - CTBody body = document._getHWPFXML().getDocumentBody(); - StringBuffer text = new StringBuffer(); - - // Loop over paragraphs - CTP[] ps = body.getPArray(); - for (int i = 0; i < ps.length; i++) { - // Loop over ranges - CTR[] rs = ps[i].getRArray(); - for (int j = 0; j < rs.length; j++) { - // Loop over text runs - CTText[] texts = 
rs[j].getTArray(); - for (int k = 0; k < texts.length; k++) { - text.append( - texts[k].getStringValue() - ); - } - } - // New line after each paragraph. - text.append("\n"); - } - - return text.toString(); - } -} diff --git a/src/ooxml/java/org/apache/poi/hwpf/usermodel/HWPFXMLDocument.java b/src/ooxml/java/org/apache/poi/hwpf/usermodel/HWPFXMLDocument.java deleted file mode 100644 index 64597e83dc..0000000000 --- a/src/ooxml/java/org/apache/poi/hwpf/usermodel/HWPFXMLDocument.java +++ /dev/null @@ -1,36 +0,0 @@ -/* ==================================================================== - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -==================================================================== */ -package org.apache.poi.hwpf.usermodel; - -import org.apache.poi.POIXMLDocument; -import org.apache.poi.hwpf.HWPFXML; - -/** - * High level representation of a ooxml text document. 
- */ -public class HWPFXMLDocument extends POIXMLDocument { - private HWPFXML hwpfXML; - - public HWPFXMLDocument(HWPFXML xml) { - super(xml); - this.hwpfXML = xml; - } - - public HWPFXML _getHWPFXML() { - return hwpfXML; - } -} diff --git a/src/ooxml/java/org/apache/poi/xslf/HSLFXML.java b/src/ooxml/java/org/apache/poi/xslf/HSLFXML.java new file mode 100644 index 0000000000..568cb80aa0 --- /dev/null +++ b/src/ooxml/java/org/apache/poi/xslf/HSLFXML.java @@ -0,0 +1,148 @@ +/* ==================================================================== + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+==================================================================== */ +package org.apache.poi.hslf; + +import java.io.IOException; + +import org.apache.poi.hxf.HXFDocument; +import org.apache.xmlbeans.XmlException; +import org.openxml4j.exceptions.InvalidFormatException; +import org.openxml4j.exceptions.OpenXML4JException; +import org.openxml4j.opc.Package; +import org.openxml4j.opc.PackagePart; +import org.openxml4j.opc.PackageRelationshipCollection; +import org.openxmlformats.schemas.presentationml.x2006.main.CTNotesSlide; +import org.openxmlformats.schemas.presentationml.x2006.main.CTPresentation; +import org.openxmlformats.schemas.presentationml.x2006.main.CTSlide; +import org.openxmlformats.schemas.presentationml.x2006.main.CTSlideIdList; +import org.openxmlformats.schemas.presentationml.x2006.main.CTSlideIdListEntry; +import org.openxmlformats.schemas.presentationml.x2006.main.CTSlideMaster; +import org.openxmlformats.schemas.presentationml.x2006.main.CTSlideMasterIdList; +import org.openxmlformats.schemas.presentationml.x2006.main.CTSlideMasterIdListEntry; +import org.openxmlformats.schemas.presentationml.x2006.main.NotesDocument; +import org.openxmlformats.schemas.presentationml.x2006.main.PresentationDocument; +import org.openxmlformats.schemas.presentationml.x2006.main.SldDocument; +import org.openxmlformats.schemas.presentationml.x2006.main.SldMasterDocument; + +/** + * Experimental class to do low level processing + * of pptx files. 
+ * + * If you are using these low level classes, then you + * will almost certainly need to refer to the OOXML + * specifications from + * http://www.ecma-international.org/publications/standards/Ecma-376.htm + * + * WARNING - APIs expected to change rapidly + */ +public class HSLFXML extends HXFDocument { + public static final String MAIN_CONTENT_TYPE = "application/vnd.openxmlformats-officedocument.presentationml.presentation.main+xml"; + public static final String NOTES_CONTENT_TYPE = "application/vnd.openxmlformats-officedocument.presentationml.notesSlide+xml"; + public static final String SLIDE_CONTENT_TYPE = "application/vnd.openxmlformats-officedocument.presentationml.slide+xml"; + public static final String SLIDE_LAYOUT_RELATION_TYPE = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/slideLayout"; + public static final String NOTES_RELATION_TYPE = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/notesSlide"; + + private PresentationDocument presentationDoc; + + public HSLFXML(Package container) throws OpenXML4JException, IOException, XmlException { + super(container, MAIN_CONTENT_TYPE); + + presentationDoc = + PresentationDocument.Factory.parse(basePart.getInputStream()); + } + + /** + * Returns the low level presentation base object + */ + public CTPresentation getPresentation() { + return presentationDoc.getPresentation(); + } + + /** + * Returns the references from the presentation to its + * slides. + * You'll need these to figure out the slide ordering, + * and to get at the actual slides themselves + */ + public CTSlideIdList getSlideReferences() { + return getPresentation().getSldIdLst(); + } + /** + * Returns the references from the presentation to its + * slide masters. 
+ * You'll need these to get at the actual slide + * masters themselves + */ + public CTSlideMasterIdList getSlideMasterReferences() { + return getPresentation().getSldMasterIdLst(); + } + + /** + * Returns the low level slide master object from + * the supplied slide master reference + */ + public CTSlideMaster getSlideMaster(CTSlideMasterIdListEntry master) throws IOException, XmlException { + PackagePart masterPart = + getRelatedPackagePart(master.getId2()); + SldMasterDocument masterDoc = + SldMasterDocument.Factory.parse(masterPart.getInputStream()); + return masterDoc.getSldMaster(); + } + + /** + * Returns the low level slide object from + * the supplied slide reference + */ + public CTSlide getSlide(CTSlideIdListEntry slide) throws IOException, XmlException { + PackagePart slidePart = + getRelatedPackagePart(slide.getId2()); + SldDocument slideDoc = + SldDocument.Factory.parse(slidePart.getInputStream()); + return slideDoc.getSld(); + } + + /** + * Returns the low level notes object for the given + * slide, as found from the supplied slide reference + */ + public CTNotesSlide getNotes(CTSlideIdListEntry slide) throws IOException, XmlException { + PackagePart slidePart = + getRelatedPackagePart(slide.getId2()); + + PackageRelationshipCollection notes; + try { + notes = slidePart.getRelationshipsByType(NOTES_RELATION_TYPE); + } catch(InvalidFormatException e) { + throw new IllegalStateException(e); + } + + if(notes.size() == 0) { + // No notes for this slide + return null; + } + if(notes.size() > 1) { + throw new IllegalStateException("Expecting 0 or 1 notes for a slide, but found " + notes.size()); + } + + PackagePart notesPart = + getPackagePart(notes.getRelationship(0)); + NotesDocument notesDoc = + NotesDocument.Factory.parse(notesPart.getInputStream()); + + return notesDoc.getNotes(); + } +} diff --git a/src/ooxml/java/org/apache/poi/xslf/extractor/HXFPowerPointExtractor.java b/src/ooxml/java/org/apache/poi/xslf/extractor/HXFPowerPointExtractor.java new 
file mode 100644 index 0000000000..bfa59e32cd --- /dev/null +++ b/src/ooxml/java/org/apache/poi/xslf/extractor/HXFPowerPointExtractor.java @@ -0,0 +1,139 @@ +/* ==================================================================== + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+==================================================================== */ +package org.apache.poi.hslf.extractor; + +import java.io.File; +import java.io.IOException; + +import org.apache.poi.POIXMLTextExtractor; +import org.apache.poi.hslf.HSLFXML; +import org.apache.poi.hslf.usermodel.HSLFXMLSlideShow; +import org.apache.poi.hxf.HXFDocument; +import org.apache.xmlbeans.XmlException; +import org.openxml4j.exceptions.OpenXML4JException; +import org.openxml4j.opc.Package; +import org.openxmlformats.schemas.drawingml.x2006.main.CTRegularTextRun; +import org.openxmlformats.schemas.drawingml.x2006.main.CTTextBody; +import org.openxmlformats.schemas.drawingml.x2006.main.CTTextParagraph; +import org.openxmlformats.schemas.presentationml.x2006.main.CTGroupShape; +import org.openxmlformats.schemas.presentationml.x2006.main.CTNotesSlide; +import org.openxmlformats.schemas.presentationml.x2006.main.CTShape; +import org.openxmlformats.schemas.presentationml.x2006.main.CTSlide; +import org.openxmlformats.schemas.presentationml.x2006.main.CTSlideIdListEntry; + +public class HXFPowerPointExtractor extends POIXMLTextExtractor { + private HSLFXMLSlideShow slideshow; + private boolean slidesByDefault = true; + private boolean notesByDefault = false; + + public HXFPowerPointExtractor(Package container) throws XmlException, OpenXML4JException, IOException { + this(new HSLFXMLSlideShow( + new XSLFXML(container) + )); + } + public HXFPowerPointExtractor(HSLFXMLSlideShow slideshow) { + super(slideshow); + this.slideshow = slideshow; + } + + public static void main(String[] args) throws Exception { + if(args.length < 1) { + System.err.println("Use:"); + System.err.println(" HXFPowerPointExtractor "); + System.exit(1); + } + POIXMLTextExtractor extractor = + new HXFPowerPointExtractor(HXFDocument.openPackage( + new File(args[0]) + )); + System.out.println(extractor.getText()); + } + + /** + * Should a call to getText() return slide text? 
+ * Default is yes + */ + public void setSlidesByDefault(boolean slidesByDefault) { + this.slidesByDefault = slidesByDefault; + } + /** + * Should a call to getText() return notes text? + * Default is no + */ + public void setNotesByDefault(boolean notesByDefault) { + this.notesByDefault = notesByDefault; + } + + /** + * Gets the slide text, but not the notes text + */ + public String getText() { + return getText(slidesByDefault, notesByDefault); + } + + /** + * Gets the requested text from the file + * @param slideText Should we retrieve text from slides? + * @param notesText Should we retrieve text from notes? + */ + public String getText(boolean slideText, boolean notesText) { + StringBuffer text = new StringBuffer(); + + CTSlideIdListEntry[] slideRefs = + slideshow._getHSLFXML().getSlideReferences().getSldIdArray(); + for (int i = 0; i < slideRefs.length; i++) { + try { + CTSlide slide = + slideshow._getHSLFXML().getSlide(slideRefs[i]); + CTNotesSlide notes = + slideshow._getHSLFXML().getNotes(slideRefs[i]); + + if(slideText) { + extractText(slide.getCSld().getSpTree(), text); + } + if(notesText && notes != null) { + extractText(notes.getCSld().getSpTree(), text); + } + } catch(Exception e) { + throw new RuntimeException(e); + } + } + + return text.toString(); + } + + private void extractText(CTGroupShape gs, StringBuffer text) { + CTShape[] shapes = gs.getSpArray(); + for (int i = 0; i < shapes.length; i++) { + CTTextBody textBody = + shapes[i].getTxBody(); + if(textBody != null) { + CTTextParagraph[] paras = + textBody.getPArray(); + for (int j = 0; j < paras.length; j++) { + CTRegularTextRun[] textRuns = + paras[j].getRArray(); + for (int k = 0; k < textRuns.length; k++) { + text.append( textRuns[k].getT() ); + } + // End each paragraph with a new line + text.append("\n"); + } + } + } + } +} diff --git a/src/ooxml/java/org/apache/poi/xslf/usermodel/HSLFXMLSlideShow.java b/src/ooxml/java/org/apache/poi/xslf/usermodel/HSLFXMLSlideShow.java new file mode 100644 
index 0000000000..f4360bfff5 --- /dev/null +++ b/src/ooxml/java/org/apache/poi/xslf/usermodel/HSLFXMLSlideShow.java @@ -0,0 +1,39 @@ +/* ==================================================================== + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +==================================================================== */ +package org.apache.poi.hslf.usermodel; + +import org.apache.poi.POIXMLDocument; +import org.apache.poi.hslf.HSLFXML; + +/** + * High level representation of a ooxml slideshow. + * This is the first object most users will construct whether + * they are reading or writing a slideshow. It is also the + * top level object for creating new slides/etc. 
+ */ +public class HSLFXMLSlideShow extends POIXMLDocument { + private org.apache.poi.hslf.HSLFXML hslfXML; + + public HSLFXMLSlideShow(HSLFXML xml) { + super(xml); + this.hslfXML = xml; + } + + public HSLFXML _getHSLFXML() { + return hslfXML; + } +} diff --git a/src/ooxml/java/org/apache/poi/xwpf/HWPFXML.java b/src/ooxml/java/org/apache/poi/xwpf/HWPFXML.java new file mode 100644 index 0000000000..66bba7ee1b --- /dev/null +++ b/src/ooxml/java/org/apache/poi/xwpf/HWPFXML.java @@ -0,0 +1,92 @@ +/* ==================================================================== + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+==================================================================== */ +package org.apache.poi.hwpf; + +import java.io.IOException; + +import org.apache.poi.hxf.HXFDocument; +import org.apache.xmlbeans.XmlException; +import org.openxml4j.exceptions.InvalidFormatException; +import org.openxml4j.exceptions.OpenXML4JException; +import org.openxml4j.opc.Package; +import org.openxml4j.opc.PackagePart; +import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTBody; +import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTDocument1; +import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTStyles; +import org.openxmlformats.schemas.wordprocessingml.x2006.main.DocumentDocument; +import org.openxmlformats.schemas.wordprocessingml.x2006.main.StylesDocument; + +/** + * Experimental class to do low level processing + * of docx files. + * + * If you are using these low level classes, then you + * will almost certainly need to refer to the OOXML + * specifications from + * http://www.ecma-international.org/publications/standards/Ecma-376.htm + * + * WARNING - APIs expected to change rapidly + */ +public class HWPFXML extends HXFDocument { + public static final String MAIN_CONTENT_TYPE = "application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml"; + public static final String FOOTER_CONTENT_TYPE = "application/vnd.openxmlformats-officedocument.wordprocessingml.footer+xml"; + public static final String HEADER_CONTENT_TYPE = "application/vnd.openxmlformats-officedocument.wordprocessingml.header+xml"; + public static final String STYLES_CONTENT_TYPE = "application/vnd.openxmlformats-officedocument.wordprocessingml.styles+xml"; + public static final String STYLES_RELATION_TYPE = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/styles"; + + private DocumentDocument wordDoc; + + public HWPFXML(Package container) throws OpenXML4JException, IOException, XmlException { + super(container, MAIN_CONTENT_TYPE); + + wordDoc = + 
DocumentDocument.Factory.parse(basePart.getInputStream()); + } + + /** + * Returns the low level document base object + */ + public CTDocument1 getDocument() { + return wordDoc.getDocument(); + } + + /** + * Returns the low level body of the document + */ + public CTBody getDocumentBody() { + return getDocument().getBody(); + } + + /** + * Returns the styles object used + */ + public CTStyles getStyle() throws XmlException, IOException { + PackagePart[] parts; + try { + parts = getRelatedByType(STYLES_RELATION_TYPE); + } catch(InvalidFormatException e) { + throw new IllegalStateException(e); + } + if(parts.length != 1) { + throw new IllegalStateException("Expecting one Styles document part, but found " + parts.length); + } + + StylesDocument sd = + StylesDocument.Factory.parse(parts[0].getInputStream()); + return sd.getStyles(); + } +} diff --git a/src/ooxml/java/org/apache/poi/xwpf/extractor/HXFWordExtractor.java b/src/ooxml/java/org/apache/poi/xwpf/extractor/HXFWordExtractor.java new file mode 100644 index 0000000000..a4427e49ec --- /dev/null +++ b/src/ooxml/java/org/apache/poi/xwpf/extractor/HXFWordExtractor.java @@ -0,0 +1,87 @@ +/* ==================================================================== + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+==================================================================== */ +package org.apache.poi.hwpf.extractor; + +import java.io.File; +import java.io.IOException; + +import org.apache.poi.POIXMLTextExtractor; +import org.apache.poi.hwpf.HWPFXML; +import org.apache.poi.hwpf.usermodel.HWPFXMLDocument; +import org.apache.poi.hxf.HXFDocument; +import org.apache.xmlbeans.XmlException; +import org.openxml4j.exceptions.OpenXML4JException; +import org.openxml4j.opc.Package; +import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTBody; +import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTP; +import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTR; +import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTText; + +/** + * Helper class to extract text from an OOXML Word file + */ +public class HXFWordExtractor extends POIXMLTextExtractor { + private HWPFXMLDocument document; + + public HXFWordExtractor(Package container) throws XmlException, OpenXML4JException, IOException { + this(new HWPFXMLDocument( + new HWPFXML(container) + )); + } + public HXFWordExtractor(HWPFXMLDocument document) { + super(document); + this.document = document; + } + + public static void main(String[] args) throws Exception { + if(args.length < 1) { + System.err.println("Use:"); + System.err.println(" HXFWordExtractor "); + System.exit(1); + } + POIXMLTextExtractor extractor = + new HXFWordExtractor(HXFDocument.openPackage( + new File(args[0]) + )); + System.out.println(extractor.getText()); + } + + public String getText() { + CTBody body = document._getHWPFXML().getDocumentBody(); + StringBuffer text = new StringBuffer(); + + // Loop over paragraphs + CTP[] ps = body.getPArray(); + for (int i = 0; i < ps.length; i++) { + // Loop over ranges + CTR[] rs = ps[i].getRArray(); + for (int j = 0; j < rs.length; j++) { + // Loop over text runs + CTText[] texts = rs[j].getTArray(); + for (int k = 0; k < texts.length; k++) { + text.append( + texts[k].getStringValue() + 
); + } + } + // New line after each paragraph. + text.append("\n"); + } + + return text.toString(); + } +} diff --git a/src/ooxml/java/org/apache/poi/xwpf/usermodel/HWPFXMLDocument.java b/src/ooxml/java/org/apache/poi/xwpf/usermodel/HWPFXMLDocument.java new file mode 100644 index 0000000000..64597e83dc --- /dev/null +++ b/src/ooxml/java/org/apache/poi/xwpf/usermodel/HWPFXMLDocument.java @@ -0,0 +1,36 @@ +/* ==================================================================== + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +==================================================================== */ +package org.apache.poi.hwpf.usermodel; + +import org.apache.poi.POIXMLDocument; +import org.apache.poi.hwpf.HWPFXML; + +/** + * High level representation of a ooxml text document. 
+ */ +public class HWPFXMLDocument extends POIXMLDocument { + private HWPFXML hwpfXML; + + public HWPFXMLDocument(HWPFXML xml) { + super(xml); + this.hwpfXML = xml; + } + + public HWPFXML _getHWPFXML() { + return hwpfXML; + } +} diff --git a/src/ooxml/testcases/org/apache/poi/hslf/TestHSLFXML.java b/src/ooxml/testcases/org/apache/poi/hslf/TestHSLFXML.java deleted file mode 100644 index fd4653a854..0000000000 --- a/src/ooxml/testcases/org/apache/poi/hslf/TestHSLFXML.java +++ /dev/null @@ -1,127 +0,0 @@ -/* ==================================================================== - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-==================================================================== */ -package org.apache.poi.hslf; - -import java.io.File; - -import org.apache.poi.hxf.HXFDocument; -import org.openxml4j.opc.Package; -import org.openxml4j.opc.PackagePart; -import org.openxmlformats.schemas.presentationml.x2006.main.CTSlideIdListEntry; -import org.openxmlformats.schemas.presentationml.x2006.main.CTSlideMasterIdListEntry; - -import junit.framework.TestCase; - -public class TestHSLFXML extends TestCase { - private File sampleFile; - - protected void setUp() throws Exception { - super.setUp(); - - sampleFile = new File( - System.getProperty("HSLF.testdata.path") + - File.separator + "sample.pptx" - ); - } - - public void testContainsMainContentType() throws Exception { - Package pack = HXFDocument.openPackage(sampleFile); - - boolean found = false; - for(PackagePart part : pack.getParts()) { - if(part.getContentType().equals(HSLFXML.MAIN_CONTENT_TYPE)) { - found = true; - } - System.out.println(part); - } - assertTrue(found); - } - - public void testOpen() throws Exception { - HXFDocument.openPackage(sampleFile); - - HSLFXML xml; - - // With the finalised uri, should be fine - xml = new HSLFXML( - HXFDocument.openPackage(sampleFile) - ); - - // Check the core - assertNotNull(xml.getPresentation()); - - // Check it has some slides - assertTrue( - xml.getSlideReferences().sizeOfSldIdArray() > 0 - ); - assertTrue( - xml.getSlideMasterReferences().sizeOfSldMasterIdArray() > 0 - ); - } - - public void testSlideBasics() throws Exception { - HSLFXML xml = new HSLFXML( - HXFDocument.openPackage(sampleFile) - ); - - // Should have 1 master - assertEquals(1, xml.getSlideMasterReferences().sizeOfSldMasterIdArray()); - assertEquals(1, xml.getSlideMasterReferences().getSldMasterIdArray().length); - - // Should have three sheets - assertEquals(2, xml.getSlideReferences().sizeOfSldIdArray()); - assertEquals(2, xml.getSlideReferences().getSldIdArray().length); - - // Check they're as expected - 
CTSlideIdListEntry[] slides = xml.getSlideReferences().getSldIdArray(); - assertEquals(256, slides[0].getId()); - assertEquals(257, slides[1].getId()); - assertEquals("rId2", slides[0].getId2()); - assertEquals("rId3", slides[1].getId2()); - - // Now get those objects - assertNotNull(xml.getSlide(slides[0])); - assertNotNull(xml.getSlide(slides[1])); - - // And check they have notes as expected - assertNotNull(xml.getNotes(slides[0])); - assertNotNull(xml.getNotes(slides[1])); - - // And again for the master - CTSlideMasterIdListEntry[] masters = - xml.getSlideMasterReferences().getSldMasterIdArray(); - assertEquals(2147483648l, masters[0].getId()); - assertEquals("rId1", masters[0].getId2()); - assertNotNull(xml.getSlideMaster(masters[0])); - } - - public void testMetadataBasics() throws Exception { - HSLFXML xml = new HSLFXML( - HXFDocument.openPackage(sampleFile) - ); - - assertNotNull(xml.getCoreProperties()); - assertNotNull(xml.getExtendedProperties()); - - assertEquals("Microsoft Office PowerPoint", xml.getExtendedProperties().getApplication()); - assertEquals(0, xml.getExtendedProperties().getCharacters()); - assertEquals(0, xml.getExtendedProperties().getLines()); - - assertEquals(null, xml.getCoreProperties().getTitleProperty().getValue()); - assertEquals(null, xml.getCoreProperties().getSubjectProperty().getValue()); - } -} diff --git a/src/ooxml/testcases/org/apache/poi/hslf/extractor/TestHXFPowerPointExtractor.java b/src/ooxml/testcases/org/apache/poi/hslf/extractor/TestHXFPowerPointExtractor.java deleted file mode 100644 index 6a006ab5c8..0000000000 --- a/src/ooxml/testcases/org/apache/poi/hslf/extractor/TestHXFPowerPointExtractor.java +++ /dev/null @@ -1,109 +0,0 @@ -/* ==================================================================== - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. 
- The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -==================================================================== */ -package org.apache.poi.hslf.extractor; - -import java.io.File; - -import org.apache.poi.hslf.HSLFXML; -import org.apache.poi.hslf.usermodel.HSLFXMLSlideShow; -import org.apache.poi.hxf.HXFDocument; - -import junit.framework.TestCase; - -/** - * Tests for HXFPowerPointExtractor - */ -public class TestHXFPowerPointExtractor extends TestCase { - /** - * A simple file - */ - private HSLFXML xmlA; - - protected void setUp() throws Exception { - super.setUp(); - - File fileA = new File( - System.getProperty("HSLF.testdata.path") + - File.separator + "sample.pptx" - ); - - xmlA = new HSLFXML(HXFDocument.openPackage(fileA)); - } - - /** - * Get text out of the simple file - */ - public void testGetSimpleText() throws Exception { - new HXFPowerPointExtractor(xmlA.getPackage()); - new HXFPowerPointExtractor(new HSLFXMLSlideShow(xmlA)); - - HXFPowerPointExtractor extractor = - new HXFPowerPointExtractor(xmlA.getPackage()); - extractor.getText(); - - String text = extractor.getText(); - assertTrue(text.length() > 0); - - // Check Basics - assertTrue(text.startsWith("Lorem ipsum dolor sit amet\n")); - assertTrue(text.endsWith("amet\n\n")); - - // Just slides, no notes - text = extractor.getText(true, false); - assertEquals( - "Lorem ipsum dolor sit amet\n" + - "Nunc at risus vel erat tempus posuere. 
Aenean non ante.\n" + - "\n" + - "Lorem ipsum dolor sit amet\n" + - "Lorem\n" + - "ipsum\n" + - "dolor\n" + - "sit\n" + - "amet\n" + - "\n", text - ); - - // Just notes, no slides - text = extractor.getText(false, true); - assertEquals( - "\n\n\n\n", text - ); - - // Both - text = extractor.getText(true, true); - assertEquals( - "Lorem ipsum dolor sit amet\n" + - "Nunc at risus vel erat tempus posuere. Aenean non ante.\n" + - "\n\n\n" + - "Lorem ipsum dolor sit amet\n" + - "Lorem\n" + - "ipsum\n" + - "dolor\n" + - "sit\n" + - "amet\n" + - "\n\n\n", text - ); - - // Via set defaults - extractor.setSlidesByDefault(false); - extractor.setNotesByDefault(true); - text = extractor.getText(); - assertEquals( - "\n\n\n\n", text - ); - } -} diff --git a/src/ooxml/testcases/org/apache/poi/hwpf/TestHWPFXML.java b/src/ooxml/testcases/org/apache/poi/hwpf/TestHWPFXML.java deleted file mode 100644 index 0d8e196f43..0000000000 --- a/src/ooxml/testcases/org/apache/poi/hwpf/TestHWPFXML.java +++ /dev/null @@ -1,110 +0,0 @@ -/* ==================================================================== - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-==================================================================== */ -package org.apache.poi.hwpf; - -import java.io.File; - -import org.apache.poi.hxf.HXFDocument; -import org.openxml4j.opc.Package; -import org.openxml4j.opc.PackagePart; - -import junit.framework.TestCase; - -public class TestHWPFXML extends TestCase { - private File sampleFile; - private File complexFile; - - protected void setUp() throws Exception { - super.setUp(); - - sampleFile = new File( - System.getProperty("HWPF.testdata.path") + - File.separator + "sample.docx" - ); - complexFile = new File( - System.getProperty("HWPF.testdata.path") + - File.separator + "IllustrativeCases.docx" - ); - } - - public void testContainsMainContentType() throws Exception { - Package pack = HXFDocument.openPackage(sampleFile); - - boolean found = false; - for(PackagePart part : pack.getParts()) { - if(part.getContentType().equals(HWPFXML.MAIN_CONTENT_TYPE)) { - found = true; - } - System.out.println(part); - } - assertTrue(found); - } - - public void testOpen() throws Exception { - HXFDocument.openPackage(sampleFile); - HXFDocument.openPackage(complexFile); - - HWPFXML xml; - - // Simple file - xml = new HWPFXML( - HXFDocument.openPackage(sampleFile) - ); - // Check it has key parts - assertNotNull(xml.getDocument()); - assertNotNull(xml.getDocumentBody()); - assertNotNull(xml.getStyle()); - - // Complex file - xml = new HWPFXML( - HXFDocument.openPackage(complexFile) - ); - assertNotNull(xml.getDocument()); - assertNotNull(xml.getDocumentBody()); - assertNotNull(xml.getStyle()); - } - - public void testMetadataBasics() throws Exception { - HWPFXML xml = new HWPFXML( - HXFDocument.openPackage(sampleFile) - ); - assertNotNull(xml.getCoreProperties()); - assertNotNull(xml.getExtendedProperties()); - - assertEquals("Microsoft Office Word", xml.getExtendedProperties().getApplication()); - assertEquals(1315, xml.getExtendedProperties().getCharacters()); - assertEquals(10, 
xml.getExtendedProperties().getLines()); - - assertEquals(null, xml.getCoreProperties().getTitleProperty().getValue()); - assertEquals(null, xml.getCoreProperties().getSubjectProperty().getValue()); - } - - public void testMetadataComplex() throws Exception { - HWPFXML xml = new HWPFXML( - HXFDocument.openPackage(complexFile) - ); - assertNotNull(xml.getCoreProperties()); - assertNotNull(xml.getExtendedProperties()); - - assertEquals("Microsoft Office Outlook", xml.getExtendedProperties().getApplication()); - assertEquals(5184, xml.getExtendedProperties().getCharacters()); - assertEquals(0, xml.getExtendedProperties().getLines()); - - assertEquals(" ", xml.getCoreProperties().getTitleProperty().getValue()); - assertEquals(" ", xml.getCoreProperties().getSubjectProperty().getValue()); - } -} diff --git a/src/ooxml/testcases/org/apache/poi/hwpf/extractor/TestHXFWordExtractor.java b/src/ooxml/testcases/org/apache/poi/hwpf/extractor/TestHXFWordExtractor.java deleted file mode 100644 index 62695b3a8c..0000000000 --- a/src/ooxml/testcases/org/apache/poi/hwpf/extractor/TestHXFWordExtractor.java +++ /dev/null @@ -1,117 +0,0 @@ -/* ==================================================================== - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-==================================================================== */ -package org.apache.poi.hwpf.extractor; - -import java.io.File; - -import org.apache.poi.hwpf.HWPFXML; -import org.apache.poi.hwpf.usermodel.HWPFXMLDocument; -import org.apache.poi.hxf.HXFDocument; - -import junit.framework.TestCase; - -/** - * Tests for HXFWordExtractor - */ -public class TestHXFWordExtractor extends TestCase { - /** - * A very simple file - */ - private HWPFXML xmlA; - /** - * A fairly complex file - */ - private HWPFXML xmlB; - - protected void setUp() throws Exception { - super.setUp(); - - File fileA = new File( - System.getProperty("HWPF.testdata.path") + - File.separator + "sample.docx" - ); - File fileB = new File( - System.getProperty("HWPF.testdata.path") + - File.separator + "IllustrativeCases.docx" - ); - - xmlA = new HWPFXML(HXFDocument.openPackage(fileA)); - xmlB = new HWPFXML(HXFDocument.openPackage(fileB)); - } - - /** - * Get text out of the simple file - */ - public void testGetSimpleText() throws Exception { - new HXFWordExtractor(xmlA.getPackage()); - new HXFWordExtractor(new HWPFXMLDocument(xmlA)); - - HXFWordExtractor extractor = - new HXFWordExtractor(xmlA.getPackage()); - extractor.getText(); - - String text = extractor.getText(); - assertTrue(text.length() > 0); - - // Check contents - assertTrue(text.startsWith( - "Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Nunc at risus vel erat tempus posuere. Aenean non ante. Suspendisse vehicula dolor sit amet odio." - )); - assertTrue(text.endsWith( - "Phasellus ultricies mi nec leo. Sed tempus. 
In sit amet lorem at velit faucibus vestibulum.\n" - )); - - // Check number of paragraphs - int ps = 0; - char[] t = text.toCharArray(); - for (int i = 0; i < t.length; i++) { - if(t[i] == '\n') { ps++; } - } - assertEquals(3, ps); - } - - /** - * Tests getting the text out of a complex file - */ - public void testGetComplexText() throws Exception { - HXFWordExtractor extractor = - new HXFWordExtractor(xmlB.getPackage()); - extractor.getText(); - - String text = extractor.getText(); - assertTrue(text.length() > 0); - - char euro = '\u20ac'; - System.err.println("'"+text.substring(text.length() - 20) + "'"); - - // Check contents - assertTrue(text.startsWith( - " \n(V) ILLUSTRATIVE CASES\n\n" - )); - assertTrue(text.endsWith( - "As well as gaining "+euro+"90 from child benefit increases, he will also receive the early childhood supplement of "+euro+"250 per quarter for Vincent for the full four quarters of the year.\n\n\n\n \n\n\n" - )); - - // Check number of paragraphs - int ps = 0; - char[] t = text.toCharArray(); - for (int i = 0; i < t.length; i++) { - if(t[i] == '\n') { ps++; } - } - assertEquals(79, ps); - } -} diff --git a/src/ooxml/testcases/org/apache/poi/xslf/TestHSLFXML.java b/src/ooxml/testcases/org/apache/poi/xslf/TestHSLFXML.java new file mode 100644 index 0000000000..fd4653a854 --- /dev/null +++ b/src/ooxml/testcases/org/apache/poi/xslf/TestHSLFXML.java @@ -0,0 +1,127 @@ +/* ==================================================================== + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. 
You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +==================================================================== */ +package org.apache.poi.hslf; + +import java.io.File; + +import org.apache.poi.hxf.HXFDocument; +import org.openxml4j.opc.Package; +import org.openxml4j.opc.PackagePart; +import org.openxmlformats.schemas.presentationml.x2006.main.CTSlideIdListEntry; +import org.openxmlformats.schemas.presentationml.x2006.main.CTSlideMasterIdListEntry; + +import junit.framework.TestCase; + +public class TestHSLFXML extends TestCase { + private File sampleFile; + + protected void setUp() throws Exception { + super.setUp(); + + sampleFile = new File( + System.getProperty("HSLF.testdata.path") + + File.separator + "sample.pptx" + ); + } + + public void testContainsMainContentType() throws Exception { + Package pack = HXFDocument.openPackage(sampleFile); + + boolean found = false; + for(PackagePart part : pack.getParts()) { + if(part.getContentType().equals(HSLFXML.MAIN_CONTENT_TYPE)) { + found = true; + } + System.out.println(part); + } + assertTrue(found); + } + + public void testOpen() throws Exception { + HXFDocument.openPackage(sampleFile); + + HSLFXML xml; + + // With the finalised uri, should be fine + xml = new HSLFXML( + HXFDocument.openPackage(sampleFile) + ); + + // Check the core + assertNotNull(xml.getPresentation()); + + // Check it has some slides + assertTrue( + xml.getSlideReferences().sizeOfSldIdArray() > 0 + ); + assertTrue( + xml.getSlideMasterReferences().sizeOfSldMasterIdArray() > 0 + ); + } + + public void testSlideBasics() throws Exception { + HSLFXML xml = new HSLFXML( + 
HXFDocument.openPackage(sampleFile) + ); + + // Should have 1 master + assertEquals(1, xml.getSlideMasterReferences().sizeOfSldMasterIdArray()); + assertEquals(1, xml.getSlideMasterReferences().getSldMasterIdArray().length); + + // Should have three sheets + assertEquals(2, xml.getSlideReferences().sizeOfSldIdArray()); + assertEquals(2, xml.getSlideReferences().getSldIdArray().length); + + // Check they're as expected + CTSlideIdListEntry[] slides = xml.getSlideReferences().getSldIdArray(); + assertEquals(256, slides[0].getId()); + assertEquals(257, slides[1].getId()); + assertEquals("rId2", slides[0].getId2()); + assertEquals("rId3", slides[1].getId2()); + + // Now get those objects + assertNotNull(xml.getSlide(slides[0])); + assertNotNull(xml.getSlide(slides[1])); + + // And check they have notes as expected + assertNotNull(xml.getNotes(slides[0])); + assertNotNull(xml.getNotes(slides[1])); + + // And again for the master + CTSlideMasterIdListEntry[] masters = + xml.getSlideMasterReferences().getSldMasterIdArray(); + assertEquals(2147483648l, masters[0].getId()); + assertEquals("rId1", masters[0].getId2()); + assertNotNull(xml.getSlideMaster(masters[0])); + } + + public void testMetadataBasics() throws Exception { + HSLFXML xml = new HSLFXML( + HXFDocument.openPackage(sampleFile) + ); + + assertNotNull(xml.getCoreProperties()); + assertNotNull(xml.getExtendedProperties()); + + assertEquals("Microsoft Office PowerPoint", xml.getExtendedProperties().getApplication()); + assertEquals(0, xml.getExtendedProperties().getCharacters()); + assertEquals(0, xml.getExtendedProperties().getLines()); + + assertEquals(null, xml.getCoreProperties().getTitleProperty().getValue()); + assertEquals(null, xml.getCoreProperties().getSubjectProperty().getValue()); + } +} diff --git a/src/ooxml/testcases/org/apache/poi/xslf/extractor/TestHXFPowerPointExtractor.java b/src/ooxml/testcases/org/apache/poi/xslf/extractor/TestHXFPowerPointExtractor.java new file mode 100644 index 
0000000000..6a006ab5c8 --- /dev/null +++ b/src/ooxml/testcases/org/apache/poi/xslf/extractor/TestHXFPowerPointExtractor.java @@ -0,0 +1,109 @@ +/* ==================================================================== + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +==================================================================== */ +package org.apache.poi.hslf.extractor; + +import java.io.File; + +import org.apache.poi.hslf.HSLFXML; +import org.apache.poi.hslf.usermodel.HSLFXMLSlideShow; +import org.apache.poi.hxf.HXFDocument; + +import junit.framework.TestCase; + +/** + * Tests for HXFPowerPointExtractor, run against the sample.pptx test file + */ +public class TestHXFPowerPointExtractor extends TestCase { + /** + * A simple file (sample.pptx), opened in setUp + */ + private HSLFXML xmlA; + + protected void setUp() throws Exception { + super.setUp(); + + File fileA = new File( + System.getProperty("HSLF.testdata.path") + + File.separator + "sample.pptx" + ); + + xmlA = new HSLFXML(HXFDocument.openPackage(fileA)); + } + + /** + * Get text out of the simple file, after constructing the extractor from both a package and a slideshow: by default, slides only, notes only, both, and via changed defaults + */ + public void testGetSimpleText() throws Exception { + new HXFPowerPointExtractor(xmlA.getPackage()); + new HXFPowerPointExtractor(new HSLFXMLSlideShow(xmlA)); + + HXFPowerPointExtractor extractor = + new HXFPowerPointExtractor(xmlA.getPackage()); +
extractor.getText(); + + String text = extractor.getText(); + assertTrue(text.length() > 0); + + // Check basics: how the default text starts and ends + assertTrue(text.startsWith("Lorem ipsum dolor sit amet\n")); + assertTrue(text.endsWith("amet\n\n")); + + // Just slides, no notes + text = extractor.getText(true, false); + assertEquals( + "Lorem ipsum dolor sit amet\n" + + "Nunc at risus vel erat tempus posuere. Aenean non ante.\n" + + "\n" + + "Lorem ipsum dolor sit amet\n" + + "Lorem\n" + + "ipsum\n" + + "dolor\n" + + "sit\n" + + "amet\n" + + "\n", text + ); + + // Just notes, no slides - only newlines are expected for this file + text = extractor.getText(false, true); + assertEquals( + "\n\n\n\n", text + ); + + // Both slides and notes + text = extractor.getText(true, true); + assertEquals( + "Lorem ipsum dolor sit amet\n" + + "Nunc at risus vel erat tempus posuere. Aenean non ante.\n" + + "\n\n\n" + + "Lorem ipsum dolor sit amet\n" + + "Lorem\n" + + "ipsum\n" + + "dolor\n" + + "sit\n" + + "amet\n" + + "\n\n\n", text + ); + + // Via set defaults, which the no-argument getText() is expected to follow + extractor.setSlidesByDefault(false); + extractor.setNotesByDefault(true); + text = extractor.getText(); + assertEquals( + "\n\n\n\n", text + ); + } +} diff --git a/src/ooxml/testcases/org/apache/poi/xwpf/TestHWPFXML.java b/src/ooxml/testcases/org/apache/poi/xwpf/TestHWPFXML.java new file mode 100644 index 0000000000..0d8e196f43 --- /dev/null +++ b/src/ooxml/testcases/org/apache/poi/xwpf/TestHWPFXML.java @@ -0,0 +1,110 @@ +/* ==================================================================== + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License.
You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +==================================================================== */ +package org.apache.poi.hwpf; + +import java.io.File; + +import org.apache.poi.hxf.HXFDocument; +import org.openxml4j.opc.Package; +import org.openxml4j.opc.PackagePart; + +import junit.framework.TestCase; + +public class TestHWPFXML extends TestCase { + private File sampleFile; + private File complexFile; + + protected void setUp() throws Exception { + super.setUp(); + + sampleFile = new File( + System.getProperty("HWPF.testdata.path") + + File.separator + "sample.docx" + ); + complexFile = new File( + System.getProperty("HWPF.testdata.path") + + File.separator + "IllustrativeCases.docx" + ); + } + + public void testContainsMainContentType() throws Exception { + Package pack = HXFDocument.openPackage(sampleFile); + + boolean found = false; + for(PackagePart part : pack.getParts()) { + if(part.getContentType().equals(HWPFXML.MAIN_CONTENT_TYPE)) { + found = true; + } + System.out.println(part); // debug output: prints every part of the package + } + assertTrue(found); + } + + public void testOpen() throws Exception { + HXFDocument.openPackage(sampleFile); + HXFDocument.openPackage(complexFile); + + HWPFXML xml; + + // Simple file (sample.docx) + xml = new HWPFXML( + HXFDocument.openPackage(sampleFile) + ); + // Check it has key parts: document, document body and style + assertNotNull(xml.getDocument()); + assertNotNull(xml.getDocumentBody()); + assertNotNull(xml.getStyle()); + + // Complex file (IllustrativeCases.docx), checked for the same key parts + xml = new HWPFXML( + HXFDocument.openPackage(complexFile) + ); + assertNotNull(xml.getDocument()); + assertNotNull(xml.getDocumentBody()); + assertNotNull(xml.getStyle()); + } + + public void
testMetadataBasics() throws Exception { + HWPFXML xml = new HWPFXML( + HXFDocument.openPackage(sampleFile) + ); + assertNotNull(xml.getCoreProperties()); + assertNotNull(xml.getExtendedProperties()); + + assertEquals("Microsoft Office Word", xml.getExtendedProperties().getApplication()); + assertEquals(1315, xml.getExtendedProperties().getCharacters()); + assertEquals(10, xml.getExtendedProperties().getLines()); + + assertEquals(null, xml.getCoreProperties().getTitleProperty().getValue()); + assertEquals(null, xml.getCoreProperties().getSubjectProperty().getValue()); + } + + public void testMetadataComplex() throws Exception { + HWPFXML xml = new HWPFXML( + HXFDocument.openPackage(complexFile) + ); + assertNotNull(xml.getCoreProperties()); + assertNotNull(xml.getExtendedProperties()); + + assertEquals("Microsoft Office Outlook", xml.getExtendedProperties().getApplication()); + assertEquals(5184, xml.getExtendedProperties().getCharacters()); + assertEquals(0, xml.getExtendedProperties().getLines()); + + assertEquals(" ", xml.getCoreProperties().getTitleProperty().getValue()); + assertEquals(" ", xml.getCoreProperties().getSubjectProperty().getValue()); + } +} diff --git a/src/ooxml/testcases/org/apache/poi/xwpf/extractor/TestHXFWordExtractor.java b/src/ooxml/testcases/org/apache/poi/xwpf/extractor/TestHXFWordExtractor.java new file mode 100644 index 0000000000..62695b3a8c --- /dev/null +++ b/src/ooxml/testcases/org/apache/poi/xwpf/extractor/TestHXFWordExtractor.java @@ -0,0 +1,117 @@ +/* ==================================================================== + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License.
You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +==================================================================== */ +package org.apache.poi.hwpf.extractor; + +import java.io.File; + +import org.apache.poi.hwpf.HWPFXML; +import org.apache.poi.hwpf.usermodel.HWPFXMLDocument; +import org.apache.poi.hxf.HXFDocument; + +import junit.framework.TestCase; + +/** + * Tests for HXFWordExtractor, run against two sample .docx files + */ +public class TestHXFWordExtractor extends TestCase { + /** + * A very simple file (sample.docx) + */ + private HWPFXML xmlA; + /** + * A fairly complex file (IllustrativeCases.docx) + */ + private HWPFXML xmlB; + + protected void setUp() throws Exception { + super.setUp(); + + File fileA = new File( + System.getProperty("HWPF.testdata.path") + + File.separator + "sample.docx" + ); + File fileB = new File( + System.getProperty("HWPF.testdata.path") + + File.separator + "IllustrativeCases.docx" + ); + + xmlA = new HWPFXML(HXFDocument.openPackage(fileA)); + xmlB = new HWPFXML(HXFDocument.openPackage(fileB)); + } + + /** + * Get text out of the simple file, and check how it starts, how it ends and how many newlines it has + */ + public void testGetSimpleText() throws Exception { + new HXFWordExtractor(xmlA.getPackage()); + new HXFWordExtractor(new HWPFXMLDocument(xmlA)); + + HXFWordExtractor extractor = + new HXFWordExtractor(xmlA.getPackage()); + extractor.getText(); + + String text = extractor.getText(); + assertTrue(text.length() > 0); + + // Check contents: how the text starts and ends + assertTrue(text.startsWith( + "Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Nunc at risus vel erat tempus posuere. Aenean non ante. Suspendisse vehicula dolor sit amet odio." + )); + assertTrue(text.endsWith( + "Phasellus ultricies mi nec leo. Sed tempus.
In sit amet lorem at velit faucibus vestibulum.\n" + )); + + // Check number of paragraphs, counted as newline characters in the text + int ps = 0; + char[] t = text.toCharArray(); + for (int i = 0; i < t.length; i++) { + if(t[i] == '\n') { ps++; } + } + assertEquals(3, ps); + } + + /** + * Tests getting the text out of a complex file (xmlB) + */ + public void testGetComplexText() throws Exception { + HXFWordExtractor extractor = + new HXFWordExtractor(xmlB.getPackage()); + extractor.getText(); + + String text = extractor.getText(); + assertTrue(text.length() > 0); + + char euro = '\u20ac'; + System.err.println("'"+text.substring(text.length() - 20) + "'"); // debug output: the last 20 characters of the text + + // Check contents + assertTrue(text.startsWith( + " \n(V) ILLUSTRATIVE CASES\n\n" + )); + assertTrue(text.endsWith( + "As well as gaining "+euro+"90 from child benefit increases, he will also receive the early childhood supplement of "+euro+"250 per quarter for Vincent for the full four quarters of the year.\n\n\n\n \n\n\n" + )); + + // Check number of paragraphs, counted as newline characters in the text + int ps = 0; + char[] t = text.toCharArray(); + for (int i = 0; i < t.length; i++) { + if(t[i] == '\n') { ps++; } + } + assertEquals(79, ps); + } +}