+++ /dev/null
-/* ====================================================================
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-==================================================================== */
-package org.apache.poi.hslf;
-
-import java.io.IOException;
-
-import org.apache.poi.hxf.HXFDocument;
-import org.apache.xmlbeans.XmlException;
-import org.openxml4j.exceptions.InvalidFormatException;
-import org.openxml4j.exceptions.OpenXML4JException;
-import org.openxml4j.opc.Package;
-import org.openxml4j.opc.PackagePart;
-import org.openxml4j.opc.PackageRelationshipCollection;
-import org.openxmlformats.schemas.presentationml.x2006.main.CTNotesSlide;
-import org.openxmlformats.schemas.presentationml.x2006.main.CTPresentation;
-import org.openxmlformats.schemas.presentationml.x2006.main.CTSlide;
-import org.openxmlformats.schemas.presentationml.x2006.main.CTSlideIdList;
-import org.openxmlformats.schemas.presentationml.x2006.main.CTSlideIdListEntry;
-import org.openxmlformats.schemas.presentationml.x2006.main.CTSlideMaster;
-import org.openxmlformats.schemas.presentationml.x2006.main.CTSlideMasterIdList;
-import org.openxmlformats.schemas.presentationml.x2006.main.CTSlideMasterIdListEntry;
-import org.openxmlformats.schemas.presentationml.x2006.main.NotesDocument;
-import org.openxmlformats.schemas.presentationml.x2006.main.PresentationDocument;
-import org.openxmlformats.schemas.presentationml.x2006.main.SldDocument;
-import org.openxmlformats.schemas.presentationml.x2006.main.SldMasterDocument;
-
-/**
- * Experimental class to do low level processing
- * of pptx files.
- *
- * If you are using these low level classes, then you
- * will almost certainly need to refer to the OOXML
- * specifications from
- * http://www.ecma-international.org/publications/standards/Ecma-376.htm
- *
- * WARNING - APIs expected to change rapidly
- */
-public class HSLFXML extends HXFDocument {
- public static final String MAIN_CONTENT_TYPE = "application/vnd.openxmlformats-officedocument.presentationml.presentation.main+xml";
- public static final String NOTES_CONTENT_TYPE = "application/vnd.openxmlformats-officedocument.presentationml.notesSlide+xml";
- public static final String SLIDE_CONTENT_TYPE = "application/vnd.openxmlformats-officedocument.presentationml.slide+xml";
- public static final String SLIDE_LAYOUT_RELATION_TYPE = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/slideLayout";
- public static final String NOTES_RELATION_TYPE = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/notesSlide";
-
- private PresentationDocument presentationDoc;
-
- public HSLFXML(Package container) throws OpenXML4JException, IOException, XmlException {
- super(container, MAIN_CONTENT_TYPE);
-
- presentationDoc =
- PresentationDocument.Factory.parse(basePart.getInputStream());
- }
-
- /**
- * Returns the low level presentation base object
- */
- public CTPresentation getPresentation() {
- return presentationDoc.getPresentation();
- }
-
- /**
- * Returns the references from the presentation to its
- * slides.
- * You'll need these to figure out the slide ordering,
- * and to get at the actual slides themselves
- */
- public CTSlideIdList getSlideReferences() {
- return getPresentation().getSldIdLst();
- }
- /**
- * Returns the references from the presentation to its
- * slide masters.
- * You'll need these to get at the actual slide
- * masters themselves
- */
- public CTSlideMasterIdList getSlideMasterReferences() {
- return getPresentation().getSldMasterIdLst();
- }
-
- /**
- * Returns the low level slide master object from
- * the supplied slide master reference
- */
- public CTSlideMaster getSlideMaster(CTSlideMasterIdListEntry master) throws IOException, XmlException {
- PackagePart masterPart =
- getRelatedPackagePart(master.getId2());
- SldMasterDocument masterDoc =
- SldMasterDocument.Factory.parse(masterPart.getInputStream());
- return masterDoc.getSldMaster();
- }
-
- /**
- * Returns the low level slide object from
- * the supplied slide reference
- */
- public CTSlide getSlide(CTSlideIdListEntry slide) throws IOException, XmlException {
- PackagePart slidePart =
- getRelatedPackagePart(slide.getId2());
- SldDocument slideDoc =
- SldDocument.Factory.parse(slidePart.getInputStream());
- return slideDoc.getSld();
- }
-
- /**
- * Returns the low level notes object for the given
- * slide, as found from the supplied slide reference
- */
- public CTNotesSlide getNotes(CTSlideIdListEntry slide) throws IOException, XmlException {
- PackagePart slidePart =
- getRelatedPackagePart(slide.getId2());
-
- PackageRelationshipCollection notes;
- try {
- notes = slidePart.getRelationshipsByType(NOTES_RELATION_TYPE);
- } catch(InvalidFormatException e) {
- throw new IllegalStateException(e);
- }
-
- if(notes.size() == 0) {
- // No notes for this slide
- return null;
- }
- if(notes.size() > 1) {
- throw new IllegalStateException("Expecting 0 or 1 notes for a slide, but found " + notes.size());
- }
-
- PackagePart notesPart =
- getPackagePart(notes.getRelationship(0));
- NotesDocument notesDoc =
- NotesDocument.Factory.parse(notesPart.getInputStream());
-
- return notesDoc.getNotes();
- }
-}
+++ /dev/null
-/* ====================================================================
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-==================================================================== */
-package org.apache.poi.hslf.extractor;
-
-import java.io.File;
-import java.io.IOException;
-
-import org.apache.poi.POIXMLTextExtractor;
-import org.apache.poi.hslf.HSLFXML;
-import org.apache.poi.hslf.usermodel.HSLFXMLSlideShow;
-import org.apache.poi.hxf.HXFDocument;
-import org.apache.xmlbeans.XmlException;
-import org.openxml4j.exceptions.OpenXML4JException;
-import org.openxml4j.opc.Package;
-import org.openxmlformats.schemas.drawingml.x2006.main.CTRegularTextRun;
-import org.openxmlformats.schemas.drawingml.x2006.main.CTTextBody;
-import org.openxmlformats.schemas.drawingml.x2006.main.CTTextParagraph;
-import org.openxmlformats.schemas.presentationml.x2006.main.CTGroupShape;
-import org.openxmlformats.schemas.presentationml.x2006.main.CTNotesSlide;
-import org.openxmlformats.schemas.presentationml.x2006.main.CTShape;
-import org.openxmlformats.schemas.presentationml.x2006.main.CTSlide;
-import org.openxmlformats.schemas.presentationml.x2006.main.CTSlideIdListEntry;
-
-public class HXFPowerPointExtractor extends POIXMLTextExtractor {
- private HSLFXMLSlideShow slideshow;
- private boolean slidesByDefault = true;
- private boolean notesByDefault = false;
-
- public HXFPowerPointExtractor(Package container) throws XmlException, OpenXML4JException, IOException {
- this(new HSLFXMLSlideShow(
- new HSLFXML(container)
- ));
- }
- public HXFPowerPointExtractor(HSLFXMLSlideShow slideshow) {
- super(slideshow);
- this.slideshow = slideshow;
- }
-
- public static void main(String[] args) throws Exception {
- if(args.length < 1) {
- System.err.println("Use:");
- System.err.println(" HXFPowerPointExtractor <filename.pptx>");
- System.exit(1);
- }
- POIXMLTextExtractor extractor =
- new HXFPowerPointExtractor(HXFDocument.openPackage(
- new File(args[0])
- ));
- System.out.println(extractor.getText());
- }
-
- /**
- * Should a call to getText() return slide text?
- * Default is yes
- */
- public void setSlidesByDefault(boolean slidesByDefault) {
- this.slidesByDefault = slidesByDefault;
- }
- /**
- * Should a call to getText() return notes text?
- * Default is no
- */
- public void setNotesByDefault(boolean notesByDefault) {
- this.notesByDefault = notesByDefault;
- }
-
- /**
- * Gets the slide text, but not the notes text
- */
- public String getText() {
- return getText(slidesByDefault, notesByDefault);
- }
-
- /**
- * Gets the requested text from the file
- * @param slideText Should we retrieve text from slides?
- * @param notesText Should we retrieve text from notes?
- */
- public String getText(boolean slideText, boolean notesText) {
- StringBuffer text = new StringBuffer();
-
- CTSlideIdListEntry[] slideRefs =
- slideshow._getHSLFXML().getSlideReferences().getSldIdArray();
- for (int i = 0; i < slideRefs.length; i++) {
- try {
- CTSlide slide =
- slideshow._getHSLFXML().getSlide(slideRefs[i]);
- CTNotesSlide notes =
- slideshow._getHSLFXML().getNotes(slideRefs[i]);
-
- if(slideText) {
- extractText(slide.getCSld().getSpTree(), text);
- }
- if(notesText && notes != null) {
- extractText(notes.getCSld().getSpTree(), text);
- }
- } catch(Exception e) {
- throw new RuntimeException(e);
- }
- }
-
- return text.toString();
- }
-
- private void extractText(CTGroupShape gs, StringBuffer text) {
- CTShape[] shapes = gs.getSpArray();
- for (int i = 0; i < shapes.length; i++) {
- CTTextBody textBody =
- shapes[i].getTxBody();
- if(textBody != null) {
- CTTextParagraph[] paras =
- textBody.getPArray();
- for (int j = 0; j < paras.length; j++) {
- CTRegularTextRun[] textRuns =
- paras[j].getRArray();
- for (int k = 0; k < textRuns.length; k++) {
- text.append( textRuns[k].getT() );
- }
- // End each paragraph with a new line
- text.append("\n");
- }
- }
- }
- }
-}
+++ /dev/null
-/* ====================================================================
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-==================================================================== */
-package org.apache.poi.hslf.usermodel;
-
-import org.apache.poi.POIXMLDocument;
-import org.apache.poi.hslf.HSLFXML;
-
-/**
- * High level representation of a ooxml slideshow.
- * This is the first object most users will construct whether
- * they are reading or writing a slideshow. It is also the
- * top level object for creating new slides/etc.
- */
-public class HSLFXMLSlideShow extends POIXMLDocument {
- private org.apache.poi.hslf.HSLFXML hslfXML;
-
- public HSLFXMLSlideShow(HSLFXML xml) {
- super(xml);
- this.hslfXML = xml;
- }
-
- public HSLFXML _getHSLFXML() {
- return hslfXML;
- }
-}
+++ /dev/null
-/* ====================================================================
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-==================================================================== */
-package org.apache.poi.hwpf;
-
-import java.io.IOException;
-
-import org.apache.poi.hxf.HXFDocument;
-import org.apache.xmlbeans.XmlException;
-import org.openxml4j.exceptions.InvalidFormatException;
-import org.openxml4j.exceptions.OpenXML4JException;
-import org.openxml4j.opc.Package;
-import org.openxml4j.opc.PackagePart;
-import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTBody;
-import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTDocument1;
-import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTStyles;
-import org.openxmlformats.schemas.wordprocessingml.x2006.main.DocumentDocument;
-import org.openxmlformats.schemas.wordprocessingml.x2006.main.StylesDocument;
-
-/**
- * Experimental class to do low level processing
- * of docx files.
- *
- * If you are using these low level classes, then you
- * will almost certainly need to refer to the OOXML
- * specifications from
- * http://www.ecma-international.org/publications/standards/Ecma-376.htm
- *
- * WARNING - APIs expected to change rapidly
- */
-public class HWPFXML extends HXFDocument {
- public static final String MAIN_CONTENT_TYPE = "application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml";
- public static final String FOOTER_CONTENT_TYPE = "application/vnd.openxmlformats-officedocument.wordprocessingml.footer+xml";
- public static final String HEADER_CONTENT_TYPE = "application/vnd.openxmlformats-officedocument.wordprocessingml.header+xml";
- public static final String STYLES_CONTENT_TYPE = "application/vnd.openxmlformats-officedocument.wordprocessingml.styles+xml";
- public static final String STYLES_RELATION_TYPE = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/styles";
-
- private DocumentDocument wordDoc;
-
- public HWPFXML(Package container) throws OpenXML4JException, IOException, XmlException {
- super(container, MAIN_CONTENT_TYPE);
-
- wordDoc =
- DocumentDocument.Factory.parse(basePart.getInputStream());
- }
-
- /**
- * Returns the low level document base object
- */
- public CTDocument1 getDocument() {
- return wordDoc.getDocument();
- }
-
- /**
- * Returns the low level body of the document
- */
- public CTBody getDocumentBody() {
- return getDocument().getBody();
- }
-
- /**
- * Returns the styles object used
- */
- public CTStyles getStyle() throws XmlException, IOException {
- PackagePart[] parts;
- try {
- parts = getRelatedByType(STYLES_RELATION_TYPE);
- } catch(InvalidFormatException e) {
- throw new IllegalStateException(e);
- }
- if(parts.length != 1) {
- throw new IllegalStateException("Expecting one Styles document part, but found " + parts.length);
- }
-
- StylesDocument sd =
- StylesDocument.Factory.parse(parts[0].getInputStream());
- return sd.getStyles();
- }
-}
+++ /dev/null
-/* ====================================================================
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-==================================================================== */
-package org.apache.poi.hwpf.extractor;
-
-import java.io.File;
-import java.io.IOException;
-
-import org.apache.poi.POIXMLTextExtractor;
-import org.apache.poi.hwpf.HWPFXML;
-import org.apache.poi.hwpf.usermodel.HWPFXMLDocument;
-import org.apache.poi.hxf.HXFDocument;
-import org.apache.xmlbeans.XmlException;
-import org.openxml4j.exceptions.OpenXML4JException;
-import org.openxml4j.opc.Package;
-import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTBody;
-import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTP;
-import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTR;
-import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTText;
-
-/**
- * Helper class to extract text from an OOXML Word file
- */
-public class HXFWordExtractor extends POIXMLTextExtractor {
- private HWPFXMLDocument document;
-
- public HXFWordExtractor(Package container) throws XmlException, OpenXML4JException, IOException {
- this(new HWPFXMLDocument(
- new HWPFXML(container)
- ));
- }
- public HXFWordExtractor(HWPFXMLDocument document) {
- super(document);
- this.document = document;
- }
-
- public static void main(String[] args) throws Exception {
- if(args.length < 1) {
- System.err.println("Use:");
- System.err.println(" HXFWordExtractor <filename.xlsx>");
- System.exit(1);
- }
- POIXMLTextExtractor extractor =
- new HXFWordExtractor(HXFDocument.openPackage(
- new File(args[0])
- ));
- System.out.println(extractor.getText());
- }
-
- public String getText() {
- CTBody body = document._getHWPFXML().getDocumentBody();
- StringBuffer text = new StringBuffer();
-
- // Loop over paragraphs
- CTP[] ps = body.getPArray();
- for (int i = 0; i < ps.length; i++) {
- // Loop over ranges
- CTR[] rs = ps[i].getRArray();
- for (int j = 0; j < rs.length; j++) {
- // Loop over text runs
- CTText[] texts = rs[j].getTArray();
- for (int k = 0; k < texts.length; k++) {
- text.append(
- texts[k].getStringValue()
- );
- }
- }
- // New line after each paragraph.
- text.append("\n");
- }
-
- return text.toString();
- }
-}
+++ /dev/null
-/* ====================================================================
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-==================================================================== */
-package org.apache.poi.hwpf.usermodel;
-
-import org.apache.poi.POIXMLDocument;
-import org.apache.poi.hwpf.HWPFXML;
-
-/**
- * High level representation of a ooxml text document.
- */
-public class HWPFXMLDocument extends POIXMLDocument {
- private HWPFXML hwpfXML;
-
- public HWPFXMLDocument(HWPFXML xml) {
- super(xml);
- this.hwpfXML = xml;
- }
-
- public HWPFXML _getHWPFXML() {
- return hwpfXML;
- }
-}
--- /dev/null
+/* ====================================================================
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+==================================================================== */
+package org.apache.poi.hslf;
+
+import java.io.IOException;
+
+import org.apache.poi.hxf.HXFDocument;
+import org.apache.xmlbeans.XmlException;
+import org.openxml4j.exceptions.InvalidFormatException;
+import org.openxml4j.exceptions.OpenXML4JException;
+import org.openxml4j.opc.Package;
+import org.openxml4j.opc.PackagePart;
+import org.openxml4j.opc.PackageRelationshipCollection;
+import org.openxmlformats.schemas.presentationml.x2006.main.CTNotesSlide;
+import org.openxmlformats.schemas.presentationml.x2006.main.CTPresentation;
+import org.openxmlformats.schemas.presentationml.x2006.main.CTSlide;
+import org.openxmlformats.schemas.presentationml.x2006.main.CTSlideIdList;
+import org.openxmlformats.schemas.presentationml.x2006.main.CTSlideIdListEntry;
+import org.openxmlformats.schemas.presentationml.x2006.main.CTSlideMaster;
+import org.openxmlformats.schemas.presentationml.x2006.main.CTSlideMasterIdList;
+import org.openxmlformats.schemas.presentationml.x2006.main.CTSlideMasterIdListEntry;
+import org.openxmlformats.schemas.presentationml.x2006.main.NotesDocument;
+import org.openxmlformats.schemas.presentationml.x2006.main.PresentationDocument;
+import org.openxmlformats.schemas.presentationml.x2006.main.SldDocument;
+import org.openxmlformats.schemas.presentationml.x2006.main.SldMasterDocument;
+
+/**
+ * Experimental class to do low level processing
+ *  of pptx files.
+ *
+ * If you are using these low level classes, then you
+ *  will almost certainly need to refer to the OOXML
+ *  specifications from
+ *  http://www.ecma-international.org/publications/standards/Ecma-376.htm
+ *
+ * Every package part stream this class opens is closed again
+ *  as soon as its contents have been parsed.
+ *
+ * WARNING - APIs expected to change rapidly
+ */
+public class HSLFXML extends HXFDocument {
+    public static final String MAIN_CONTENT_TYPE = "application/vnd.openxmlformats-officedocument.presentationml.presentation.main+xml";
+    public static final String NOTES_CONTENT_TYPE = "application/vnd.openxmlformats-officedocument.presentationml.notesSlide+xml";
+    public static final String SLIDE_CONTENT_TYPE = "application/vnd.openxmlformats-officedocument.presentationml.slide+xml";
+    public static final String SLIDE_LAYOUT_RELATION_TYPE = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/slideLayout";
+    public static final String NOTES_RELATION_TYPE = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/notesSlide";
+
+    private PresentationDocument presentationDoc;
+
+    /**
+     * Hands the package to the parent class, asking for the part with
+     *  {@link #MAIN_CONTENT_TYPE}, then parses the presentation XML
+     *  read from <code>basePart</code>.
+     *
+     * @param container the package to read the presentation from
+     */
+    public HSLFXML(Package container) throws OpenXML4JException, IOException, XmlException {
+        super(container, MAIN_CONTENT_TYPE);
+
+        // Fully qualified so that no extra import is required
+        java.io.InputStream in = basePart.getInputStream();
+        try {
+            presentationDoc = PresentationDocument.Factory.parse(in);
+        } finally {
+            in.close();
+        }
+    }
+
+    /**
+     * Returns the low level presentation base object
+     */
+    public CTPresentation getPresentation() {
+        return presentationDoc.getPresentation();
+    }
+
+    /**
+     * Returns the references from the presentation to its
+     *  slides.
+     * You'll need these to figure out the slide ordering,
+     *  and to get at the actual slides themselves
+     */
+    public CTSlideIdList getSlideReferences() {
+        return getPresentation().getSldIdLst();
+    }
+
+    /**
+     * Returns the references from the presentation to its
+     *  slide masters.
+     * You'll need these to get at the actual slide
+     *  masters themselves
+     */
+    public CTSlideMasterIdList getSlideMasterReferences() {
+        return getPresentation().getSldMasterIdLst();
+    }
+
+    /**
+     * Returns the low level slide master object from
+     *  the supplied slide master reference
+     *
+     * @param master the reference whose relationship id selects the part to parse
+     * @return the parsed slide master
+     */
+    public CTSlideMaster getSlideMaster(CTSlideMasterIdListEntry master) throws IOException, XmlException {
+        PackagePart masterPart = getRelatedPackagePart(master.getId2());
+
+        java.io.InputStream in = masterPart.getInputStream();
+        try {
+            SldMasterDocument masterDoc = SldMasterDocument.Factory.parse(in);
+            return masterDoc.getSldMaster();
+        } finally {
+            in.close();
+        }
+    }
+
+    /**
+     * Returns the low level slide object from
+     *  the supplied slide reference
+     *
+     * @param slide the reference whose relationship id selects the part to parse
+     * @return the parsed slide
+     */
+    public CTSlide getSlide(CTSlideIdListEntry slide) throws IOException, XmlException {
+        PackagePart slidePart = getRelatedPackagePart(slide.getId2());
+
+        java.io.InputStream in = slidePart.getInputStream();
+        try {
+            SldDocument slideDoc = SldDocument.Factory.parse(in);
+            return slideDoc.getSld();
+        } finally {
+            in.close();
+        }
+    }
+
+    /**
+     * Returns the low level notes object for the given
+     *  slide, as found from the supplied slide reference
+     *
+     * @param slide the reference to the slide whose notes are wanted
+     * @return the parsed notes, or <code>null</code> if the slide has
+     *  no relationship of type {@link #NOTES_RELATION_TYPE}
+     * @throws IllegalStateException if the slide has more than one notes
+     *  relationship, or if looking the relationships up fails with
+     *  an InvalidFormatException
+     */
+    public CTNotesSlide getNotes(CTSlideIdListEntry slide) throws IOException, XmlException {
+        PackagePart slidePart = getRelatedPackagePart(slide.getId2());
+
+        PackageRelationshipCollection notes;
+        try {
+            notes = slidePart.getRelationshipsByType(NOTES_RELATION_TYPE);
+        } catch(InvalidFormatException e) {
+            throw new IllegalStateException(e);
+        }
+
+        if(notes.size() == 0) {
+            // No notes for this slide
+            return null;
+        }
+        if(notes.size() > 1) {
+            throw new IllegalStateException("Expecting 0 or 1 notes for a slide, but found " + notes.size());
+        }
+
+        PackagePart notesPart = getPackagePart(notes.getRelationship(0));
+
+        java.io.InputStream in = notesPart.getInputStream();
+        try {
+            NotesDocument notesDoc = NotesDocument.Factory.parse(in);
+            return notesDoc.getNotes();
+        } finally {
+            in.close();
+        }
+    }
+}
--- /dev/null
+/* ====================================================================
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+==================================================================== */
+package org.apache.poi.hslf.extractor;
+
+import java.io.File;
+import java.io.IOException;
+
+import org.apache.poi.POIXMLTextExtractor;
+import org.apache.poi.hslf.HSLFXML;
+import org.apache.poi.hslf.usermodel.HSLFXMLSlideShow;
+import org.apache.poi.hxf.HXFDocument;
+import org.apache.xmlbeans.XmlException;
+import org.openxml4j.exceptions.OpenXML4JException;
+import org.openxml4j.opc.Package;
+import org.openxmlformats.schemas.drawingml.x2006.main.CTRegularTextRun;
+import org.openxmlformats.schemas.drawingml.x2006.main.CTTextBody;
+import org.openxmlformats.schemas.drawingml.x2006.main.CTTextParagraph;
+import org.openxmlformats.schemas.presentationml.x2006.main.CTGroupShape;
+import org.openxmlformats.schemas.presentationml.x2006.main.CTNotesSlide;
+import org.openxmlformats.schemas.presentationml.x2006.main.CTShape;
+import org.openxmlformats.schemas.presentationml.x2006.main.CTSlide;
+import org.openxmlformats.schemas.presentationml.x2006.main.CTSlideIdListEntry;
+
+/**
+ * Helper class to extract text from an OOXML PowerPoint file.
+ * Slide text is returned by default and notes text is not; both
+ *  defaults can be changed, or chosen explicitly for a single call.
+ */
+public class HXFPowerPointExtractor extends POIXMLTextExtractor {
+    private HSLFXMLSlideShow slideshow;
+    private boolean slidesByDefault = true;
+    private boolean notesByDefault = false;
+
+    /**
+     * Builds the low level and high level slideshow objects for
+     *  the supplied package, and extracts from those.
+     */
+    public HXFPowerPointExtractor(Package container) throws XmlException, OpenXML4JException, IOException {
+        this(new HSLFXMLSlideShow(
+                new HSLFXML(container)
+        ));
+    }
+
+    /**
+     * Extracts from an already opened slideshow
+     */
+    public HXFPowerPointExtractor(HSLFXMLSlideShow slideshow) {
+        super(slideshow);
+        this.slideshow = slideshow;
+    }
+
+    /**
+     * Command line entry point: prints the text of the file named by
+     *  the first argument, or prints usage and exits with status 1
+     *  when no argument is given.
+     */
+    public static void main(String[] args) throws Exception {
+        if(args.length < 1) {
+            System.err.println("Use:");
+            System.err.println("  HXFPowerPointExtractor <filename.pptx>");
+            System.exit(1);
+        }
+        POIXMLTextExtractor extractor =
+            new HXFPowerPointExtractor(HXFDocument.openPackage(
+                    new File(args[0])
+            ));
+        System.out.println(extractor.getText());
+    }
+
+    /**
+     * Should a call to getText() return slide text?
+     * Default is yes
+     */
+    public void setSlidesByDefault(boolean slidesByDefault) {
+        this.slidesByDefault = slidesByDefault;
+    }
+
+    /**
+     * Should a call to getText() return notes text?
+     * Default is no
+     */
+    public void setNotesByDefault(boolean notesByDefault) {
+        this.notesByDefault = notesByDefault;
+    }
+
+    /**
+     * Gets the text selected by the current defaults. Unless they
+     *  have been changed through {@link #setSlidesByDefault(boolean)}
+     *  or {@link #setNotesByDefault(boolean)}, that is the slide
+     *  text but not the notes text.
+     */
+    public String getText() {
+        return getText(slidesByDefault, notesByDefault);
+    }
+
+    /**
+     * Gets the requested text from the file. Slides are visited in
+     *  the order of the presentation's slide references; for each
+     *  slide, its own text comes before its notes text.
+     * Only the parts that were asked for are loaded and parsed.
+     *
+     * @param slideText Should we retrieve text from slides?
+     * @param notesText Should we retrieve text from notes?
+     * @throws RuntimeException wrapping any exception raised while
+     *  loading or reading a slide or its notes
+     */
+    public String getText(boolean slideText, boolean notesText) {
+        StringBuffer text = new StringBuffer();
+
+        CTSlideIdListEntry[] slideRefs =
+            slideshow._getHSLFXML().getSlideReferences().getSldIdArray();
+        for (int i = 0; i < slideRefs.length; i++) {
+            try {
+                if(slideText) {
+                    CTSlide slide =
+                        slideshow._getHSLFXML().getSlide(slideRefs[i]);
+                    extractText(slide.getCSld().getSpTree(), text);
+                }
+                if(notesText) {
+                    CTNotesSlide notes =
+                        slideshow._getHSLFXML().getNotes(slideRefs[i]);
+                    // A slide without notes gives us null here
+                    if(notes != null) {
+                        extractText(notes.getCSld().getSpTree(), text);
+                    }
+                }
+            } catch(Exception e) {
+                throw new RuntimeException(e);
+            }
+        }
+
+        return text.toString();
+    }
+
+    /**
+     * Appends the text of every run of every paragraph of every
+     *  shape directly inside the group to the buffer, ending each
+     *  paragraph with a new line. Shapes with no text body are skipped.
+     */
+    private void extractText(CTGroupShape gs, StringBuffer text) {
+        CTShape[] shapes = gs.getSpArray();
+        for (int i = 0; i < shapes.length; i++) {
+            CTTextBody textBody = shapes[i].getTxBody();
+            if(textBody == null) {
+                continue;
+            }
+
+            CTTextParagraph[] paras = textBody.getPArray();
+            for (int j = 0; j < paras.length; j++) {
+                CTRegularTextRun[] textRuns = paras[j].getRArray();
+                for (int k = 0; k < textRuns.length; k++) {
+                    text.append( textRuns[k].getT() );
+                }
+                // End each paragraph with a new line
+                text.append("\n");
+            }
+        }
+    }
+}
--- /dev/null
+/* ====================================================================
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+==================================================================== */
+package org.apache.poi.hslf.usermodel;
+
+import org.apache.poi.POIXMLDocument;
+import org.apache.poi.hslf.HSLFXML;
+
+/**
+ * High level representation of a ooxml slideshow.
+ * This is the first object most users will construct whether
+ * they are reading or writing a slideshow. It is also the
+ * top level object for creating new slides/etc.
+ */
+public class HSLFXMLSlideShow extends POIXMLDocument {
+    /** The low level document this slideshow was created from */
+    private HSLFXML hslfXML;
+
+    /**
+     * Creates a slideshow on top of the given low level document.
+     *
+     * @param xml the low level document, also handed to the parent class
+     */
+    public HSLFXMLSlideShow(HSLFXML xml) {
+        super(xml);
+        this.hslfXML = xml;
+    }
+
+    /**
+     * Returns the low level document that was passed to the constructor.
+     */
+    public HSLFXML _getHSLFXML() {
+        return hslfXML;
+    }
+}
--- /dev/null
+/* ====================================================================
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+==================================================================== */
+package org.apache.poi.hwpf;
+
+import java.io.IOException;
+
+import org.apache.poi.hxf.HXFDocument;
+import org.apache.xmlbeans.XmlException;
+import org.openxml4j.exceptions.InvalidFormatException;
+import org.openxml4j.exceptions.OpenXML4JException;
+import org.openxml4j.opc.Package;
+import org.openxml4j.opc.PackagePart;
+import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTBody;
+import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTDocument1;
+import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTStyles;
+import org.openxmlformats.schemas.wordprocessingml.x2006.main.DocumentDocument;
+import org.openxmlformats.schemas.wordprocessingml.x2006.main.StylesDocument;
+
+/**
+ * Experimental class to do low level processing
+ * of docx files.
+ *
+ * If you are using these low level classes, then you
+ * will almost certainly need to refer to the OOXML
+ * specifications from
+ * http://www.ecma-international.org/publications/standards/Ecma-376.htm
+ *
+ * WARNING - APIs expected to change rapidly
+ */
+public class HWPFXML extends HXFDocument {
+    /** Content type of the main document part; passed to the parent constructor */
+    public static final String MAIN_CONTENT_TYPE = "application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml";
+    /** Content type of a footer part */
+    public static final String FOOTER_CONTENT_TYPE = "application/vnd.openxmlformats-officedocument.wordprocessingml.footer+xml";
+    /** Content type of a header part */
+    public static final String HEADER_CONTENT_TYPE = "application/vnd.openxmlformats-officedocument.wordprocessingml.header+xml";
+    /** Content type of the styles part */
+    public static final String STYLES_CONTENT_TYPE = "application/vnd.openxmlformats-officedocument.wordprocessingml.styles+xml";
+    /** Relationship type used by {@link #getStyle()} to locate the styles part */
+    public static final String STYLES_RELATION_TYPE = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/styles";
+
+    /** The parsed contents of the base part */
+    private DocumentDocument wordDoc;
+
+    /**
+     * Opens the given package and parses its base part as a
+     * word processing document.
+     *
+     * @param container the package holding the document
+     * @throws OpenXML4JException declared for the parent constructor
+     * @throws IOException if the base part can't be read or closed
+     * @throws XmlException if the base part isn't a valid document
+     */
+    public HWPFXML(Package container) throws OpenXML4JException, IOException, XmlException {
+        super(container, MAIN_CONTENT_TYPE);
+
+        // The parser is only handed the stream, so release it ourselves
+        java.io.InputStream in = basePart.getInputStream();
+        try {
+            wordDoc = DocumentDocument.Factory.parse(in);
+        } finally {
+            in.close();
+        }
+    }
+
+    /**
+     * Returns the low level document base object
+     */
+    public CTDocument1 getDocument() {
+        return wordDoc.getDocument();
+    }
+
+    /**
+     * Returns the low level body of the document
+     */
+    public CTBody getDocumentBody() {
+        return getDocument().getBody();
+    }
+
+    /**
+     * Returns the styles object used
+     *
+     * @throws XmlException if the styles part can't be parsed
+     * @throws IOException if the styles part can't be read or closed
+     * @throws IllegalStateException if the related parts can't be looked
+     *  up, or if there isn't exactly one styles part
+     */
+    public CTStyles getStyle() throws XmlException, IOException {
+        PackagePart[] parts;
+        try {
+            parts = getRelatedByType(STYLES_RELATION_TYPE);
+        } catch(InvalidFormatException e) {
+            throw new IllegalStateException(e);
+        }
+        if(parts.length != 1) {
+            throw new IllegalStateException("Expecting one Styles document part, but found " + parts.length);
+        }
+
+        // Close the part's stream once the styles have been parsed
+        java.io.InputStream in = parts[0].getInputStream();
+        try {
+            StylesDocument sd = StylesDocument.Factory.parse(in);
+            return sd.getStyles();
+        } finally {
+            in.close();
+        }
+    }
+}
--- /dev/null
+/* ====================================================================
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+==================================================================== */
+package org.apache.poi.hwpf.extractor;
+
+import java.io.File;
+import java.io.IOException;
+
+import org.apache.poi.POIXMLTextExtractor;
+import org.apache.poi.hwpf.HWPFXML;
+import org.apache.poi.hwpf.usermodel.HWPFXMLDocument;
+import org.apache.poi.hxf.HXFDocument;
+import org.apache.xmlbeans.XmlException;
+import org.openxml4j.exceptions.OpenXML4JException;
+import org.openxml4j.opc.Package;
+import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTBody;
+import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTP;
+import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTR;
+import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTText;
+
+/**
+ * Helper class to extract text from an OOXML Word file
+ */
+public class HXFWordExtractor extends POIXMLTextExtractor {
+    /** The document the text is extracted from */
+    private HWPFXMLDocument document;
+
+    /**
+     * Creates an extractor for the word document held in the given package.
+     *
+     * @param container the package holding the document
+     */
+    public HXFWordExtractor(Package container) throws XmlException, OpenXML4JException, IOException {
+        this(new HWPFXMLDocument(
+                new HWPFXML(container)
+        ));
+    }
+
+    /**
+     * Creates an extractor for an already opened document.
+     *
+     * @param document the document to extract the text from
+     */
+    public HXFWordExtractor(HWPFXMLDocument document) {
+        super(document);
+        this.document = document;
+    }
+
+    /**
+     * Command line entry point: prints the text of the file named by
+     * the first argument to standard out. With no arguments, prints
+     * a usage message and exits with status 1.
+     */
+    public static void main(String[] args) throws Exception {
+        if(args.length < 1) {
+            System.err.println("Use:");
+            // This extractor reads Word files, so advertise .docx
+            System.err.println(" HXFWordExtractor <filename.docx>");
+            System.exit(1);
+        }
+        POIXMLTextExtractor extractor =
+            new HXFWordExtractor(HXFDocument.openPackage(
+                    new File(args[0])
+            ));
+        System.out.println(extractor.getText());
+    }
+
+    /**
+     * Returns the text of the paragraphs found directly in the document
+     * body, with a new line after each paragraph.
+     */
+    public String getText() {
+        CTBody body = document._getHWPFXML().getDocumentBody();
+        StringBuilder text = new StringBuilder();
+
+        // Loop over paragraphs
+        for (CTP paragraph : body.getPArray()) {
+            // Loop over runs
+            for (CTR run : paragraph.getRArray()) {
+                // Loop over the text elements of the run
+                for (CTText t : run.getTArray()) {
+                    text.append(t.getStringValue());
+                }
+            }
+            // New line after each paragraph.
+            text.append("\n");
+        }
+
+        return text.toString();
+    }
+}
--- /dev/null
+/* ====================================================================
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+==================================================================== */
+package org.apache.poi.hwpf.usermodel;
+
+import org.apache.poi.POIXMLDocument;
+import org.apache.poi.hwpf.HWPFXML;
+
+/**
+ * High level representation of an OOXML text document.
+ */
+public class HWPFXMLDocument extends POIXMLDocument {
+    /** Low level document backing this one; set once on construction */
+    private final HWPFXML hwpfXML;
+
+    /**
+     * Builds the document around the supplied low level document,
+     * which is also given to the parent class.
+     *
+     * @param xml the low level document to wrap
+     */
+    public HWPFXMLDocument(HWPFXML xml) {
+        super(xml);
+        hwpfXML = xml;
+    }
+
+    /**
+     * Gives access to the low level document supplied at construction.
+     */
+    public HWPFXML _getHWPFXML() {
+        return this.hwpfXML;
+    }
+}
+++ /dev/null
-/* ====================================================================
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-==================================================================== */
-package org.apache.poi.hslf;
-
-import java.io.File;
-
-import org.apache.poi.hxf.HXFDocument;
-import org.openxml4j.opc.Package;
-import org.openxml4j.opc.PackagePart;
-import org.openxmlformats.schemas.presentationml.x2006.main.CTSlideIdListEntry;
-import org.openxmlformats.schemas.presentationml.x2006.main.CTSlideMasterIdListEntry;
-
-import junit.framework.TestCase;
-
-public class TestHSLFXML extends TestCase {
- private File sampleFile;
-
- protected void setUp() throws Exception {
- super.setUp();
-
- sampleFile = new File(
- System.getProperty("HSLF.testdata.path") +
- File.separator + "sample.pptx"
- );
- }
-
- public void testContainsMainContentType() throws Exception {
- Package pack = HXFDocument.openPackage(sampleFile);
-
- boolean found = false;
- for(PackagePart part : pack.getParts()) {
- if(part.getContentType().equals(HSLFXML.MAIN_CONTENT_TYPE)) {
- found = true;
- }
- System.out.println(part);
- }
- assertTrue(found);
- }
-
- public void testOpen() throws Exception {
- HXFDocument.openPackage(sampleFile);
-
- HSLFXML xml;
-
- // With the finalised uri, should be fine
- xml = new HSLFXML(
- HXFDocument.openPackage(sampleFile)
- );
-
- // Check the core
- assertNotNull(xml.getPresentation());
-
- // Check it has some slides
- assertTrue(
- xml.getSlideReferences().sizeOfSldIdArray() > 0
- );
- assertTrue(
- xml.getSlideMasterReferences().sizeOfSldMasterIdArray() > 0
- );
- }
-
- public void testSlideBasics() throws Exception {
- HSLFXML xml = new HSLFXML(
- HXFDocument.openPackage(sampleFile)
- );
-
- // Should have 1 master
- assertEquals(1, xml.getSlideMasterReferences().sizeOfSldMasterIdArray());
- assertEquals(1, xml.getSlideMasterReferences().getSldMasterIdArray().length);
-
- // Should have three sheets
- assertEquals(2, xml.getSlideReferences().sizeOfSldIdArray());
- assertEquals(2, xml.getSlideReferences().getSldIdArray().length);
-
- // Check they're as expected
- CTSlideIdListEntry[] slides = xml.getSlideReferences().getSldIdArray();
- assertEquals(256, slides[0].getId());
- assertEquals(257, slides[1].getId());
- assertEquals("rId2", slides[0].getId2());
- assertEquals("rId3", slides[1].getId2());
-
- // Now get those objects
- assertNotNull(xml.getSlide(slides[0]));
- assertNotNull(xml.getSlide(slides[1]));
-
- // And check they have notes as expected
- assertNotNull(xml.getNotes(slides[0]));
- assertNotNull(xml.getNotes(slides[1]));
-
- // And again for the master
- CTSlideMasterIdListEntry[] masters =
- xml.getSlideMasterReferences().getSldMasterIdArray();
- assertEquals(2147483648l, masters[0].getId());
- assertEquals("rId1", masters[0].getId2());
- assertNotNull(xml.getSlideMaster(masters[0]));
- }
-
- public void testMetadataBasics() throws Exception {
- HSLFXML xml = new HSLFXML(
- HXFDocument.openPackage(sampleFile)
- );
-
- assertNotNull(xml.getCoreProperties());
- assertNotNull(xml.getExtendedProperties());
-
- assertEquals("Microsoft Office PowerPoint", xml.getExtendedProperties().getApplication());
- assertEquals(0, xml.getExtendedProperties().getCharacters());
- assertEquals(0, xml.getExtendedProperties().getLines());
-
- assertEquals(null, xml.getCoreProperties().getTitleProperty().getValue());
- assertEquals(null, xml.getCoreProperties().getSubjectProperty().getValue());
- }
-}
+++ /dev/null
-/* ====================================================================
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-==================================================================== */
-package org.apache.poi.hslf.extractor;
-
-import java.io.File;
-
-import org.apache.poi.hslf.HSLFXML;
-import org.apache.poi.hslf.usermodel.HSLFXMLSlideShow;
-import org.apache.poi.hxf.HXFDocument;
-
-import junit.framework.TestCase;
-
-/**
- * Tests for HXFPowerPointExtractor
- */
-public class TestHXFPowerPointExtractor extends TestCase {
- /**
- * A simple file
- */
- private HSLFXML xmlA;
-
- protected void setUp() throws Exception {
- super.setUp();
-
- File fileA = new File(
- System.getProperty("HSLF.testdata.path") +
- File.separator + "sample.pptx"
- );
-
- xmlA = new HSLFXML(HXFDocument.openPackage(fileA));
- }
-
- /**
- * Get text out of the simple file
- */
- public void testGetSimpleText() throws Exception {
- new HXFPowerPointExtractor(xmlA.getPackage());
- new HXFPowerPointExtractor(new HSLFXMLSlideShow(xmlA));
-
- HXFPowerPointExtractor extractor =
- new HXFPowerPointExtractor(xmlA.getPackage());
- extractor.getText();
-
- String text = extractor.getText();
- assertTrue(text.length() > 0);
-
- // Check Basics
- assertTrue(text.startsWith("Lorem ipsum dolor sit amet\n"));
- assertTrue(text.endsWith("amet\n\n"));
-
- // Just slides, no notes
- text = extractor.getText(true, false);
- assertEquals(
- "Lorem ipsum dolor sit amet\n" +
- "Nunc at risus vel erat tempus posuere. Aenean non ante.\n" +
- "\n" +
- "Lorem ipsum dolor sit amet\n" +
- "Lorem\n" +
- "ipsum\n" +
- "dolor\n" +
- "sit\n" +
- "amet\n" +
- "\n", text
- );
-
- // Just notes, no slides
- text = extractor.getText(false, true);
- assertEquals(
- "\n\n\n\n", text
- );
-
- // Both
- text = extractor.getText(true, true);
- assertEquals(
- "Lorem ipsum dolor sit amet\n" +
- "Nunc at risus vel erat tempus posuere. Aenean non ante.\n" +
- "\n\n\n" +
- "Lorem ipsum dolor sit amet\n" +
- "Lorem\n" +
- "ipsum\n" +
- "dolor\n" +
- "sit\n" +
- "amet\n" +
- "\n\n\n", text
- );
-
- // Via set defaults
- extractor.setSlidesByDefault(false);
- extractor.setNotesByDefault(true);
- text = extractor.getText();
- assertEquals(
- "\n\n\n\n", text
- );
- }
-}
+++ /dev/null
-/* ====================================================================
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-==================================================================== */
-package org.apache.poi.hwpf;
-
-import java.io.File;
-
-import org.apache.poi.hxf.HXFDocument;
-import org.openxml4j.opc.Package;
-import org.openxml4j.opc.PackagePart;
-
-import junit.framework.TestCase;
-
-public class TestHWPFXML extends TestCase {
- private File sampleFile;
- private File complexFile;
-
- protected void setUp() throws Exception {
- super.setUp();
-
- sampleFile = new File(
- System.getProperty("HWPF.testdata.path") +
- File.separator + "sample.docx"
- );
- complexFile = new File(
- System.getProperty("HWPF.testdata.path") +
- File.separator + "IllustrativeCases.docx"
- );
- }
-
- public void testContainsMainContentType() throws Exception {
- Package pack = HXFDocument.openPackage(sampleFile);
-
- boolean found = false;
- for(PackagePart part : pack.getParts()) {
- if(part.getContentType().equals(HWPFXML.MAIN_CONTENT_TYPE)) {
- found = true;
- }
- System.out.println(part);
- }
- assertTrue(found);
- }
-
- public void testOpen() throws Exception {
- HXFDocument.openPackage(sampleFile);
- HXFDocument.openPackage(complexFile);
-
- HWPFXML xml;
-
- // Simple file
- xml = new HWPFXML(
- HXFDocument.openPackage(sampleFile)
- );
- // Check it has key parts
- assertNotNull(xml.getDocument());
- assertNotNull(xml.getDocumentBody());
- assertNotNull(xml.getStyle());
-
- // Complex file
- xml = new HWPFXML(
- HXFDocument.openPackage(complexFile)
- );
- assertNotNull(xml.getDocument());
- assertNotNull(xml.getDocumentBody());
- assertNotNull(xml.getStyle());
- }
-
- public void testMetadataBasics() throws Exception {
- HWPFXML xml = new HWPFXML(
- HXFDocument.openPackage(sampleFile)
- );
- assertNotNull(xml.getCoreProperties());
- assertNotNull(xml.getExtendedProperties());
-
- assertEquals("Microsoft Office Word", xml.getExtendedProperties().getApplication());
- assertEquals(1315, xml.getExtendedProperties().getCharacters());
- assertEquals(10, xml.getExtendedProperties().getLines());
-
- assertEquals(null, xml.getCoreProperties().getTitleProperty().getValue());
- assertEquals(null, xml.getCoreProperties().getSubjectProperty().getValue());
- }
-
- public void testMetadataComplex() throws Exception {
- HWPFXML xml = new HWPFXML(
- HXFDocument.openPackage(complexFile)
- );
- assertNotNull(xml.getCoreProperties());
- assertNotNull(xml.getExtendedProperties());
-
- assertEquals("Microsoft Office Outlook", xml.getExtendedProperties().getApplication());
- assertEquals(5184, xml.getExtendedProperties().getCharacters());
- assertEquals(0, xml.getExtendedProperties().getLines());
-
- assertEquals(" ", xml.getCoreProperties().getTitleProperty().getValue());
- assertEquals(" ", xml.getCoreProperties().getSubjectProperty().getValue());
- }
-}
+++ /dev/null
-/* ====================================================================
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-==================================================================== */
-package org.apache.poi.hwpf.extractor;
-
-import java.io.File;
-
-import org.apache.poi.hwpf.HWPFXML;
-import org.apache.poi.hwpf.usermodel.HWPFXMLDocument;
-import org.apache.poi.hxf.HXFDocument;
-
-import junit.framework.TestCase;
-
-/**
- * Tests for HXFWordExtractor
- */
-public class TestHXFWordExtractor extends TestCase {
- /**
- * A very simple file
- */
- private HWPFXML xmlA;
- /**
- * A fairly complex file
- */
- private HWPFXML xmlB;
-
- protected void setUp() throws Exception {
- super.setUp();
-
- File fileA = new File(
- System.getProperty("HWPF.testdata.path") +
- File.separator + "sample.docx"
- );
- File fileB = new File(
- System.getProperty("HWPF.testdata.path") +
- File.separator + "IllustrativeCases.docx"
- );
-
- xmlA = new HWPFXML(HXFDocument.openPackage(fileA));
- xmlB = new HWPFXML(HXFDocument.openPackage(fileB));
- }
-
- /**
- * Get text out of the simple file
- */
- public void testGetSimpleText() throws Exception {
- new HXFWordExtractor(xmlA.getPackage());
- new HXFWordExtractor(new HWPFXMLDocument(xmlA));
-
- HXFWordExtractor extractor =
- new HXFWordExtractor(xmlA.getPackage());
- extractor.getText();
-
- String text = extractor.getText();
- assertTrue(text.length() > 0);
-
- // Check contents
- assertTrue(text.startsWith(
- "Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Nunc at risus vel erat tempus posuere. Aenean non ante. Suspendisse vehicula dolor sit amet odio."
- ));
- assertTrue(text.endsWith(
- "Phasellus ultricies mi nec leo. Sed tempus. In sit amet lorem at velit faucibus vestibulum.\n"
- ));
-
- // Check number of paragraphs
- int ps = 0;
- char[] t = text.toCharArray();
- for (int i = 0; i < t.length; i++) {
- if(t[i] == '\n') { ps++; }
- }
- assertEquals(3, ps);
- }
-
- /**
- * Tests getting the text out of a complex file
- */
- public void testGetComplexText() throws Exception {
- HXFWordExtractor extractor =
- new HXFWordExtractor(xmlB.getPackage());
- extractor.getText();
-
- String text = extractor.getText();
- assertTrue(text.length() > 0);
-
- char euro = '\u20ac';
- System.err.println("'"+text.substring(text.length() - 20) + "'");
-
- // Check contents
- assertTrue(text.startsWith(
- " \n(V) ILLUSTRATIVE CASES\n\n"
- ));
- assertTrue(text.endsWith(
- "As well as gaining "+euro+"90 from child benefit increases, he will also receive the early childhood supplement of "+euro+"250 per quarter for Vincent for the full four quarters of the year.\n\n\n\n \n\n\n"
- ));
-
- // Check number of paragraphs
- int ps = 0;
- char[] t = text.toCharArray();
- for (int i = 0; i < t.length; i++) {
- if(t[i] == '\n') { ps++; }
- }
- assertEquals(79, ps);
- }
-}
--- /dev/null
+/* ====================================================================
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+==================================================================== */
+package org.apache.poi.hslf;
+
+import java.io.File;
+
+import org.apache.poi.hxf.HXFDocument;
+import org.openxml4j.opc.Package;
+import org.openxml4j.opc.PackagePart;
+import org.openxmlformats.schemas.presentationml.x2006.main.CTSlideIdListEntry;
+import org.openxmlformats.schemas.presentationml.x2006.main.CTSlideMasterIdListEntry;
+
+import junit.framework.TestCase;
+
+/**
+ * Tests for the low level HSLFXML class, run against sample.pptx
+ * from the directory named by the HSLF.testdata.path system property.
+ */
+public class TestHSLFXML extends TestCase {
+    /** The sample presentation all the tests open */
+    private File sampleFile;
+
+    protected void setUp() throws Exception {
+        super.setUp();
+
+        sampleFile = new File(
+                System.getProperty("HSLF.testdata.path") +
+                File.separator + "sample.pptx"
+        );
+    }
+
+    /**
+     * The package should hold a part with the main content type
+     */
+    public void testContainsMainContentType() throws Exception {
+        Package pack = HXFDocument.openPackage(sampleFile);
+
+        boolean found = false;
+        for(PackagePart part : pack.getParts()) {
+            if(part.getContentType().equals(HSLFXML.MAIN_CONTENT_TYPE)) {
+                found = true;
+            }
+            System.out.println(part);
+        }
+        assertTrue(found);
+    }
+
+    /**
+     * The sample file should open, and have slides and masters
+     */
+    public void testOpen() throws Exception {
+        HXFDocument.openPackage(sampleFile);
+
+        HSLFXML xml;
+
+        // With the finalised uri, should be fine
+        xml = new HSLFXML(
+                HXFDocument.openPackage(sampleFile)
+        );
+
+        // Check the core
+        assertNotNull(xml.getPresentation());
+
+        // Check it has some slides
+        assertTrue(
+                xml.getSlideReferences().sizeOfSldIdArray() > 0
+        );
+        assertTrue(
+                xml.getSlideMasterReferences().sizeOfSldMasterIdArray() > 0
+        );
+    }
+
+    /**
+     * Checks the slide, notes and master references of the sample file
+     */
+    public void testSlideBasics() throws Exception {
+        HSLFXML xml = new HSLFXML(
+                HXFDocument.openPackage(sampleFile)
+        );
+
+        // Should have 1 master
+        assertEquals(1, xml.getSlideMasterReferences().sizeOfSldMasterIdArray());
+        assertEquals(1, xml.getSlideMasterReferences().getSldMasterIdArray().length);
+
+        // Should have two slides
+        assertEquals(2, xml.getSlideReferences().sizeOfSldIdArray());
+        assertEquals(2, xml.getSlideReferences().getSldIdArray().length);
+
+        // Check they're as expected
+        CTSlideIdListEntry[] slides = xml.getSlideReferences().getSldIdArray();
+        assertEquals(256, slides[0].getId());
+        assertEquals(257, slides[1].getId());
+        assertEquals("rId2", slides[0].getId2());
+        assertEquals("rId3", slides[1].getId2());
+
+        // Now get those objects
+        assertNotNull(xml.getSlide(slides[0]));
+        assertNotNull(xml.getSlide(slides[1]));
+
+        // And check they have notes as expected
+        assertNotNull(xml.getNotes(slides[0]));
+        assertNotNull(xml.getNotes(slides[1]));
+
+        // And again for the master
+        CTSlideMasterIdListEntry[] masters =
+            xml.getSlideMasterReferences().getSldMasterIdArray();
+        // The id doesn't fit in an int, hence the long literal
+        assertEquals(2147483648L, masters[0].getId());
+        assertEquals("rId1", masters[0].getId2());
+        assertNotNull(xml.getSlideMaster(masters[0]));
+    }
+
+    /**
+     * Checks the core and extended properties of the sample file
+     */
+    public void testMetadataBasics() throws Exception {
+        HSLFXML xml = new HSLFXML(
+                HXFDocument.openPackage(sampleFile)
+        );
+
+        assertNotNull(xml.getCoreProperties());
+        assertNotNull(xml.getExtendedProperties());
+
+        assertEquals("Microsoft Office PowerPoint", xml.getExtendedProperties().getApplication());
+        assertEquals(0, xml.getExtendedProperties().getCharacters());
+        assertEquals(0, xml.getExtendedProperties().getLines());
+
+        // Neither a title nor a subject is set on the sample file
+        assertNull(xml.getCoreProperties().getTitleProperty().getValue());
+        assertNull(xml.getCoreProperties().getSubjectProperty().getValue());
+    }
+}
--- /dev/null
+/* ====================================================================
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+==================================================================== */
+package org.apache.poi.hslf.extractor;
+
+import java.io.File;
+
+import org.apache.poi.hslf.HSLFXML;
+import org.apache.poi.hslf.usermodel.HSLFXMLSlideShow;
+import org.apache.poi.hxf.HXFDocument;
+
+import junit.framework.TestCase;
+
+/**
+ * Tests for HXFPowerPointExtractor
+ */
+public class TestHXFPowerPointExtractor extends TestCase {
+    /**
+     * Low level document for the simple sample file
+     */
+    private HSLFXML xmlA;
+
+    protected void setUp() throws Exception {
+        super.setUp();
+
+        String dataDir = System.getProperty("HSLF.testdata.path");
+        File fileA = new File(dataDir + File.separator + "sample.pptx");
+
+        xmlA = new HSLFXML(HXFDocument.openPackage(fileA));
+    }
+
+    /**
+     * Extracts the text of the simple file with each combination
+     * of the slides and notes options
+     */
+    public void testGetSimpleText() throws Exception {
+        // Exercise both constructors
+        new HXFPowerPointExtractor(xmlA.getPackage());
+        new HXFPowerPointExtractor(new HSLFXMLSlideShow(xmlA));
+
+        HXFPowerPointExtractor extractor =
+            new HXFPowerPointExtractor(xmlA.getPackage());
+        extractor.getText();
+
+        String text = extractor.getText();
+        assertTrue(text.length() > 0);
+
+        // Check Basics
+        assertTrue(text.startsWith("Lorem ipsum dolor sit amet\n"));
+        assertTrue(text.endsWith("amet\n\n"));
+
+        // The text expected from each of the two slides
+        String firstSlide =
+            "Lorem ipsum dolor sit amet\n" +
+            "Nunc at risus vel erat tempus posuere. Aenean non ante.\n";
+        String secondSlide =
+            "Lorem ipsum dolor sit amet\n" +
+            "Lorem\n" +
+            "ipsum\n" +
+            "dolor\n" +
+            "sit\n" +
+            "amet\n";
+        // What is expected when only the notes are requested
+        String notesOnly = "\n\n\n\n";
+
+        // Just slides, no notes
+        text = extractor.getText(true, false);
+        assertEquals(firstSlide + "\n" + secondSlide + "\n", text);
+
+        // Just notes, no slides
+        text = extractor.getText(false, true);
+        assertEquals(notesOnly, text);
+
+        // Both
+        text = extractor.getText(true, true);
+        assertEquals(firstSlide + "\n\n\n" + secondSlide + "\n\n\n", text);
+
+        // Via set defaults
+        extractor.setSlidesByDefault(false);
+        extractor.setNotesByDefault(true);
+        text = extractor.getText();
+        assertEquals(notesOnly, text);
+    }
+}
--- /dev/null
+/* ====================================================================
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+==================================================================== */
+package org.apache.poi.hwpf;
+
+import java.io.File;
+
+import org.apache.poi.hxf.HXFDocument;
+import org.openxml4j.opc.Package;
+import org.openxml4j.opc.PackagePart;
+
+import junit.framework.TestCase;
+
+/**
+ * Tests for the low level HWPFXML class, run against sample.docx and
+ * IllustrativeCases.docx from the directory named by the
+ * HWPF.testdata.path system property.
+ */
+public class TestHWPFXML extends TestCase {
+    /** A simple document */
+    private File sampleFile;
+    /** A more complex document */
+    private File complexFile;
+
+    protected void setUp() throws Exception {
+        super.setUp();
+
+        String dataDir = System.getProperty("HWPF.testdata.path");
+        sampleFile = new File(dataDir + File.separator + "sample.docx");
+        complexFile = new File(dataDir + File.separator + "IllustrativeCases.docx");
+    }
+
+    /**
+     * Opens the package in the given file as a low level document
+     */
+    private HWPFXML open(File file) throws Exception {
+        return new HWPFXML(HXFDocument.openPackage(file));
+    }
+
+    /**
+     * The package should hold a part with the main content type
+     */
+    public void testContainsMainContentType() throws Exception {
+        Package pack = HXFDocument.openPackage(sampleFile);
+
+        boolean found = false;
+        for(PackagePart part : pack.getParts()) {
+            found |= part.getContentType().equals(HWPFXML.MAIN_CONTENT_TYPE);
+            System.out.println(part);
+        }
+        assertTrue(found);
+    }
+
+    /**
+     * Both files should open, and have their key parts
+     */
+    public void testOpen() throws Exception {
+        HXFDocument.openPackage(sampleFile);
+        HXFDocument.openPackage(complexFile);
+
+        // Simple file first, then the complex one
+        File[] files = new File[] { sampleFile, complexFile };
+        for (File file : files) {
+            HWPFXML xml = open(file);
+
+            // Check it has key parts
+            assertNotNull(xml.getDocument());
+            assertNotNull(xml.getDocumentBody());
+            assertNotNull(xml.getStyle());
+        }
+    }
+
+    /**
+     * Checks the core and extended properties of the simple file
+     */
+    public void testMetadataBasics() throws Exception {
+        HWPFXML xml = open(sampleFile);
+        assertNotNull(xml.getCoreProperties());
+        assertNotNull(xml.getExtendedProperties());
+
+        assertEquals("Microsoft Office Word", xml.getExtendedProperties().getApplication());
+        assertEquals(1315, xml.getExtendedProperties().getCharacters());
+        assertEquals(10, xml.getExtendedProperties().getLines());
+
+        assertEquals(null, xml.getCoreProperties().getTitleProperty().getValue());
+        assertEquals(null, xml.getCoreProperties().getSubjectProperty().getValue());
+    }
+
+    /**
+     * Checks the core and extended properties of the complex file
+     */
+    public void testMetadataComplex() throws Exception {
+        HWPFXML xml = open(complexFile);
+        assertNotNull(xml.getCoreProperties());
+        assertNotNull(xml.getExtendedProperties());
+
+        assertEquals("Microsoft Office Outlook", xml.getExtendedProperties().getApplication());
+        assertEquals(5184, xml.getExtendedProperties().getCharacters());
+        assertEquals(0, xml.getExtendedProperties().getLines());
+
+        assertEquals(" ", xml.getCoreProperties().getTitleProperty().getValue());
+        assertEquals(" ", xml.getCoreProperties().getSubjectProperty().getValue());
+    }
+}
--- /dev/null
+/* ====================================================================
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+==================================================================== */
+package org.apache.poi.hwpf.extractor;
+
+import java.io.File;
+
+import org.apache.poi.hwpf.HWPFXML;
+import org.apache.poi.hwpf.usermodel.HWPFXMLDocument;
+import org.apache.poi.hxf.HXFDocument;
+
+import junit.framework.TestCase;
+
+/**
+ * Tests for HXFWordExtractor
+ */
+public class TestHXFWordExtractor extends TestCase {
+    /** System property holding the directory that contains the test documents. */
+    private static final String TEST_DATA_PROPERTY = "HWPF.testdata.path";
+
+    /**
+     * A very simple file
+     */
+    private HWPFXML xmlA;
+    /**
+     * A fairly complex file
+     */
+    private HWPFXML xmlB;
+
+    /**
+     * Opens both test documents and wraps each in a {@link HWPFXML}.
+     */
+    @Override
+    protected void setUp() throws Exception {
+        super.setUp();
+
+        xmlA = new HWPFXML(HXFDocument.openPackage(testFile("sample.docx")));
+        xmlB = new HWPFXML(HXFDocument.openPackage(testFile("IllustrativeCases.docx")));
+    }
+
+    /**
+     * Builds the path of a test document inside the directory named by
+     * the {@code HWPF.testdata.path} system property.
+     *
+     * @param name file name of the test document
+     * @return the file under the test data directory
+     */
+    private static File testFile(String name) {
+        return new File(
+                System.getProperty(TEST_DATA_PROPERTY) +
+                File.separator + name
+        );
+    }
+
+    /**
+     * Counts the newline characters in the text. The tests use this as
+     * their measure of the number of paragraphs extracted.
+     *
+     * @param text the extracted text
+     * @return how many {@code '\n'} characters the text contains
+     */
+    private static int countNewlines(String text) {
+        int count = 0;
+        for (int i = 0; i < text.length(); i++) {
+            if (text.charAt(i) == '\n') {
+                count++;
+            }
+        }
+        return count;
+    }
+
+    /**
+     * Get text out of the simple file
+     */
+    public void testGetSimpleText() throws Exception {
+        // Both ways of constructing the extractor must work without throwing
+        new HXFWordExtractor(xmlA.getPackage());
+        new HXFWordExtractor(new HWPFXMLDocument(xmlA));
+
+        HXFWordExtractor extractor =
+                new HXFWordExtractor(xmlA.getPackage());
+        // First call's result is discarded on purpose; the text checked
+        //  below comes from a second call on the same extractor
+        extractor.getText();
+
+        String text = extractor.getText();
+        assertTrue(text.length() > 0);
+
+        // Check contents
+        assertTrue(text.startsWith(
+                "Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Nunc at risus vel erat tempus posuere. Aenean non ante. Suspendisse vehicula dolor sit amet odio."
+        ));
+        assertTrue(text.endsWith(
+                "Phasellus ultricies mi nec leo. Sed tempus. In sit amet lorem at velit faucibus vestibulum.\n"
+        ));
+
+        // Check number of paragraphs
+        assertEquals(3, countNewlines(text));
+    }
+
+    /**
+     * Tests getting the text out of a complex file
+     */
+    public void testGetComplexText() throws Exception {
+        HXFWordExtractor extractor =
+                new HXFWordExtractor(xmlB.getPackage());
+        // First call's result is discarded on purpose; the text checked
+        //  below comes from a second call on the same extractor
+        extractor.getText();
+
+        String text = extractor.getText();
+        assertTrue(text.length() > 0);
+
+        // The expected ending contains the euro sign
+        char euro = '\u20ac';
+
+        // Check contents
+        assertTrue(text.startsWith(
+                " \n(V) ILLUSTRATIVE CASES\n\n"
+        ));
+        assertTrue(text.endsWith(
+                "As well as gaining "+euro+"90 from child benefit increases, he will also receive the early childhood supplement of "+euro+"250 per quarter for Vincent for the full four quarters of the year.\n\n\n\n \n\n\n"
+        ));
+
+        // Check number of paragraphs
+        assertEquals(79, countNewlines(text));
+    }
+}