diff options
author | Nick Burch <nick@apache.org> | 2007-06-28 11:43:11 +0000 |
---|---|---|
committer | Nick Burch <nick@apache.org> | 2007-06-28 11:43:11 +0000 |
commit | f957997c235454061481aa2b0644715a047ff00b (patch) | |
tree | 33f5d8225e904d66f5d13d9801a62e95fbd5fccb | |
parent | 296705e0b6148b983c14adb412c1f70df7b6f902 (diff) | |
download | poi-f957997c235454061481aa2b0644715a047ff00b.tar.gz poi-f957997c235454061481aa2b0644715a047ff00b.zip |
Tag as 3.0.1-RC3 (tag: REL_3_0_1_RC3)
git-svn-id: https://svn.apache.org/repos/asf/poi/tags/REL_3_0_1_RC3@551531 13f79535-47bb-0310-9956-ffa450edef68
24 files changed, 522 insertions, 18 deletions
diff --git a/legal/NOTICE b/legal/NOTICE index d417a360a7..190d974632 100644 --- a/legal/NOTICE +++ b/legal/NOTICE @@ -1,5 +1,16 @@ -Apache Jakarta POI +Apache POI Copyright 2001-2007 The Apache Software Foundation This product includes software developed by The Apache Software Foundation (http://www.apache.org/). + + +Unit testing support is provided by JUnit, under the +Common Public License Version 1.0: + http://www.opensource.org/licenses/cpl.php +See http://www.junit.org/ + +Small parts of the POI component HDGF are based on VSDump, +and are under the GNU General Public Licence version 3 (GPL v3): + http://gplv3.fsf.org/ +See http://www.gnome.ru/projects/vsdump_en.html diff --git a/src/documentation/content/xdocs/book.xml b/src/documentation/content/xdocs/book.xml index 4666d7765a..a0f10c0dbe 100644 --- a/src/documentation/content/xdocs/book.xml +++ b/src/documentation/content/xdocs/book.xml @@ -39,6 +39,7 @@ <menu-item label="HWPF" href="hwpf/index.html"/> <menu-item label="HPSF" href="hpsf/index.html"/> <menu-item label="HSLF" href="hslf/index.html"/> + <menu-item label="HDGF" href="hdgf/index.html"/> <menu-item label="POI-Ruby" href="poi-ruby.html"/> <menu-item label="POI-Utils" href="utils/index.html"/> <menu-item label="Download" href="ext:download"/> diff --git a/src/documentation/content/xdocs/changes.xml b/src/documentation/content/xdocs/changes.xml index 3783e8428f..697395f8fd 100644 --- a/src/documentation/content/xdocs/changes.xml +++ b/src/documentation/content/xdocs/changes.xml @@ -35,7 +35,7 @@ <person id="YK" name="Yegor Kozlov" email="yegor@apache.org"/> </devs> - <release version="3.0.1-FINAL" date="2007-06-15"> + <release version="3.0.1-FINAL" date="2007-07-05"> <action dev="POI-DEVELOPERS" type="fix">Administrative updates to the Maven POMs, and the release artificat build process</action> <action dev="POI-DEVELOPERS" type="fix">23951 - [PATCH] Fix for HSSF setSheetOrder and tab names</action> <action dev="POI-DEVELOPERS" type="fix">42524 - 
[PATCH] Better HSLF support for problem shape groups</action> @@ -44,6 +44,9 @@ <action dev="POI-DEVELOPERS" type="add">Additional HSLF support for Title and Slide Master Sheets</action> <action dev="POI-DEVELOPERS" type="fix">42474 - [PATCH] Improved HSLF note to slide matching, and a NPE</action> <action dev="POI-DEVELOPERS" type="fix">42481 - [PATCH] Tweak some HSLF exceptions, to make it clearer what you're catching</action> + <action dev="POI-DEVELOPERS" type="fix">42667 - [PATCH] Fix for HSLF writing of files with tables</action> + <action dev="POI-DEVELOPERS" type="add">Improved way of detecting HSSF cells that contain dates, isADateFormat</action> + <action dev="POI-DEVELOPERS" type="add">Initial, read-only support for Visio documents, as HDGF</action> </release> <release version="3.0-FINAL" date="2007-05-18"> diff --git a/src/documentation/content/xdocs/hdgf/book.xml b/src/documentation/content/xdocs/hdgf/book.xml new file mode 100644 index 0000000000..fb37a33a75 --- /dev/null +++ b/src/documentation/content/xdocs/hdgf/book.xml @@ -0,0 +1,34 @@ +<?xml version="1.0"?> +<!-- + ==================================================================== + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+ ==================================================================== +--> +<!DOCTYPE book PUBLIC "-//APACHE//DTD Cocoon Documentation Book V1.0//EN" "../dtd/book-cocoon-v10.dtd"> + +<book software="POI Project" + title="HGDF" + copyright="@year@ POI Project"> + + <menu label="Apache POI"> + <menu-item label="Top" href="../index.html"/> + </menu> + + <menu label="HDGF"> + <menu-item label="Overview" href="index.html"/> + </menu> + +</book> diff --git a/src/documentation/content/xdocs/hdgf/index.xml b/src/documentation/content/xdocs/hdgf/index.xml new file mode 100755 index 0000000000..f14bb1e766 --- /dev/null +++ b/src/documentation/content/xdocs/hdgf/index.xml @@ -0,0 +1,98 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + ==================================================================== + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+ ==================================================================== +--> +<!DOCTYPE document PUBLIC "-//APACHE//DTD Documentation V1.1//EN" "../dtd/document-v11.dtd"> + +<document> + <header> + <title>POI-HDGF - Java API To Access Microsoft Visio Format Files</title> + <subtitle>Overview</subtitle> + <authors> + <person name="Nick Burch" email="nick at apache dot org"/> + </authors> + </header> + + <body> + <section> + <title>Overview</title> + + <p>HDGF is the POI Project's pure Java implementation of the Visio file format.</p> + <p>Currently, HDGF provides a low-level, read-only api for + accessing Visio documents. It also provides a + <link href="http://svn.apache.org/repos/asf/poi/trunk/src/scratchpad/src/org/apache/poi/hdgf/extractor/">way</link> + to extract the textual content from a file. + </p> + <p>At this time, there is no <em>usermodel</em> api or similar, + only low level access to the streams, chunks and chunk commands. + Users are advised to check the unit tests to see how everything + works. They are also well advised to read the documentation + supplied with + <link href="http://www.gnome.ru/projects/vsdump_en.html">vsdump</link> + to get a feel for how Visio files are structured.</p> + <p>To get a feel for the contents of a file, and to track down + where data of interest is stored, HDGF comes with + <link href="http://svn.apache.org/repos/asf/poi/trunk/src/scratchpad/src/org/apache/poi/hdgf/dev/">VSDDumper</link> + to print out the contents of the file. Users should also make + use of + <link href="http://www.gnome.ru/projects/vsdump_en.html">vsdump</link> + to probe the structure of files.</p> + <note> + This code currently lives the + <link href="http://svn.apache.org/viewcvs.cgi/poi/trunk/src/scratchpad/">scratchpad area</link> + of the POI SVN repository. + Ensure that you have the scratchpad jar or the scratchpad + build area in your + classpath before experimenting with this code. 
+ </note> + + <section> + <title>Steps required for write support</title> + <p>Currently, HDGF is only able to read visio files, it is + not able to write them back out again. We believe the + following are the steps that would need to be taken to + implement it.</p> + <ol> + <li>Re-write the decompression support in LZW4HDGF to be + less opaque, and also under the ASL.</li> + <li>Add compression support to the new LZw4HDGF.</li> + <li>Have HDGF just write back the raw bytes it read in, and + have a test to ensure the file is un-changed.</li> + <li>Have HDGF generate the bytes to write out from the + Stream stores, using the compressed data as appropriate, + without re-compressing. Plus test to ensure file is + un-changed.</li> + <li>Have HDGF generate the bytes to write out from the + Stream stores, re-compressing any streams that were + decompressed. Plus test to ensure file is un-changed.</li> + <li>Have HDGF re-generate the offsets in pointers for the + locations of the streams. Plus test to ensure file is + un-changed.</li> + <li>Have HDGF re-generate the bytes for all the chunks, from + the chunk commands. 
Tests to ensure the chunks are + serialized properly, and then that the file is un-changed</li> + <li>Alter the data of one command, but keep it the same + length, and check visio can open the file when written + out.</li> + <li>Alter the data of one command, to a new length, and + check that visio can open the file when written out.</li> + </ol> + </section> + </section> + </body> +</document> diff --git a/src/documentation/content/xdocs/hslf/book.xml b/src/documentation/content/xdocs/hslf/book.xml index 0eb4f8cb18..8ccf5c1bc4 100644 --- a/src/documentation/content/xdocs/hslf/book.xml +++ b/src/documentation/content/xdocs/hslf/book.xml @@ -20,7 +20,7 @@ <!DOCTYPE book PUBLIC "-//APACHE//DTD Cocoon Documentation Book V1.0//EN" "../dtd/book-cocoon-v10.dtd"> <book software="POI Project" - title="HSSF" + title="HSLF" copyright="@year@ POI Project"> <menu label="Apache POI"> diff --git a/src/documentation/content/xdocs/hslf/index.xml b/src/documentation/content/xdocs/hslf/index.xml index 779a279d16..16a3885d82 100755 --- a/src/documentation/content/xdocs/hslf/index.xml +++ b/src/documentation/content/xdocs/hslf/index.xml @@ -34,12 +34,12 @@ <title>Overview</title> <p>HSLF is the POI Project's pure Java implementation of the Powerpoint file format.</p> - <p>HSSF provides a way to read powerpoint presentations, and extract text from it. + <p>HSLF provides a way to read powerpoint presentations, and extract text from it. It also provides some (currently limited) edit capabilities. </p> <note> This code currently lives the - <link href="http://svn.apache.org/viewcvs.cgi/jakarta/poi/trunk/src/scratchpad/">scratchpad area</link> + <link href="http://svn.apache.org/viewcvs.cgi/poi/trunk/src/scratchpad/">scratchpad area</link> of the POI SVN repository. 
Ensure that you have the scratchpad jar or the scratchpad build area in your diff --git a/src/documentation/content/xdocs/hssf/how-to.xml b/src/documentation/content/xdocs/hssf/how-to.xml index cc578afec5..a4ac41209d 100644 --- a/src/documentation/content/xdocs/hssf/how-to.xml +++ b/src/documentation/content/xdocs/hssf/how-to.xml @@ -460,7 +460,7 @@ some of the rows or cells. It can be found at <code>/src/scratchpad/examples/src/org/apache/poi/hssf/eventusermodel/examples/XLS2CSVmra.java</code>, and may be called on the command line, or from within your own code. The latest version is always available from -<link href="http://svn.apache.org/repos/asf/jakarta/poi/trunk/src/scratchpad/examples/src/org/apache/poi/hssf/eventusermodel/examples/">subversion</link>. +<link href="http://svn.apache.org/repos/asf/poi/trunk/src/scratchpad/examples/src/org/apache/poi/hssf/eventusermodel/examples/">subversion</link>. </p> <p> <em>This code is currently in the scratchpad section, so you will either diff --git a/src/documentation/content/xdocs/hwpf/index.xml b/src/documentation/content/xdocs/hwpf/index.xml index 1268facbee..1556869617 100644 --- a/src/documentation/content/xdocs/hwpf/index.xml +++ b/src/documentation/content/xdocs/hwpf/index.xml @@ -38,7 +38,7 @@ to pure Java.</p> <p>HWPF is still in early development. It is in the <link - href="http://svn.apache.org/viewcvs.cgi/jakarta/poi/trunk/src/scratchpad/"> + href="http://svn.apache.org/viewcvs.cgi/poi/trunk/src/scratchpad/"> scratchpad section of the SVN.</link> You will need to ensure you either have a recent SVN checkout, or a recent SVN nightly build (including the scratchpad jar!)</p> diff --git a/src/documentation/content/xdocs/hwpf/quick-guide.xml b/src/documentation/content/xdocs/hwpf/quick-guide.xml index 197922f07e..bf046258e7 100644 --- a/src/documentation/content/xdocs/hwpf/quick-guide.xml +++ b/src/documentation/content/xdocs/hwpf/quick-guide.xml @@ -30,7 +30,7 @@ <body> <p>HWPF is still in early development. 
It is in the <link - href="http://svn.apache.org/viewcvs.cgi/jakarta/poi/trunk/src/scratchpad/"> + href="http://svn.apache.org/viewcvs.cgi/poi/trunk/src/scratchpad/"> scratchpad section of the SVN.</link> You will need to ensure you either have a recent SVN checkout, or a recent SVN nightly build (including the scratchpad jar!)</p> @@ -68,7 +68,7 @@ can then get text and other properties. <section><title>Further Examples</title> <p>For now, the best source of additional examples is in the unit tests. <link - href="http://svn.apache.org/viewvc/jakarta/poi/trunk/src/scratchpad/testcases/org/apache/poi/hwpf/"> + href="http://svn.apache.org/viewvc/poi/trunk/src/scratchpad/testcases/org/apache/poi/hwpf/"> Browse the HWPF unit tests.</link> </p> </section> diff --git a/src/documentation/content/xdocs/index.xml b/src/documentation/content/xdocs/index.xml index da2d5ed3a8..5968f8084a 100644 --- a/src/documentation/content/xdocs/index.xml +++ b/src/documentation/content/xdocs/index.xml @@ -38,6 +38,10 @@ <link href="http://www.apache.org/dyn/closer.cgi/poi/release/">download</link> the source and binaries from your <link href="http://www.apache.org/dyn/closer.cgi/poi/release/">local mirror</link>.</p> + <p>We would also like to confirm that verion 3.0 of Apache POI does + <em>not</em> contain any viruses. Users of broken virus checkers + which do detect a 94 byte file, sci_cec.db, as containing one are + advised to contact their vendor for a fix.</p> </section> <section><title>Purpose</title> @@ -107,12 +111,19 @@ development. Jump in!</p> </section> <section><title>HSLF for PowerPoint Documents</title> - <p>HWSL is our port of the Microsoft PowerPoint 97(-2003) file format to pure + <p>HSLF is our port of the Microsoft PowerPoint 97(-2003) file format to pure Java. It supports read and write capabilities of some, but not yet all of the core records. 
Please see <link href="./hslf/index.html">the HSLF project page for more information</link>.</p> </section> + <section><title>HDGF for Visio Documents</title> + <p>HDGF is our port of the Microsoft Viso 97(-2003) file format to pure + Java. It currently only supports reading at a very low level, and + simple text extraction. Please see <link + href="./hdgf/index.html">the HDGF project page for more + information</link>.</p> + </section> <section><title>HPSF for Document Properties</title> <p>HPSF is our port of the OLE 2 property set format to pure Java. Property sets are mostly use to store a document's properties diff --git a/src/documentation/content/xdocs/status.xml b/src/documentation/content/xdocs/status.xml index ef5c5aaeb4..b236f22880 100644 --- a/src/documentation/content/xdocs/status.xml +++ b/src/documentation/content/xdocs/status.xml @@ -32,7 +32,7 @@ </developers> <changes> - <release version="3.0.1-FINAL" date="2007-06-15"> + <release version="3.0.1-FINAL" date="2007-07-05"> <action dev="POI-DEVELOPERS" type="fix">Administrative updates to the Maven POMs, and the release artificat build process</action> <action dev="POI-DEVELOPERS" type="fix">23951 - [PATCH] Fix for HSSF setSheetOrder and tab names</action> <action dev="POI-DEVELOPERS" type="fix">42524 - [PATCH] Better HSLF support for problem shape groups</action> @@ -41,6 +41,9 @@ <action dev="POI-DEVELOPERS" type="add">Additional HSLF support for Title and Slide Master Sheets</action> <action dev="POI-DEVELOPERS" type="fix">42474 - [PATCH] Improved HSLF note to slide matching, and a NPE</action> <action dev="POI-DEVELOPERS" type="fix">42481 - [PATCH] Tweak some HSLF exceptions, to make it clearer what you're catching</action> + <action dev="POI-DEVELOPERS" type="fix">42667 - [PATCH] Fix for HSLF writing of files with tables</action> + <action dev="POI-DEVELOPERS" type="add">Improved way of detecting HSSF cells that contain dates, isADateFormat</action> + <action dev="POI-DEVELOPERS" 
type="add">Initial, read-only support for Visio documents, as HDGF</action> </release> <release version="3.0-FINAL" date="2007-05-18"> diff --git a/src/scratchpad/src/org/apache/poi/hdgf/chunks/Chunk.java b/src/scratchpad/src/org/apache/poi/hdgf/chunks/Chunk.java index 54c37b3e83..5928927c4d 100644 --- a/src/scratchpad/src/org/apache/poi/hdgf/chunks/Chunk.java +++ b/src/scratchpad/src/org/apache/poi/hdgf/chunks/Chunk.java @@ -20,6 +20,9 @@ import java.util.ArrayList; import org.apache.poi.hdgf.chunks.ChunkFactory.CommandDefinition; import org.apache.poi.util.LittleEndian; +import org.apache.poi.util.POILogFactory; +import org.apache.poi.util.POILogger; +import org.apache.poi.util.StringUtil; /** * Base of all chunks, which hold data, flags etc @@ -44,6 +47,9 @@ public class Chunk { /** The name of the chunk, as found from the commandDefinitions */ private String name; + /** For logging warnings about the structure of the file */ + private POILogger logger = POILogFactory.getLogger(Chunk.class); + public Chunk(ChunkHeader header, ChunkTrailer trailer, ChunkSeparator separator, byte[] contents) { this.header = header; this.trailer = trailer; @@ -148,7 +154,9 @@ public class Chunk { // Check we seem to have enough data if(offset >= contents.length) { - System.err.println("Command offset " + offset + " past end of data at " + contents.length); + logger.log(POILogger.WARN, + "Command offset " + offset + " past end of data at " + contents.length + ); continue; } @@ -167,9 +175,27 @@ public class Chunk { LittleEndian.getDouble(contents, offset) ); break; + case 12: + // A Little Endian String + // Starts 8 bytes into the data segment + // Ends at end of data, or 00 00 + int startsAt = 8; + int endsAt = startsAt; + for(int j=startsAt; j<contents.length-1 && endsAt == startsAt; j++) { + if(contents[j] == 0 && contents[j+1] == 0) { + endsAt = j; + } + } + if(endsAt == startsAt) { + endsAt = contents.length; + } + + int strLen = (endsAt-startsAt) / 2; + command.value = 
StringUtil.getFromUnicodeLE(contents, startsAt, strLen); + break; case 25: command.value = new Short( - LittleEndian.getShort(contents, offset) + LittleEndian.getShort(contents, offset) ); break; case 26: @@ -188,7 +214,8 @@ public class Chunk { break; default: - //System.err.println("Warning - Command of type " + type + " not processed!"); + logger.log(POILogger.INFO, + "Command of type " + type + " not processed!"); } // Add to the array diff --git a/src/scratchpad/src/org/apache/poi/hdgf/chunks/ChunkFactory.java b/src/scratchpad/src/org/apache/poi/hdgf/chunks/ChunkFactory.java index efac0d3574..fe0fc91a4d 100644 --- a/src/scratchpad/src/org/apache/poi/hdgf/chunks/ChunkFactory.java +++ b/src/scratchpad/src/org/apache/poi/hdgf/chunks/ChunkFactory.java @@ -24,6 +24,9 @@ import java.util.ArrayList; import java.util.Hashtable; import java.util.StringTokenizer; +import org.apache.poi.util.POILogFactory; +import org.apache.poi.util.POILogger; + /** * Factor class to create the appropriate chunks, which * needs the version of the file to process the chunk header @@ -42,6 +45,9 @@ public class ChunkFactory { private static String chunkTableName = "/org/apache/poi/hdgf/chunks/chunks_parse_cmds.tbl"; + /** For logging problems we spot with the file */ + private POILogger logger = POILogFactory.getLogger(ChunkFactory.class); + public ChunkFactory(int version) throws IOException { this.version = version; @@ -107,7 +113,8 @@ public class ChunkFactory { // Check we have enough data, and tweak the header size // as required if(endOfDataPos > data.length) { - System.err.println("Header called for " + header.getLength() +" bytes, but that would take us passed the end of the data!"); + logger.log(POILogger.WARN, + "Header called for " + header.getLength() +" bytes, but that would take us passed the end of the data!"); endOfDataPos = data.length; header.length = data.length - offset - header.getSizeInBytes(); diff --git 
a/src/scratchpad/src/org/apache/poi/hdgf/chunks/ChunkHeaderV11.java b/src/scratchpad/src/org/apache/poi/hdgf/chunks/ChunkHeaderV11.java index 51eca5649c..c77a249204 100644 --- a/src/scratchpad/src/org/apache/poi/hdgf/chunks/ChunkHeaderV11.java +++ b/src/scratchpad/src/org/apache/poi/hdgf/chunks/ChunkHeaderV11.java @@ -24,6 +24,10 @@ public class ChunkHeaderV11 extends ChunkHeaderV6 { * Does the chunk have a separator? */ public boolean hasSeparator() { + // For some reason, there are two types that don't have a + // separator despite the flags that indicate they do + if(type == 0x1f || type == 0xc9) { return false; } + // If there's a trailer, there's a separator if(hasTrailer()) { return true; } diff --git a/src/scratchpad/src/org/apache/poi/hdgf/chunks/ChunkSeparator.java b/src/scratchpad/src/org/apache/poi/hdgf/chunks/ChunkSeparator.java index 7098f17ea6..5ce4097446 100644 --- a/src/scratchpad/src/org/apache/poi/hdgf/chunks/ChunkSeparator.java +++ b/src/scratchpad/src/org/apache/poi/hdgf/chunks/ChunkSeparator.java @@ -27,4 +27,8 @@ public class ChunkSeparator { separatorData = new byte[4]; System.arraycopy(data, offset, separatorData, 0, 4); } + + public String toString() { + return "<ChunkSeparator of length " + separatorData.length + ">"; + } } diff --git a/src/scratchpad/src/org/apache/poi/hdgf/chunks/ChunkTrailer.java b/src/scratchpad/src/org/apache/poi/hdgf/chunks/ChunkTrailer.java index a610b49b14..a590732466 100644 --- a/src/scratchpad/src/org/apache/poi/hdgf/chunks/ChunkTrailer.java +++ b/src/scratchpad/src/org/apache/poi/hdgf/chunks/ChunkTrailer.java @@ -26,4 +26,8 @@ public class ChunkTrailer { trailerData = new byte[8]; System.arraycopy(data, offset, trailerData, 0, 8); } + + public String toString() { + return "<ChunkTrailer of length " + trailerData.length + ">"; + } } diff --git a/src/scratchpad/src/org/apache/poi/hdgf/dev/VSDDumper.java b/src/scratchpad/src/org/apache/poi/hdgf/dev/VSDDumper.java index 3c20e4f3ff..614b9259a0 100644 --- 
a/src/scratchpad/src/org/apache/poi/hdgf/dev/VSDDumper.java +++ b/src/scratchpad/src/org/apache/poi/hdgf/dev/VSDDumper.java @@ -70,6 +70,11 @@ public class VSDDumper { " - " + Integer.toHexString(ptr.getFormat())); System.out.println(ind + " Length is\t" + ptr.getLength() + " - " + Integer.toHexString(ptr.getLength())); + if(ptr.destinationCompressed()) { + int decompLen = stream._getContentsLength(); + System.out.println(ind + " DC.Length is\t" + decompLen + + " - " + Integer.toHexString(decompLen)); + } System.out.println(ind + " Compressed is\t" + ptr.destinationCompressed()); System.out.println(ind + " Stream is\t" + stream.getClass().getName()); @@ -100,6 +105,9 @@ public class VSDDumper { for(int i=0; i<cs.getChunks().length; i++) { Chunk chunk = cs.getChunks()[i]; System.out.println(ind2 + "" + chunk.getName()); + System.out.println(ind2 + " Length is " + chunk._getContents().length + " (" + Integer.toHexString(chunk._getContents().length) + ")"); + System.out.println(ind2 + " OD Size is " + chunk.getOnDiskSize() + " (" + Integer.toHexString(chunk.getOnDiskSize()) + ")"); + System.out.println(ind2 + " T / S is " + chunk.getTrailer() + " / " + chunk.getSeparator()); System.out.println(ind2 + " Holds " + chunk.getCommands().length + " commands"); for(int j=0; j<chunk.getCommands().length; j++) { Command command = chunk.getCommands()[j]; diff --git a/src/scratchpad/src/org/apache/poi/hdgf/extractor/VisioTextExtractor.java b/src/scratchpad/src/org/apache/poi/hdgf/extractor/VisioTextExtractor.java new file mode 100644 index 0000000000..b2c4ee37f9 --- /dev/null +++ b/src/scratchpad/src/org/apache/poi/hdgf/extractor/VisioTextExtractor.java @@ -0,0 +1,114 @@ +/* ==================================================================== + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. 
+ The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +==================================================================== */ +package org.apache.poi.hdgf.extractor; + +import java.io.FileInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.util.ArrayList; + +import org.apache.poi.hdgf.HDGFDiagram; +import org.apache.poi.hdgf.chunks.Chunk.Command; +import org.apache.poi.hdgf.streams.ChunkStream; +import org.apache.poi.hdgf.streams.PointerContainingStream; +import org.apache.poi.hdgf.streams.Stream; +import org.apache.poi.poifs.filesystem.POIFSFileSystem; + +/** + * Class to find all the text in a Visio file, and return it. + * Can opperate on the command line (outputs to stdout), or + * can return the text for you (eg for use with Lucene). + */ +public class VisioTextExtractor { + private HDGFDiagram hdgf; + private POIFSFileSystem fs; + + public VisioTextExtractor(HDGFDiagram hdgf) { + this.hdgf = hdgf; + } + public VisioTextExtractor(POIFSFileSystem fs) throws IOException { + this(new HDGFDiagram(fs)); + this.fs = fs; + } + public VisioTextExtractor(InputStream inp) throws IOException { + this(new POIFSFileSystem(inp)); + } + + /** + * Locates all the text entries in the file, and returns their + * contents. 
+ */ + public String[] getAllText() { + ArrayList text = new ArrayList(); + for(int i=0; i<hdgf.getTopLevelStreams().length; i++) { + findText(hdgf.getTopLevelStreams()[i], text); + } + return (String[])text.toArray( new String[text.size()] ); + } + private void findText(Stream stream, ArrayList text) { + if(stream instanceof PointerContainingStream) { + PointerContainingStream ps = (PointerContainingStream)stream; + for(int i=0; i<ps.getPointedToStreams().length; i++) { + findText(ps.getPointedToStreams()[i], text); + } + } + if(stream instanceof ChunkStream) { + ChunkStream cs = (ChunkStream)stream; + for(int i=0; i<cs.getChunks().length; i++) { + if(cs.getChunks()[i] != null && + cs.getChunks()[i].getName() != null && + cs.getChunks()[i].getName().equals("Text")) { + // First command + Command cmd = cs.getChunks()[i].getCommands()[0]; + if(cmd != null && cmd.getValue() != null) { + text.add( cmd.getValue().toString() ); + } + } + } + } + } + + /** + * Returns the textual contents of the file. 
+ */ + public String getText() { + StringBuffer text = new StringBuffer(); + String[] allText = getAllText(); + for(int i=0; i<allText.length; i++) { + text.append(allText[i]); + if(!allText[i].endsWith("\r") && + !allText[i].endsWith("\n")) { + text.append("\n"); + } + } + return text.toString(); + } + + public static void main(String[] args) throws Exception { + if(args.length == 0) { + System.err.println("Use:"); + System.err.println(" VisioTextExtractor <file.vsd>"); + System.exit(1); + } + + VisioTextExtractor extractor = + new VisioTextExtractor(new FileInputStream(args[0])); + + // Print not PrintLn as already has \n added to it + System.out.print(extractor.getText()); + } +} diff --git a/src/scratchpad/src/org/apache/poi/hdgf/streams/ChunkStream.java b/src/scratchpad/src/org/apache/poi/hdgf/streams/ChunkStream.java index 75b6beefd1..a59fe43ff9 100644 --- a/src/scratchpad/src/org/apache/poi/hdgf/streams/ChunkStream.java +++ b/src/scratchpad/src/org/apache/poi/hdgf/streams/ChunkStream.java @@ -43,6 +43,11 @@ public class ChunkStream extends Stream { public void findChunks() { ArrayList chunksA = new ArrayList(); + if(getPointer().getOffset() == 0x64b3) { + int i = 0; + i++; + } + int pos = 0; byte[] contents = getStore().getContents(); while(pos < contents.length) { diff --git a/src/scratchpad/src/org/apache/poi/hdgf/streams/Stream.java b/src/scratchpad/src/org/apache/poi/hdgf/streams/Stream.java index 35aa7e5291..163fa83d9a 100644 --- a/src/scratchpad/src/org/apache/poi/hdgf/streams/Stream.java +++ b/src/scratchpad/src/org/apache/poi/hdgf/streams/Stream.java @@ -83,7 +83,7 @@ public abstract class Stream { return new ChunkStream(pointer, store, chunkFactory); } else if(pointer.destinationHasStrings()) { - return new StringsStream(pointer, store); + return new StringsStream(pointer, store, chunkFactory); } // Give up and return a generic one diff --git a/src/scratchpad/src/org/apache/poi/hdgf/streams/StringsStream.java 
b/src/scratchpad/src/org/apache/poi/hdgf/streams/StringsStream.java index 2688b156e9..b23ff92149 100644 --- a/src/scratchpad/src/org/apache/poi/hdgf/streams/StringsStream.java +++ b/src/scratchpad/src/org/apache/poi/hdgf/streams/StringsStream.java @@ -16,13 +16,16 @@ limitations under the License. ==================================================================== */ package org.apache.poi.hdgf.streams; +import org.apache.poi.hdgf.chunks.ChunkFactory; import org.apache.poi.hdgf.pointers.Pointer; /** - * A Stream which holds Strings + * A Stream which holds Strings. This is just another kind + * of ChunkStream, it seems */ public class StringsStream extends Stream { - protected StringsStream(Pointer pointer, StreamStore store) { + protected StringsStream(Pointer pointer, StreamStore store, ChunkFactory chunkFactory) { super(pointer, store); +// super(pointer, store, chunkFactory); } } diff --git a/src/scratchpad/testcases/org/apache/poi/hdgf/extractor/TestVisioExtractor.java b/src/scratchpad/testcases/org/apache/poi/hdgf/extractor/TestVisioExtractor.java new file mode 100644 index 0000000000..a6541e9b0d --- /dev/null +++ b/src/scratchpad/testcases/org/apache/poi/hdgf/extractor/TestVisioExtractor.java @@ -0,0 +1,107 @@ +/* ==================================================================== + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+   See the License for the specific language governing permissions and
+   limitations under the License.
+==================================================================== */
+package org.apache.poi.hdgf.extractor;
+
+import java.io.ByteArrayOutputStream;
+import java.io.FileInputStream;
+import java.io.PrintStream;
+
+import junit.framework.TestCase;
+
+import org.apache.poi.hdgf.HDGFDiagram;
+import org.apache.poi.hdgf.chunks.Chunk;
+import org.apache.poi.hdgf.chunks.ChunkFactory;
+import org.apache.poi.hdgf.pointers.Pointer;
+import org.apache.poi.hdgf.pointers.PointerFactory;
+import org.apache.poi.hssf.record.formula.eval.StringOperationEval;
+import org.apache.poi.poifs.filesystem.DocumentEntry;
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;
+
+public class TestVisioExtractor extends TestCase {
+    private String filename;
+    protected void setUp() throws Exception {
+        String dirname = System.getProperty("HDGF.testdata.path");
+        filename = dirname + "/Test_Visio-Some_Random_Text.vsd";
+    }
+
+    /**
+     * Test the 3 different ways of creating one
+     */
+    public void testCreation() throws Exception {
+        VisioTextExtractor extractor;
+
+        extractor = new VisioTextExtractor(new FileInputStream(filename));
+        assertNotNull(extractor);
+        assertNotNull(extractor.getAllText());
+        assertEquals(3, extractor.getAllText().length);
+
+        extractor = new VisioTextExtractor(
+                new POIFSFileSystem(
+                        new FileInputStream(filename)
+                )
+        );
+        assertNotNull(extractor);
+        assertNotNull(extractor.getAllText());
+        assertEquals(3, extractor.getAllText().length);
+
+        extractor = new VisioTextExtractor(
+                new HDGFDiagram(
+                        new POIFSFileSystem(
+                                new FileInputStream(filename)
+                        )
+                )
+        );
+        assertNotNull(extractor);
+        assertNotNull(extractor.getAllText());
+        assertEquals(3, extractor.getAllText().length);
+    }
+
+    public void testExtraction() throws Exception {
+        VisioTextExtractor extractor =
+            new VisioTextExtractor(new FileInputStream(filename));
+
+        // Check the array fetch
+        String[] text = extractor.getAllText();
+        assertNotNull(text);
+        assertEquals(3, text.length);
+
+        assertEquals("Test View\n", text[0]);
+        assertEquals("I am a test view\n", text[1]);
+        assertEquals("Some random text, on a page\n", text[2]);
+
+        // And the all-in fetch
+        String textS = extractor.getText();
+        assertEquals("Test View\nI am a test view\nSome random text, on a page\n", textS);
+    }
+
+    public void testMain() throws Exception {
+        PrintStream oldOut = System.out;
+        ByteArrayOutputStream baos = new ByteArrayOutputStream();
+        PrintStream capture = new PrintStream(baos);
+        System.setOut(capture);
+
+        VisioTextExtractor.main(new String[] {filename});
+
+        // Put things back
+        System.setOut(oldOut);
+
+        // Check
+        capture.flush();
+        String text = baos.toString();
+        assertEquals("Test View\nI am a test view\nSome random text, on a page\n", text);
+    }
+}
diff --git a/src/scratchpad/testcases/org/apache/poi/hdgf/streams/TestStreamComplex.java b/src/scratchpad/testcases/org/apache/poi/hdgf/streams/TestStreamComplex.java
index c2d03f0c89..5ea21d1a1c 100644
--- a/src/scratchpad/testcases/org/apache/poi/hdgf/streams/TestStreamComplex.java
+++ b/src/scratchpad/testcases/org/apache/poi/hdgf/streams/TestStreamComplex.java
@@ -18,6 +18,7 @@ package org.apache.poi.hdgf.streams;
 
 import java.io.FileInputStream;
 
+import org.apache.poi.hdgf.chunks.Chunk;
 import org.apache.poi.hdgf.chunks.ChunkFactory;
 import org.apache.poi.hdgf.pointers.Pointer;
 import org.apache.poi.hdgf.pointers.PointerFactory;
@@ -202,4 +203,63 @@ public class TestStreamComplex extends StreamTest {
         assertTrue(s8451.getPointedToStreams()[0] instanceof StringsStream);
         assertTrue(s8451.getPointedToStreams()[1] instanceof StringsStream);
     }
+
+    public void testChunkWithText() throws Exception {
+        // Parent ChunkStream is at 0x7194
+        // This is one of the last children of the trailer
+        Pointer trailerPtr = ptrFactory.createPointer(contents, trailerPointerAt);
+        TrailerStream ts = (TrailerStream)
+            Stream.createStream(trailerPtr, contents, chunkFactory, ptrFactory);
+
+        ts.findChildren(contents);
+
+        assertNotNull(ts.getChildPointers());
+        assertNotNull(ts.getPointedToStreams());
+        assertEquals(20, ts.getChildPointers().length);
+        assertEquals(20, ts.getPointedToStreams().length);
+
+        assertEquals(0x7194, ts.getChildPointers()[13].getOffset());
+        assertEquals(0x7194, ts.getPointedToStreams()[13].getPointer().getOffset());
+
+        PointerContainingStream ps7194 = (PointerContainingStream)
+            ts.getPointedToStreams()[13];
+
+        // First child is at 0x64b3
+        assertEquals(0x64b3, ps7194.getChildPointers()[0].getOffset());
+        assertEquals(0x64b3, ps7194.getPointedToStreams()[0].getPointer().getOffset());
+
+        ChunkStream cs = (ChunkStream)ps7194.getPointedToStreams()[0];
+
+        // Should be 26bc bytes un-compressed
+        assertEquals(0x26bc, cs.getStore().getContents().length);
+        // And should have lots of children
+        assertEquals(131, cs.getChunks().length);
+
+        // One of which is Text
+        boolean hasText = false;
+        for(int i=0; i<cs.getChunks().length; i++) {
+            if(cs.getChunks()[i].getName().equals("Text")) {
+                hasText = true;
+            }
+        }
+        assertTrue(hasText);
+        // Which is the 72nd command
+        assertEquals("Text", cs.getChunks()[72].getName());
+
+        Chunk text = cs.getChunks()[72];
+        assertEquals("Text", text.getName());
+
+        // Which contains our text
+        assertEquals(1, text.getCommands().length);
+        assertEquals("Test View\n", text.getCommands()[0].getValue());
+
+
+        // Almost at the end is some more text
+        assertEquals("Text", cs.getChunks()[128].getName());
+        text = cs.getChunks()[128];
+        assertEquals("Text", text.getName());
+
+        assertEquals(1, text.getCommands().length);
+        assertEquals("Some random text, on a page\n", text.getCommands()[0].getValue());
+    }
 }
|