]> source.dussan.org Git - poi.git/commitdiff
Tag as 3.0.1-RC3 tags/REL_3_0_1_RC3 REL_3_0_1_RC3
authorNick Burch <nick@apache.org>
Thu, 28 Jun 2007 11:43:11 +0000 (11:43 +0000)
committerNick Burch <nick@apache.org>
Thu, 28 Jun 2007 11:43:11 +0000 (11:43 +0000)
git-svn-id: https://svn.apache.org/repos/asf/poi/tags/REL_3_0_1_RC3@551531 13f79535-47bb-0310-9956-ffa450edef68

24 files changed:
legal/NOTICE
src/documentation/content/xdocs/book.xml
src/documentation/content/xdocs/changes.xml
src/documentation/content/xdocs/hdgf/book.xml [new file with mode: 0644]
src/documentation/content/xdocs/hdgf/index.xml [new file with mode: 0755]
src/documentation/content/xdocs/hslf/book.xml
src/documentation/content/xdocs/hslf/index.xml
src/documentation/content/xdocs/hssf/how-to.xml
src/documentation/content/xdocs/hwpf/index.xml
src/documentation/content/xdocs/hwpf/quick-guide.xml
src/documentation/content/xdocs/index.xml
src/documentation/content/xdocs/status.xml
src/scratchpad/src/org/apache/poi/hdgf/chunks/Chunk.java
src/scratchpad/src/org/apache/poi/hdgf/chunks/ChunkFactory.java
src/scratchpad/src/org/apache/poi/hdgf/chunks/ChunkHeaderV11.java
src/scratchpad/src/org/apache/poi/hdgf/chunks/ChunkSeparator.java
src/scratchpad/src/org/apache/poi/hdgf/chunks/ChunkTrailer.java
src/scratchpad/src/org/apache/poi/hdgf/dev/VSDDumper.java
src/scratchpad/src/org/apache/poi/hdgf/extractor/VisioTextExtractor.java [new file with mode: 0644]
src/scratchpad/src/org/apache/poi/hdgf/streams/ChunkStream.java
src/scratchpad/src/org/apache/poi/hdgf/streams/Stream.java
src/scratchpad/src/org/apache/poi/hdgf/streams/StringsStream.java
src/scratchpad/testcases/org/apache/poi/hdgf/extractor/TestVisioExtractor.java [new file with mode: 0644]
src/scratchpad/testcases/org/apache/poi/hdgf/streams/TestStreamComplex.java

index d417a360a77f11b3a3f7933f513de8bae3d97488..190d974632a470a0b4e72403e29378e88e65fef6 100644 (file)
@@ -1,5 +1,16 @@
-Apache Jakarta POI
+Apache POI
 Copyright 2001-2007 The Apache Software Foundation
 
 This product includes software developed by
 The Apache Software Foundation (http://www.apache.org/).
+
+
+Unit testing support is provided by JUnit, under the 
+Common Public License Version 1.0: 
+       http://www.opensource.org/licenses/cpl.php
+See http://www.junit.org/
+
+Small parts of the POI component HDGF are based on VSDump,
+and are under the GNU General Public License version 3 (GPL v3):
+       http://gplv3.fsf.org/
+See http://www.gnome.ru/projects/vsdump_en.html
index 4666d7765a082cedb9dc013da25876e8e5a79659..a0f10c0dbed07990474904a0589dca73d0d6b32a 100644 (file)
@@ -39,6 +39,7 @@
         <menu-item label="HWPF" href="hwpf/index.html"/>
         <menu-item label="HPSF" href="hpsf/index.html"/>
         <menu-item label="HSLF" href="hslf/index.html"/>
+        <menu-item label="HDGF" href="hdgf/index.html"/>
                <menu-item label="POI-Ruby" href="poi-ruby.html"/>
         <menu-item label="POI-Utils" href="utils/index.html"/>
         <menu-item label="Download" href="ext:download"/>
index 3783e8428fdcb7434fbef8d5d15c60d12b9c6c59..697395f8fdc9ed362a639100ef4a08c3025483ff 100644 (file)
@@ -35,7 +35,7 @@
         <person id="YK" name="Yegor Kozlov" email="yegor@apache.org"/>
     </devs>
 
-        <release version="3.0.1-FINAL" date="2007-06-15">
+        <release version="3.0.1-FINAL" date="2007-07-05">
             <action dev="POI-DEVELOPERS" type="fix">Administrative updates to the Maven POMs, and the release artificat build process</action>
             <action dev="POI-DEVELOPERS" type="fix">23951 - [PATCH] Fix for HSSF setSheetOrder and tab names</action>
             <action dev="POI-DEVELOPERS" type="fix">42524 - [PATCH] Better HSLF support for problem shape groups</action>
@@ -44,6 +44,9 @@
             <action dev="POI-DEVELOPERS" type="add">Additional HSLF support for Title and Slide Master Sheets</action>
             <action dev="POI-DEVELOPERS" type="fix">42474 - [PATCH] Improved HSLF note to slide matching, and a NPE</action>
             <action dev="POI-DEVELOPERS" type="fix">42481 - [PATCH] Tweak some HSLF exceptions, to make it clearer what you're catching</action>
+            <action dev="POI-DEVELOPERS" type="fix">42667 - [PATCH] Fix for HSLF writing of files with tables</action>
+            <action dev="POI-DEVELOPERS" type="add">Improved way of detecting HSSF cells that contain dates, isADateFormat</action>
+            <action dev="POI-DEVELOPERS" type="add">Initial, read-only support for Visio documents, as HDGF</action>
         </release>
 
         <release version="3.0-FINAL" date="2007-05-18">
diff --git a/src/documentation/content/xdocs/hdgf/book.xml b/src/documentation/content/xdocs/hdgf/book.xml
new file mode 100644 (file)
index 0000000..fb37a33
--- /dev/null
@@ -0,0 +1,34 @@
+<?xml version="1.0"?>
+<!--
+   ====================================================================
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+   ====================================================================
+-->
+<!DOCTYPE book PUBLIC "-//APACHE//DTD Cocoon Documentation Book V1.0//EN" "../dtd/book-cocoon-v10.dtd">
+
+<book software="POI Project"
+    title="HDGF"
+    copyright="@year@ POI Project">
+
+    <menu label="Apache POI">
+        <menu-item label="Top" href="../index.html"/>
+    </menu>
+
+    <menu label="HDGF">
+        <menu-item label="Overview" href="index.html"/>
+       </menu>
+       
+</book>
diff --git a/src/documentation/content/xdocs/hdgf/index.xml b/src/documentation/content/xdocs/hdgf/index.xml
new file mode 100755 (executable)
index 0000000..f14bb1e
--- /dev/null
@@ -0,0 +1,98 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+   ====================================================================
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+   ====================================================================
+-->
+<!DOCTYPE document PUBLIC "-//APACHE//DTD Documentation V1.1//EN" "../dtd/document-v11.dtd">
+
+<document>
+    <header>
+        <title>POI-HDGF - Java API To Access Microsoft Visio Format Files</title>
+        <subtitle>Overview</subtitle>
+        <authors>
+            <person name="Nick Burch" email="nick at apache dot org"/>
+        </authors>
+    </header>
+
+    <body>
+        <section>
+            <title>Overview</title>
+
+            <p>HDGF is the POI Project's pure Java implementation of the Visio file format.</p>
+            <p>Currently, HDGF provides a low-level, read-only api for 
+              accessing Visio documents. It also provides a 
+              <link href="http://svn.apache.org/repos/asf/poi/trunk/src/scratchpad/src/org/apache/poi/hdgf/extractor/">way</link>
+              to extract the textual content from a file.
+            </p>
+                       <p>At this time, there is no <em>usermodel</em> api or similar,
+                        only low level access to the streams, chunks and chunk commands.
+                        Users are advised to check the unit tests to see how everything
+                        works. They are also well advised to read the documentation
+                        supplied with 
+                        <link href="http://www.gnome.ru/projects/vsdump_en.html">vsdump</link>
+                        to get a feel for how Visio files are structured.</p>
+                       <p>To get a feel for the contents of a file, and to track down
+                        where data of interest is stored, HDGF comes with
+                        <link href="http://svn.apache.org/repos/asf/poi/trunk/src/scratchpad/src/org/apache/poi/hdgf/dev/">VSDDumper</link>
+                        to print out the contents of the file. Users should also make
+                        use of 
+                        <link href="http://www.gnome.ru/projects/vsdump_en.html">vsdump</link>
+                        to probe the structure of files.</p>
+            <note> 
+                This code currently lives in the 
+                <link href="http://svn.apache.org/viewcvs.cgi/poi/trunk/src/scratchpad/">scratchpad area</link> 
+                of the POI SVN repository. 
+                               Ensure that you have the scratchpad jar or the scratchpad 
+                build area in your
+                               classpath before experimenting with this code.
+                       </note>
+
+                       <section>
+                               <title>Steps required for write support</title>
+                               <p>Currently, HDGF is only able to read Visio files; it is
+                                not able to write them back out again. We believe the
+                                following are the steps that would need to be taken to
+                                implement it.</p>
+                               <ol>
+                                <li>Re-write the decompression support in LZW4HDGF to be
+                                 less opaque, and also under the ASL.</li>
+                                <li>Add compression support to the new LZW4HDGF.</li>
+                                <li>Have HDGF just write back the raw bytes it read in, and
+                                 have a test to ensure the file is un-changed.</li>
+                                <li>Have HDGF generate the bytes to write out from the
+                                 Stream stores, using the compressed data as appropriate,
+                                 without re-compressing. Plus test to ensure file is
+                                 un-changed.</li>
+                                <li>Have HDGF generate the bytes to write out from the
+                                 Stream stores, re-compressing any streams that were 
+                  decompressed. Plus test to ensure file is un-changed.</li>
+                                <li>Have HDGF re-generate the offsets in pointers for the
+                                 locations of the streams. Plus test to ensure file is
+                                 un-changed.</li>
+                                <li>Have HDGF re-generate the bytes for all the chunks, from
+                                 the chunk commands. Tests to ensure the chunks are 
+                                 serialized properly, and then that the file is un-changed</li>
+                                <li>Alter the data of one command, but keep it the same
+                                 length, and check Visio can open the file when written 
+                                 out.</li>
+                                <li>Alter the data of one command, to a new length, and
+                                 check that Visio can open the file when written out.</li>
+                               </ol>
+                       </section>
+        </section>
+    </body>
+</document>
index 0eb4f8cb182c5890b4d09af8e9a3fda6dd9d50b9..8ccf5c1bc43102cf565f10f984831a61ade2a217 100644 (file)
@@ -20,7 +20,7 @@
 <!DOCTYPE book PUBLIC "-//APACHE//DTD Cocoon Documentation Book V1.0//EN" "../dtd/book-cocoon-v10.dtd">
 
 <book software="POI Project"
-    title="HSSF"
+    title="HSLF"
     copyright="@year@ POI Project">
 
     <menu label="Apache POI">
index 779a279d1674a05ece6f0cda28ab042ac6f02f31..16a3885d8296eed42984a1bd7add38c6c899e9e5 100755 (executable)
             <title>Overview</title>
 
             <p>HSLF is the POI Project's pure Java implementation of the Powerpoint file format.</p>
-            <p>HSSF provides a way to read powerpoint presentations, and extract text from it.
+            <p>HSLF provides a way to read powerpoint presentations, and extract text from it.
             It also provides some (currently limited) edit capabilities.
             </p>
             <note> 
                 This code currently lives the 
-                <link href="http://svn.apache.org/viewcvs.cgi/jakarta/poi/trunk/src/scratchpad/">scratchpad area</link> 
+                <link href="http://svn.apache.org/viewcvs.cgi/poi/trunk/src/scratchpad/">scratchpad area</link> 
                 of the POI SVN repository. 
                                Ensure that you have the scratchpad jar or the scratchpad 
                 build area in your
index cc578afec543b361944241f322918dd87679edad..a4ac41209d57f60c2ca3ed04167547cbefeb1df6 100644 (file)
@@ -460,7 +460,7 @@ some of the rows or cells. It can be found at
 <code>/src/scratchpad/examples/src/org/apache/poi/hssf/eventusermodel/examples/XLS2CSVmra.java</code>,
 and may be called on the command line, or from within your own code.
 The latest version is always available from
-<link href="http://svn.apache.org/repos/asf/jakarta/poi/trunk/src/scratchpad/examples/src/org/apache/poi/hssf/eventusermodel/examples/">subversion</link>.
+<link href="http://svn.apache.org/repos/asf/poi/trunk/src/scratchpad/examples/src/org/apache/poi/hssf/eventusermodel/examples/">subversion</link>.
 </p>
 <p>
 <em>This code is currently in the scratchpad section, so you will either
index 1268facbeefb3fdffb5c170491c0b29ff564252e..15568696170a33a63852aad366f6ff4e416d39ee 100644 (file)
@@ -38,7 +38,7 @@
     to pure Java.</p>
 
   <p>HWPF is still in early development. It is in the <link
-     href="http://svn.apache.org/viewcvs.cgi/jakarta/poi/trunk/src/scratchpad/">
+     href="http://svn.apache.org/viewcvs.cgi/poi/trunk/src/scratchpad/">
      scratchpad section of the SVN.</link> You will need to ensure you
      either have a recent SVN checkout, or a recent SVN nightly build
      (including the scratchpad jar!)</p>
index 197922f07e1f8b606604cbdb2840ffddd97bbf1c..bf046258e7f65192e1e14857e28796e00ef761c1 100644 (file)
@@ -30,7 +30,7 @@
 
     <body>
                <p>HWPF is still in early development. It is in the <link
-       href="http://svn.apache.org/viewcvs.cgi/jakarta/poi/trunk/src/scratchpad/">
+       href="http://svn.apache.org/viewcvs.cgi/poi/trunk/src/scratchpad/">
                scratchpad section of the SVN.</link> You will need to ensure you
                either have a recent SVN checkout, or a recent SVN nightly build
                (including the scratchpad jar!)</p>
@@ -68,7 +68,7 @@ can then get text and other properties.
                <section><title>Further Examples</title>
                <p>For now, the best source of additional examples is in the unit 
                tests. <link
-       href="http://svn.apache.org/viewvc/jakarta/poi/trunk/src/scratchpad/testcases/org/apache/poi/hwpf/">
+       href="http://svn.apache.org/viewvc/poi/trunk/src/scratchpad/testcases/org/apache/poi/hwpf/">
                Browse the HWPF unit tests.</link>
                </p>
                </section>
index da2d5ed3a88b81c33a56ea097de53474c08db2f4..5968f8084ae38761b3cdecdb44286ede8be0fa89 100644 (file)
                <link href="http://www.apache.org/dyn/closer.cgi/poi/release/">download</link>
                the source and binaries from your
                <link href="http://www.apache.org/dyn/closer.cgi/poi/release/">local mirror</link>.</p>
+      <p>We would also like to confirm that version 3.0 of Apache POI does
+               <em>not</em> contain any viruses. Users of broken virus checkers
+               which do detect a 94 byte file, sci_cec.db, as containing one are
+               advised to contact their vendor for a fix.</p>
        </section>
 
     <section><title>Purpose</title>
          development. Jump in!</p>
         </section>
         <section><title>HSLF for PowerPoint Documents</title>
-       <p>HWSL is our port of the Microsoft PowerPoint 97(-2003) file format to pure
+       <p>HSLF is our port of the Microsoft PowerPoint 97(-2003) file format to pure
          Java. It supports read and write capabilities of some, but not yet all
       of the core records. Please see <link
            href="./hslf/index.html">the HSLF project page for more
            information</link>.</p>
         </section>
+        <section><title>HDGF for Visio Documents</title>
+       <p>HDGF is our port of the Microsoft Visio 97(-2003) file format to pure
+         Java. It currently only supports reading at a very low level, and
+      simple text extraction. Please see <link
+           href="./hdgf/index.html">the HDGF project page for more
+           information</link>.</p>
+        </section>
         <section><title>HPSF for Document Properties</title>
        <p>HPSF is our port of the OLE 2 property set format to pure
          Java. Property sets are mostly use to store a document's properties
index ef5c5aaeb449df2b28765669df16937366082d51..b236f22880de54cc83bee076d634e4006e007773 100644 (file)
@@ -32,7 +32,7 @@
     </developers>
 
     <changes>
-        <release version="3.0.1-FINAL" date="2007-06-15">
+        <release version="3.0.1-FINAL" date="2007-07-05">
             <action dev="POI-DEVELOPERS" type="fix">Administrative updates to the Maven POMs, and the release artificat build process</action>
             <action dev="POI-DEVELOPERS" type="fix">23951 - [PATCH] Fix for HSSF setSheetOrder and tab names</action>
             <action dev="POI-DEVELOPERS" type="fix">42524 - [PATCH] Better HSLF support for problem shape groups</action>
@@ -41,6 +41,9 @@
             <action dev="POI-DEVELOPERS" type="add">Additional HSLF support for Title and Slide Master Sheets</action>
             <action dev="POI-DEVELOPERS" type="fix">42474 - [PATCH] Improved HSLF note to slide matching, and a NPE</action>
             <action dev="POI-DEVELOPERS" type="fix">42481 - [PATCH] Tweak some HSLF exceptions, to make it clearer what you're catching</action>
+            <action dev="POI-DEVELOPERS" type="fix">42667 - [PATCH] Fix for HSLF writing of files with tables</action>
+            <action dev="POI-DEVELOPERS" type="add">Improved way of detecting HSSF cells that contain dates, isADateFormat</action>
+            <action dev="POI-DEVELOPERS" type="add">Initial, read-only support for Visio documents, as HDGF</action>
         </release>
 
         <release version="3.0-FINAL" date="2007-05-18">
index 54c37b3e8304a22ffbdc80418dcca4b5a34de66a..5928927c4d6f029f4ed80e44ff24b940850e42b9 100644 (file)
@@ -20,6 +20,9 @@ import java.util.ArrayList;
 
 import org.apache.poi.hdgf.chunks.ChunkFactory.CommandDefinition;
 import org.apache.poi.util.LittleEndian;
+import org.apache.poi.util.POILogFactory;
+import org.apache.poi.util.POILogger;
+import org.apache.poi.util.StringUtil;
 
 /**
  * Base of all chunks, which hold data, flags etc
@@ -44,6 +47,9 @@ public class Chunk {
        /** The name of the chunk, as found from the commandDefinitions */
        private String name;
        
+       /** For logging warnings about the structure of the file */
+       private POILogger logger = POILogFactory.getLogger(Chunk.class);
+       
        public Chunk(ChunkHeader header, ChunkTrailer trailer, ChunkSeparator separator, byte[] contents) {
                this.header = header;
                this.trailer = trailer;
@@ -148,7 +154,9 @@ public class Chunk {
                        
                        // Check we seem to have enough data
                        if(offset >= contents.length) {
-                               System.err.println("Command offset " + offset + " past end of data at " + contents.length);
+                               logger.log(POILogger.WARN, 
+                                               "Command offset " + offset + " past end of data at " + contents.length
+                               );
                                continue;
                        }
                
@@ -167,9 +175,27 @@ public class Chunk {
                                                LittleEndian.getDouble(contents, offset)
                                );
                                break;
+                       case 12:
+                               // A Little Endian String
+                               // Starts 8 bytes into the data segment
+                               // Ends at end of data, or 00 00
+                               int startsAt = 8;
+                               int endsAt = startsAt;
+                               for(int j=startsAt; j<contents.length-1 && endsAt == startsAt; j++) {
+                                       if(contents[j] == 0 && contents[j+1] == 0) {
+                                               endsAt = j;
+                                       }
+                               }
+                               if(endsAt == startsAt) {
+                                       endsAt = contents.length;
+                               }
+                               
+                               int strLen = (endsAt-startsAt) / 2;
+                               command.value = StringUtil.getFromUnicodeLE(contents, startsAt, strLen);
+                               break;
                        case 25:
                                command.value = new Short(
-                                               LittleEndian.getShort(contents, offset)
+                                       LittleEndian.getShort(contents, offset)
                                );
                                break;
                        case 26:
@@ -188,7 +214,8 @@ public class Chunk {
                                break;
                                
                        default:
-                               //System.err.println("Warning - Command of type " + type + " not processed!");
+                               logger.log(POILogger.INFO, 
+                                               "Command of type " + type + " not processed!");
                        }
                        
                        // Add to the array
index efac0d357458b684c86e8dc886c23ccd5c2cd164..fe0fc91a4dc02f19ffd44ed182a2586290d01000 100644 (file)
@@ -24,6 +24,9 @@ import java.util.ArrayList;
 import java.util.Hashtable;
 import java.util.StringTokenizer;
 
+import org.apache.poi.util.POILogFactory;
+import org.apache.poi.util.POILogger;
+
 /**
  * Factor class to create the appropriate chunks, which
  *  needs the version of the file to process the chunk header
@@ -42,6 +45,9 @@ public class ChunkFactory {
        private static String chunkTableName = 
                "/org/apache/poi/hdgf/chunks/chunks_parse_cmds.tbl";
        
+       /** For logging problems we spot with the file */
+       private POILogger logger = POILogFactory.getLogger(ChunkFactory.class);
+       
        public ChunkFactory(int version) throws IOException {
                this.version = version;
                
@@ -107,7 +113,8 @@ public class ChunkFactory {
                // Check we have enough data, and tweak the header size
                //  as required
                if(endOfDataPos > data.length) {
-                       System.err.println("Header called for " + header.getLength() +" bytes, but that would take us passed the end of the data!");
+                       logger.log(POILogger.WARN,
+                               "Header called for " + header.getLength() +" bytes, but that would take us passed the end of the data!");
                        
                        endOfDataPos = data.length;
                        header.length = data.length - offset - header.getSizeInBytes();
index 51eca5649c7be7429fe95b27015067a8b6cf563a..c77a249204bbbaa2d4c6445e3220a98440dee64f 100644 (file)
@@ -24,6 +24,10 @@ public class ChunkHeaderV11 extends ChunkHeaderV6 {
         * Does the chunk have a separator?
         */
        public boolean hasSeparator() {
+               // For some reason, there are two types that don't have a 
+               //  separator despite the flags that indicate they do
+               if(type == 0x1f || type == 0xc9) { return false; }
+               
                // If there's a trailer, there's a separator
                if(hasTrailer()) { return true; }
 
index 7098f17ea68f12e5210d199f6f53f5a1412916d1..5ce4097446915130e20ad6c57dd07b75f43ff4f5 100644 (file)
@@ -27,4 +27,8 @@ public class ChunkSeparator {
                separatorData = new byte[4];
                System.arraycopy(data, offset, separatorData, 0, 4);
        }
+       
+       public String toString() {
+               return "<ChunkSeparator of length " + separatorData.length + ">";
+       }
 }
index a610b49b14a82a50400db9b880a309df1c93d4b7..a590732466b9b58ddbd2a354ff0acf5263f27c85 100644 (file)
@@ -26,4 +26,8 @@ public class ChunkTrailer {
                trailerData = new byte[8];
                System.arraycopy(data, offset, trailerData, 0, 8);
        }
+       
+       public String toString() {
+               return "<ChunkTrailer of length " + trailerData.length + ">";
+       }
 }
index 3c20e4f3ffa64e5f44346dca348a0c7219aca632..614b9259a0cdada28c326f5b2f8fb2cd9d9e2295 100644 (file)
@@ -70,6 +70,11 @@ public class VSDDumper {
                                " - " + Integer.toHexString(ptr.getFormat()));
                System.out.println(ind + "  Length is\t" + ptr.getLength() +
                                " - " + Integer.toHexString(ptr.getLength()));
+               if(ptr.destinationCompressed()) {
+                       int decompLen = stream._getContentsLength();
+                       System.out.println(ind + "  DC.Length is\t" + decompLen +
+                                       " - " + Integer.toHexString(decompLen));
+               }
                System.out.println(ind + "  Compressed is\t" + ptr.destinationCompressed());
                System.out.println(ind + "  Stream is\t" + stream.getClass().getName());
                
@@ -100,6 +105,9 @@ public class VSDDumper {
                        for(int i=0; i<cs.getChunks().length; i++) {
                                Chunk chunk = cs.getChunks()[i];
                                System.out.println(ind2 + "" + chunk.getName());
+                               System.out.println(ind2 + "  Length is " + chunk._getContents().length + " (" + Integer.toHexString(chunk._getContents().length) + ")");
+                               System.out.println(ind2 + "  OD Size is " + chunk.getOnDiskSize() + " (" + Integer.toHexString(chunk.getOnDiskSize()) + ")");
+                               System.out.println(ind2 + "  T / S is " + chunk.getTrailer() + " / " + chunk.getSeparator());
                                System.out.println(ind2 + "  Holds " + chunk.getCommands().length + " commands");
                                for(int j=0; j<chunk.getCommands().length; j++) {
                                        Command command = chunk.getCommands()[j];
diff --git a/src/scratchpad/src/org/apache/poi/hdgf/extractor/VisioTextExtractor.java b/src/scratchpad/src/org/apache/poi/hdgf/extractor/VisioTextExtractor.java
new file mode 100644 (file)
index 0000000..b2c4ee3
--- /dev/null
@@ -0,0 +1,114 @@
+/* ====================================================================
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+==================================================================== */
+package org.apache.poi.hdgf.extractor;
+
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.ArrayList;
+
+import org.apache.poi.hdgf.HDGFDiagram;
+import org.apache.poi.hdgf.chunks.Chunk.Command;
+import org.apache.poi.hdgf.streams.ChunkStream;
+import org.apache.poi.hdgf.streams.PointerContainingStream;
+import org.apache.poi.hdgf.streams.Stream;
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;
+
+/**
+ * Class to find all the text in a Visio file, and return it.
+ * Can operate on the command line (outputs to stdout), or
+ *  can return the text for you (eg for use with Lucene).
+ */
+public class VisioTextExtractor {
+       private HDGFDiagram hdgf;
+       private POIFSFileSystem fs;
+
+       public VisioTextExtractor(HDGFDiagram hdgf) {
+               this.hdgf = hdgf;
+       }
+       public VisioTextExtractor(POIFSFileSystem fs) throws IOException {
+               this(new HDGFDiagram(fs));
+               this.fs = fs;
+       }
+       public VisioTextExtractor(InputStream inp) throws IOException {
+               this(new POIFSFileSystem(inp));
+       }
+       
+       /**
+        * Locates all the text entries in the file, and returns their
+        *  contents.
+        */
+       public String[] getAllText() {
+               ArrayList text = new ArrayList();
+               for(int i=0; i<hdgf.getTopLevelStreams().length; i++) {
+                       findText(hdgf.getTopLevelStreams()[i], text);
+               }
+               return (String[])text.toArray( new String[text.size()] );
+       }
+       private void findText(Stream stream, ArrayList text) {
+               if(stream instanceof PointerContainingStream) {
+                       PointerContainingStream ps = (PointerContainingStream)stream;
+                       for(int i=0; i<ps.getPointedToStreams().length; i++) {
+                               findText(ps.getPointedToStreams()[i], text);
+                       }
+               }
+               if(stream instanceof ChunkStream) {
+                       ChunkStream cs = (ChunkStream)stream;
+                       for(int i=0; i<cs.getChunks().length; i++) {
+                               if(cs.getChunks()[i] != null && 
+                                               cs.getChunks()[i].getName() != null &&
+                                               cs.getChunks()[i].getName().equals("Text")) {
+                                       // First command
+                                       Command cmd = cs.getChunks()[i].getCommands()[0];
+                                       if(cmd != null && cmd.getValue() != null) {
+                                               text.add( cmd.getValue().toString() );
+                                       }
+                               }
+                       }
+               }
+       }
+       
+       /**
+        * Returns the textual contents of the file.
+        */
+       public String getText() {
+               StringBuffer text = new StringBuffer();
+               String[] allText = getAllText();
+               for(int i=0; i<allText.length; i++) {
+                       text.append(allText[i]);
+                       if(!allText[i].endsWith("\r") &&
+                                       !allText[i].endsWith("\n")) {
+                               text.append("\n");
+                       }
+               }
+               return text.toString();
+       }
+       
+       public static void main(String[] args) throws Exception {
+               if(args.length == 0) {
+                       System.err.println("Use:");
+                       System.err.println("   VisioTextExtractor <file.vsd>");
+                       System.exit(1);
+               }
+               
+               VisioTextExtractor extractor = 
+                       new VisioTextExtractor(new FileInputStream(args[0]));
+               
+               // Print not PrintLn as already has \n added to it
+               System.out.print(extractor.getText());
+       }
+}
index 75b6beefd1a04ea732c9b4c960ecb4af1bb86617..a59fe43ff924900667b3619b91ea107ac9ea2bd4 100644 (file)
@@ -43,6 +43,11 @@ public class ChunkStream extends Stream {
        public void findChunks() {
                ArrayList chunksA = new ArrayList();
                
+               if(getPointer().getOffset() == 0x64b3) {
+                       int i = 0;
+                       i++;
+               }
+               
                int pos = 0;
                byte[] contents = getStore().getContents();
                while(pos < contents.length) {
index 35aa7e5291a8ec8d860377347be1ff0a57936661..163fa83d9a19104259506a56ea349316c2154a9f 100644 (file)
@@ -83,7 +83,7 @@ public abstract class Stream {
                        return new ChunkStream(pointer, store, chunkFactory); 
                }
                else if(pointer.destinationHasStrings()) {
-                       return new StringsStream(pointer, store);
+                       return new StringsStream(pointer, store, chunkFactory);
                }
                
                // Give up and return a generic one
index 2688b156e989037e28546cd1e1ca2c2af8ba3664..b23ff921493b84a5ab9968fa4b4873bdc78b5d94 100644 (file)
@@ -16,13 +16,16 @@ limitations under the License.
 ==================================================================== */
 package org.apache.poi.hdgf.streams;
 
+import org.apache.poi.hdgf.chunks.ChunkFactory;
 import org.apache.poi.hdgf.pointers.Pointer;
 
 /**
- * A Stream which holds Strings
+ * A Stream which holds Strings. This is just another kind
+ *  of ChunkStream, it seems
  */
 public class StringsStream extends Stream {
-       protected StringsStream(Pointer pointer, StreamStore store) {
+       protected StringsStream(Pointer pointer, StreamStore store, ChunkFactory chunkFactory) {
                super(pointer, store);
+//             super(pointer, store, chunkFactory);  (disabled: chunkFactory is accepted above but is not passed to the superclass)
        }
 }
diff --git a/src/scratchpad/testcases/org/apache/poi/hdgf/extractor/TestVisioExtractor.java b/src/scratchpad/testcases/org/apache/poi/hdgf/extractor/TestVisioExtractor.java
new file mode 100644 (file)
index 0000000..a6541e9
--- /dev/null
@@ -0,0 +1,107 @@
+/* ====================================================================
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+==================================================================== */
+package org.apache.poi.hdgf.extractor;
+
+import java.io.ByteArrayOutputStream;
+import java.io.FileInputStream;
+import java.io.PrintStream;
+
+import junit.framework.TestCase;
+
+import org.apache.poi.hdgf.HDGFDiagram;
+import org.apache.poi.hdgf.chunks.Chunk;
+import org.apache.poi.hdgf.chunks.ChunkFactory;
+import org.apache.poi.hdgf.pointers.Pointer;
+import org.apache.poi.hdgf.pointers.PointerFactory;
+import org.apache.poi.hssf.record.formula.eval.StringOperationEval;
+import org.apache.poi.poifs.filesystem.DocumentEntry;
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;
+
+/**
+ * Tests for {@link VisioTextExtractor}, run against the sample file
+ *  Test_Visio-Some_Random_Text.vsd
+ */
+public class TestVisioExtractor extends TestCase {
+       private String filename;
+       
+       /**
+        * Builds the sample file's path from the HDGF.testdata.path
+        *  system property
+        */
+       protected void setUp() throws Exception {
+               String dirname = System.getProperty("HDGF.testdata.path");
+               filename = dirname + "/Test_Visio-Some_Random_Text.vsd";
+       }
+       
+       /**
+        * Test the 3 different ways of creating one
+        */
+       public void testCreation() throws Exception {
+               VisioTextExtractor extractor;
+               
+               // From a plain InputStream
+               extractor = new VisioTextExtractor(new FileInputStream(filename));
+               assertNotNull(extractor);
+               assertNotNull(extractor.getAllText());
+               assertEquals(3, extractor.getAllText().length);
+               
+               // From a POIFSFileSystem
+               extractor = new VisioTextExtractor(
+                               new POIFSFileSystem(
+                                               new FileInputStream(filename)
+                               )
+               );
+               assertNotNull(extractor);
+               assertNotNull(extractor.getAllText());
+               assertEquals(3, extractor.getAllText().length);
+               
+               // From a HDGFDiagram
+               extractor = new VisioTextExtractor(
+                       new HDGFDiagram(
+                               new POIFSFileSystem(
+                                               new FileInputStream(filename)
+                               )
+                       )
+               );
+               assertNotNull(extractor);
+               assertNotNull(extractor.getAllText());
+               assertEquals(3, extractor.getAllText().length);
+       }
+       
+       /**
+        * Checks both the per-piece array and the joined-up String
+        *  returned by the extractor
+        */
+       public void testExtraction() throws Exception {
+               VisioTextExtractor extractor =
+                       new VisioTextExtractor(new FileInputStream(filename));
+               
+               // Check the array fetch
+               String[] text = extractor.getAllText();
+               assertNotNull(text);
+               assertEquals(3, text.length);
+               
+               assertEquals("Test View\n", text[0]);
+               assertEquals("I am a test view\n", text[1]);
+               assertEquals("Some random text, on a page\n", text[2]);
+               
+               // And the all-in fetch
+               String textS = extractor.getText();
+               assertEquals("Test View\nI am a test view\nSome random text, on a page\n", textS);
+       }
+       
+       /**
+        * Runs the command line entry point with System.out redirected
+        *  into a buffer, and checks what was printed
+        */
+       public void testMain() throws Exception {
+               PrintStream oldOut = System.out;
+               ByteArrayOutputStream baos = new ByteArrayOutputStream();
+               PrintStream capture = new PrintStream(baos);
+               System.setOut(capture);
+               
+               try {
+                       VisioTextExtractor.main(new String[] {filename});
+               } finally {
+                       // Put things back, even if main() threw, so that later
+                       //  tests don't keep writing into our capture buffer
+                       System.setOut(oldOut);
+               }
+               
+               // Check
+               capture.flush();
+               String text = baos.toString();
+               assertEquals("Test View\nI am a test view\nSome random text, on a page\n", text);
+       }
+}
index c2d03f0c8992afc9e9d6a85ac2850a1de46c137c..5ea21d1a1c016e4e6501d5ed9a8b2cbbedac6a7c 100644 (file)
@@ -18,6 +18,7 @@ package org.apache.poi.hdgf.streams;
 
 import java.io.FileInputStream;
 
+import org.apache.poi.hdgf.chunks.Chunk;
 import org.apache.poi.hdgf.chunks.ChunkFactory;
 import org.apache.poi.hdgf.pointers.Pointer;
 import org.apache.poi.hdgf.pointers.PointerFactory;
@@ -202,4 +203,63 @@ public class TestStreamComplex extends StreamTest {
                assertTrue(s8451.getPointedToStreams()[0] instanceof StringsStream);
                assertTrue(s8451.getPointedToStreams()[1] instanceof StringsStream);
        }
+       
+       /**
+        * Walks from the trailer down to the ChunkStream at 0x64b3, and
+        *  checks that its "Text" chunks hold the expected strings
+        */
+       public void testChunkWithText() throws Exception {
+               // Parent ChunkStream is at 0x7194
+               // This is one of the last children of the trailer
+               Pointer trailerPtr = ptrFactory.createPointer(contents, trailerPointerAt);
+               TrailerStream ts = (TrailerStream)
+                       Stream.createStream(trailerPtr, contents, chunkFactory, ptrFactory);
+               
+               ts.findChildren(contents);
+               
+               assertNotNull(ts.getChildPointers());
+               assertNotNull(ts.getPointedToStreams());
+               assertEquals(20, ts.getChildPointers().length);
+               assertEquals(20, ts.getPointedToStreams().length);
+               
+               assertEquals(0x7194, ts.getChildPointers()[13].getOffset());
+               assertEquals(0x7194, ts.getPointedToStreams()[13].getPointer().getOffset());
+               
+               PointerContainingStream ps7194 = (PointerContainingStream)
+                       ts.getPointedToStreams()[13];
+               
+               // First child is at 0x64b3
+               assertEquals(0x64b3, ps7194.getChildPointers()[0].getOffset());
+               assertEquals(0x64b3, ps7194.getPointedToStreams()[0].getPointer().getOffset());
+               
+               ChunkStream cs = (ChunkStream)ps7194.getPointedToStreams()[0];
+               
+               // Should be 26bc bytes un-compressed
+               assertEquals(0x26bc, cs.getStore().getContents().length);
+               // And should have lots of children
+               assertEquals(131, cs.getChunks().length);
+               
+               // One of which is Text
+               // Null-safe, so a chunk without a name gives a clean assertion
+               //  failure below rather than a NullPointerException here
+               boolean hasText = false;
+               for(int i=0; i<cs.getChunks().length; i++) {
+                       Chunk chunk = cs.getChunks()[i];
+                       if(chunk != null && "Text".equals(chunk.getName())) {
+                               hasText = true;
+                               break;
+                       }
+               }
+               assertTrue(hasText);
+               
+               // Which is the chunk at index 72
+               Chunk text = cs.getChunks()[72];
+               assertEquals("Text", text.getName());
+               
+               // Which contains our text
+               assertEquals(1, text.getCommands().length);
+               assertEquals("Test View\n", text.getCommands()[0].getValue());
+               
+               
+               // Almost at the end is some more text
+               text = cs.getChunks()[128];
+               assertEquals("Text", text.getName());
+               
+               assertEquals(1, text.getCommands().length);
+               assertEquals("Some random text, on a page\n", text.getCommands()[0].getValue());
+       }
 }