source.dussan.org Git - poi.git/commitdiff
add TOC support
author: Sergey Vladimirov <sergey@apache.org>
Wed, 26 Oct 2011 11:52:59 +0000 (11:52 +0000)
committer: Sergey Vladimirov <sergey@apache.org>
Wed, 26 Oct 2011 11:52:59 +0000 (11:52 +0000)
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1189145 13f79535-47bb-0310-9956-ffa450edef68

src/scratchpad/src/org/apache/poi/hwpf/converter/AbstractWordConverter.java

index 5d35b202467f0b8b2c214383a04a82d394908eeb..af4c0be1f79d968e43fb045d210d85ec2297c145 100644 (file)
@@ -78,6 +78,13 @@ public abstract class AbstractWordConverter
             this.structure = field;
         }
 
+        Structure( int start, int end )
+        {
+            this.start = start;
+            this.end = end;
+            this.structure = null;
+        }
+
         public int compareTo( Structure o )
         {
             return start < o.start ? -1 : start == o.start ? 0 : 1;
@@ -102,6 +109,15 @@ public abstract class AbstractWordConverter
     private static final POILogger logger = POILogFactory
             .getLogger( AbstractWordConverter.class );
 
+    private static final Pattern PATTERN_HYPERLINK_EXTERNAL = Pattern
+            .compile( "^[ \\t\\r\\n]*HYPERLINK \"(.*)\".*$" );
+
+    private static final Pattern PATTERN_HYPERLINK_LOCAL = Pattern
+            .compile( "^[ \\t\\r\\n]*HYPERLINK \\\\l \"(.*)\"[ ](.*)$" );
+
+    private static final Pattern PATTERN_PAGEREF = Pattern
+            .compile( "^[ \\t\\r\\n]*PAGEREF ([^ ]*)[ \\t\\r\\n]*\\\\h.*$" );
+
     private static final byte SPECCHAR_AUTONUMBERED_FOOTNOTE_REFERENCE = 2;
 
     private static final byte SPECCHAR_DRAWN_OBJECT = 8;
@@ -291,17 +307,38 @@ public abstract class AbstractWordConverter
             }
 
             // TODO: dead fields?
+            int skipUntil = -1;
             for ( int c = 0; c < range.numCharacterRuns(); c++ )
             {
                 CharacterRun characterRun = range.getCharacterRun( c );
                 if ( characterRun == null )
                     throw new AssertionError();
+                if ( characterRun.getStartOffset() < skipUntil )
+                    continue;
+                String text = characterRun.text();
+                if ( text == null || text.length() == 0
+                        || text.charAt( 0 ) != FIELD_BEGIN_MARK )
+                    continue;
+
                 Field aliveField = ( (HWPFDocument) wordDocument ).getFields()
                         .getFieldByStartOffset( FieldsDocumentPart.MAIN,
                                 characterRun.getStartOffset() );
                 if ( aliveField != null )
                 {
                     addToStructures( structures, new Structure( aliveField ) );
+                    skipUntil = aliveField.getFieldEndOffset() + 1;
+                }
+                else
+                {
+                    int[] separatorEnd = tryDeadField_lookupFieldSeparatorEnd(
+                            wordDocument, range, c );
+                    if ( separatorEnd != null )
+                    {
+                        addToStructures( structures, new Structure(
+                                characterRun.getStartOffset(),
+                                separatorEnd[1] + 1 ) );
+                        c = separatorEnd[1];
+                    }
                 }
             }
         }
@@ -562,6 +599,39 @@ public abstract class AbstractWordConverter
             Element currentBlock, Range range, int currentTableLevel,
             int beginMark, int separatorMark, int endMark )
     {
+        if ( beginMark + 1 < separatorMark && separatorMark + 1 < endMark )
+        {
+            Range formulaRange = new Range( range.getCharacterRun(
+                    beginMark + 1 ).getStartOffset(), range.getCharacterRun(
+                    separatorMark - 1 ).getEndOffset(), range )
+            {
+                @Override
+                public String toString()
+                {
+                    return "Dead field formula subrange: " + super.toString();
+                }
+            };
+            Range valueRange = new Range( range.getCharacterRun(
+                    separatorMark + 1 ).getStartOffset(), range
+                    .getCharacterRun( endMark - 1 ).getEndOffset(), range )
+            {
+                @Override
+                public String toString()
+                {
+                    return "Dead field value subrange: " + super.toString();
+                }
+            };
+            String formula = formulaRange.text();
+            final Matcher matcher = PATTERN_HYPERLINK_LOCAL.matcher( formula );
+            if ( matcher.matches() )
+            {
+                String localref = matcher.group( 1 );
+                processPageref( wordDocument, currentBlock, valueRange,
+                        currentTableLevel, localref );
+                return;
+            }
+        }
+
         StringBuilder debug = new StringBuilder( "Unsupported field type: \n" );
         for ( int i = beginMark; i <= endMark; i++ )
         {
@@ -706,9 +776,7 @@ public abstract class AbstractWordConverter
             if ( firstSubrange != null )
             {
                 String formula = firstSubrange.text();
-                Pattern pagerefPattern = Pattern
-                        .compile( "[ \\t\\r\\n]*PAGEREF ([^ ]*)[ \\t\\r\\n]*\\\\h[ \\t\\r\\n]*" );
-                Matcher matcher = pagerefPattern.matcher( formula );
+                Matcher matcher = PATTERN_PAGEREF.matcher( formula );
                 if ( matcher.find() )
                 {
                     String pageref = matcher.group( 1 );
@@ -756,10 +824,8 @@ public abstract class AbstractWordConverter
             if ( firstSubrange != null )
             {
                 String formula = firstSubrange.text();
-                Pattern hyperlinkPattern = Pattern
-                        .compile( "[ \\t\\r\\n]*HYPERLINK \"(.*)\"[ \\t\\r\\n]*" );
-                Matcher matcher = hyperlinkPattern.matcher( formula );
-                if ( matcher.find() )
+                Matcher matcher = PATTERN_HYPERLINK_EXTERNAL.matcher( formula );
+                if ( matcher.matches() )
                 {
                     String hyperlink = matcher.group( 1 );
                     processHyperlink( wordDocument, currentBlock,
@@ -767,6 +833,30 @@ public abstract class AbstractWordConverter
                             currentTableLevel, hyperlink );
                     return;
                 }
+                matcher.usePattern( PATTERN_HYPERLINK_LOCAL );
+                if ( matcher.matches() )
+                {
+                    String hyperlink = matcher.group( 1 );
+                    Range textRange = null;
+                    String text = matcher.group( 2 );
+                    if ( AbstractWordUtils.isNotEmpty( text ) )
+                    {
+                        textRange = new Range( firstSubrange.getStartOffset()
+                                + matcher.start( 2 ),
+                                firstSubrange.getStartOffset()
+                                        + matcher.end( 2 ), firstSubrange )
+                        {
+                            @Override
+                            public String toString()
+                            {
+                                return "Local hyperlink text";
+                            }
+                        };
+                    }
+                    processPageref( wordDocument, currentBlock, textRange,
+                            currentTableLevel, hyperlink );
+                    return;
+                }
             }
             break;
         }
@@ -817,13 +907,13 @@ public abstract class AbstractWordConverter
 
     }
 
+    protected abstract void processImage( Element currentBlock,
+            boolean inlined, Picture picture, String url );
+
     @Internal
     protected abstract void processImageWithoutPicturesManager(
             Element currentBlock, boolean inlined, Picture picture );
 
-    protected abstract void processImage( Element currentBlock,
-            boolean inlined, Picture picture, String url );
-
     protected abstract void processLineBreak( Element block,
             CharacterRun characterRun );
 
@@ -1021,6 +1111,20 @@ public abstract class AbstractWordConverter
 
     protected int tryDeadField( HWPFDocumentCore wordDocument, Range range,
             int currentTableLevel, int beginMark, Element currentBlock )
+    {
+        int[] separatorEnd = tryDeadField_lookupFieldSeparatorEnd(
+                wordDocument, range, beginMark );
+        if ( separatorEnd == null )
+            return beginMark;
+
+        processDeadField( wordDocument, currentBlock, range, currentTableLevel,
+                beginMark, separatorEnd[0], separatorEnd[1] );
+
+        return separatorEnd[1];
+    }
+
+    private int[] tryDeadField_lookupFieldSeparatorEnd(
+            HWPFDocumentCore wordDocument, Range range, int beginMark )
     {
         int separatorMark = -1;
         int endMark = -1;
@@ -1032,28 +1136,24 @@ public abstract class AbstractWordConverter
             if ( text.getBytes().length == 0 )
                 continue;
 
-            if ( text.getBytes()[0] == FIELD_BEGIN_MARK )
+            final byte firstByte = text.getBytes()[0];
+            if ( firstByte == FIELD_BEGIN_MARK )
             {
-                // nested?
-                Field possibleField = processDeadField( wordDocument, range,
-                        currentTableLevel, characterRun.getStartOffset(),
-                        currentBlock );
-                if ( possibleField != null )
-                {
-                    c = possibleField.getFieldEndOffset();
-                }
-                else
+                int[] nested = tryDeadField_lookupFieldSeparatorEnd(
+                        wordDocument, range, c );
+                if ( nested != null )
                 {
-                    continue;
+                    c = nested[1];
                 }
+                continue;
             }
 
-            if ( text.getBytes()[0] == FIELD_SEPARATOR_MARK )
+            if ( firstByte == FIELD_SEPARATOR_MARK )
             {
                 if ( separatorMark != -1 )
                 {
-                    // double;
-                    return beginMark;
+                    // double; incorrect format
+                    return null;
                 }
 
                 separatorMark = c;
@@ -1065,22 +1165,18 @@ public abstract class AbstractWordConverter
                 if ( endMark != -1 )
                 {
                     // double;
-                    return beginMark;
+                    return null;
                 }
 
                 endMark = c;
                 break;
             }
-
         }
 
         if ( separatorMark == -1 || endMark == -1 )
-            return beginMark;
-
-        processDeadField( wordDocument, currentBlock, range, currentTableLevel,
-                beginMark, separatorMark, endMark );
+            return null;
 
-        return endMark;
+        return new int[] { separatorMark, endMark };
     }
 
 }