]> source.dussan.org Git - poi.git/commitdiff
Bugzilla 51351: Word to XSL-FO converter
author Yegor Kozlov <yegor@apache.org>
Tue, 14 Jun 2011 08:53:00 +0000 (08:53 +0000)
committer Yegor Kozlov <yegor@apache.org>
Tue, 14 Jun 2011 08:53:00 +0000 (08:53 +0000)
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1135414 13f79535-47bb-0310-9956-ffa450edef68

src/documentation/content/xdocs/status.xml
src/scratchpad/src/org/apache/poi/hwpf/extractor/NumberFormatter.java [new file with mode: 0644]
src/scratchpad/src/org/apache/poi/hwpf/extractor/WordToFoExtractor.java [new file with mode: 0644]
src/scratchpad/src/org/apache/poi/hwpf/extractor/WordToFoUtils.java [new file with mode: 0644]

index afff84bf6022c6a98ee702cd0c2d4754643ea3c9..c2b46ea71a17cb92191a98c692788adf33a95f7a 100644 (file)
@@ -34,6 +34,7 @@
 
     <changes>
         <release version="3.8-beta4" date="2011-??-??">
+           <action dev="poi-developers" type="add">51351 - Word to XSL-FO converter</action>
            <action dev="poi-developers" type="add">50458 - Fixed missing shapeId in XSSF drawings </action>
            <action dev="poi-developers" type="add">51339 - Fixed arithmetic rounding in formula evaluation </action>
            <action dev="poi-developers" type="add">51356 - Support autoSizeColumn in SXSSF</action>
diff --git a/src/scratchpad/src/org/apache/poi/hwpf/extractor/NumberFormatter.java b/src/scratchpad/src/org/apache/poi/hwpf/extractor/NumberFormatter.java
new file mode 100644 (file)
index 0000000..d4a2cc7
--- /dev/null
@@ -0,0 +1,62 @@
+/*
+ *  ====================================================================
+ *    Licensed to the Apache Software Foundation (ASF) under one or more
+ *    contributor license agreements.  See the NOTICE file distributed with
+ *    this work for additional information regarding copyright ownership.
+ *    The ASF licenses this file to You under the Apache License, Version 2.0
+ *    (the "License"); you may not use this file except in compliance with
+ *    the License.  You may obtain a copy of the License at
+ *
+ *        http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *    Unless required by applicable law or agreed to in writing, software
+ *    distributed under the License is distributed on an "AS IS" BASIS,
+ *    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *    See the License for the specific language governing permissions and
+ *    limitations under the License.
+ * ====================================================================
+ */
+
+package org.apache.poi.hwpf.extractor;
+
+/**
+ * Renders an integer as a label in one of several numbering styles: Arabic
+ * digits, ordinal, upper/lower Roman numerals, or upper/lower Latin letters.
+ * The style is selected by the integer codes declared as <code>T_*</code>
+ * constants in this class.
+ *
+ * @author Ryan Ackley
+ */
+public final class NumberFormatter {
+
+    /** Lower-case Latin alphabet; index 0 is the label for number 1. */
+    private static final String[] C_LETTERS = new String[] { "a", "b", "c", "d", "e", "f", "g",
+            "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x",
+            "y", "z" };
+
+    /** Roman symbol values in descending order; parallel to {@link #C_ROMAN_SYMBOLS}. */
+    private static final int[] C_ROMAN_VALUES = new int[] { 1000, 900, 500, 400, 100, 90, 50,
+            40, 10, 9, 5, 4, 1 };
+
+    /** Lower-case Roman symbols; parallel to {@link #C_ROMAN_VALUES}. */
+    private static final String[] C_ROMAN_SYMBOLS = new String[] { "m", "cm", "d", "cd", "c",
+            "xc", "l", "xl", "x", "ix", "v", "iv", "i" };
+
+    /** Largest value expressible with the standard Roman symbols used here. */
+    private static final int MAX_ROMAN = 3999;
+
+    private final static int T_ARABIC = 0;
+    private final static int T_LOWER_LETTER = 4;
+    private final static int T_LOWER_ROMAN = 2;
+    private final static int T_ORDINAL = 5;
+    private final static int T_UPPER_LETTER = 3;
+    private final static int T_UPPER_ROMAN = 1;
+
+    /**
+     * Formats a number in the requested style.
+     *
+     * @param num   the number to format
+     * @param style one of the <code>T_*</code> style codes; unknown codes are
+     *              treated like Arabic
+     * @return the formatted label; when <code>num</code> cannot be expressed
+     *         in the requested style (below 1, above 26 for letters, above
+     *         3999 for Roman) the plain decimal representation is returned
+     *         instead of failing
+     */
+    public static String getNumber(int num, int style) {
+        switch (style) {
+        case T_UPPER_ROMAN:
+            // Fixed locale: the default locale may map "i" to a dotted capital (Turkish).
+            return toRoman(num).toUpperCase(java.util.Locale.ENGLISH);
+        case T_LOWER_ROMAN:
+            return toRoman(num);
+        case T_UPPER_LETTER:
+            return toLetter(num).toUpperCase(java.util.Locale.ENGLISH);
+        case T_LOWER_LETTER:
+            return toLetter(num);
+        case T_ARABIC:
+        case T_ORDINAL:
+        default:
+            return String.valueOf(num);
+        }
+    }
+
+    /** Returns the lower-case letter for 1..26, or the decimal form otherwise. */
+    private static String toLetter(int num) {
+        if (num < 1 || num > C_LETTERS.length) {
+            return String.valueOf(num);
+        }
+        return C_LETTERS[num - 1];
+    }
+
+    /** Returns the lower-case Roman numeral for 1..3999, or the decimal form otherwise. */
+    private static String toRoman(int num) {
+        if (num < 1 || num > MAX_ROMAN) {
+            return String.valueOf(num);
+        }
+        StringBuilder roman = new StringBuilder();
+        int remaining = num;
+        for (int i = 0; i < C_ROMAN_VALUES.length; i++) {
+            // Greedily emit the largest symbol that still fits.
+            while (remaining >= C_ROMAN_VALUES[i]) {
+                roman.append(C_ROMAN_SYMBOLS[i]);
+                remaining -= C_ROMAN_VALUES[i];
+            }
+        }
+        return roman.toString();
+    }
+}
diff --git a/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordToFoExtractor.java b/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordToFoExtractor.java
new file mode 100644 (file)
index 0000000..ccc5a2a
--- /dev/null
@@ -0,0 +1,642 @@
+/*
+ *  ====================================================================
+ *    Licensed to the Apache Software Foundation (ASF) under one or more
+ *    contributor license agreements.  See the NOTICE file distributed with
+ *    this work for additional information regarding copyright ownership.
+ *    The ASF licenses this file to You under the Apache License, Version 2.0
+ *    (the "License"); you may not use this file except in compliance with
+ *    the License.  You may obtain a copy of the License at
+ *
+ *        http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *    Unless required by applicable law or agreed to in writing, software
+ *    distributed under the License is distributed on an "AS IS" BASIS,
+ *    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *    See the License for the specific language governing permissions and
+ *    limitations under the License.
+ * ====================================================================
+ */
+
+package org.apache.poi.hwpf.extractor;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.Map;
+
+import javax.xml.parsers.DocumentBuilderFactory;
+import javax.xml.transform.OutputKeys;
+import javax.xml.transform.Transformer;
+import javax.xml.transform.TransformerFactory;
+import javax.xml.transform.dom.DOMSource;
+import javax.xml.transform.stream.StreamResult;
+
+import org.apache.poi.hwpf.HWPFDocument;
+import org.apache.poi.hwpf.model.ListFormatOverride;
+import org.apache.poi.hwpf.model.ListTables;
+import org.apache.poi.hwpf.usermodel.CharacterRun;
+import org.apache.poi.hwpf.usermodel.Paragraph;
+import org.apache.poi.hwpf.usermodel.Picture;
+import org.apache.poi.hwpf.usermodel.Range;
+import org.apache.poi.hwpf.usermodel.Section;
+import org.apache.poi.hwpf.usermodel.SectionProperties;
+import org.apache.poi.hwpf.usermodel.Table;
+import org.apache.poi.hwpf.usermodel.TableCell;
+import org.apache.poi.hwpf.usermodel.TableIterator;
+import org.apache.poi.hwpf.usermodel.TableRow;
+import org.w3c.dom.Document;
+import org.w3c.dom.Element;
+import org.w3c.dom.Text;
+
+import static org.apache.poi.hwpf.extractor.WordToFoUtils.TWIPS_PER_INCH;
+
+/**
+ * @author Sergey Vladimirov (vlsergey {at} gmail {dot} com)
+ */
+public class WordToFoExtractor {
+
+    /** Control character 7 (BEL); stripped from the end of run text when inside a table. */
+    private static final byte BEL_MARK = 7;
+
+    /** Control character 19; a run starting with it is treated as the beginning of a field. */
+    private static final byte FIELD_BEGIN_MARK = 19;
+
+    /** Control character 21; a run starting with it is treated as the end of a field. */
+    private static final byte FIELD_END_MARK = 21;
+
+    /** Control character 20; a run starting with it is treated as a field separator. */
+    private static final byte FIELD_SEPARATOR_MARK = 20;
+
+    /** Namespace URI used for every <code>fo:*</code> element this class creates. */
+    private static final String NS_XSLFO = "http://www.w3.org/1999/XSL/Format";
+
+    /**
+     * Reads a Word document from a file. The input stream is closed before
+     * returning, whether or not parsing succeeded; a failure while closing
+     * is deliberately ignored.
+     *
+     * @param docFile file to read
+     * @return the parsed document
+     * @throws IOException if the file cannot be opened or parsed
+     */
+    private static HWPFDocument loadDoc(File docFile) throws IOException {
+        final FileInputStream input = new FileInputStream(docFile);
+        final HWPFDocument loaded;
+        try {
+            loaded = new HWPFDocument(input);
+        } finally {
+            try {
+                input.close();
+            } catch (Exception ignored) {
+                // best-effort close: nothing useful can be done here
+            }
+        }
+        return loaded;
+    }
+
+    /**
+     * Loads the given Word file and converts it into a new DOM document
+     * holding the XSL-FO tree.
+     *
+     * @param docFile Word file to convert
+     * @return the DOM document populated by the converter
+     * @throws Exception if loading the file or creating the DOM document fails
+     */
+    static Document process(File docFile) throws Exception {
+        final HWPFDocument source = loadDoc(docFile);
+        final Document target = DocumentBuilderFactory.newInstance()
+                .newDocumentBuilder().newDocument();
+
+        final WordToFoExtractor extractor = new WordToFoExtractor(target);
+        extractor.processDocument(source);
+        return extractor.getDocument();
+    }
+
+    /** DOM document that receives the generated XSL-FO tree. */
+    private final Document document;
+
+    /** The <code>fo:layout-master-set</code> element; page masters are appended to it. */
+    private final Element layoutMasterSet;
+
+    /** The <code>fo:root</code> element; page sequences are appended to it. */
+    private final Element root;
+
+    /**
+     * Creates a converter writing into the given DOM document. An
+     * <code>fo:root</code> element is appended to the document and an
+     * <code>fo:layout-master-set</code> element is appended to that root.
+     *
+     * @param document DOM document to populate
+     */
+    public WordToFoExtractor(Document document) throws Exception {
+        final Element foRoot = document.createElementNS(NS_XSLFO, "fo:root");
+        document.appendChild(foRoot);
+
+        final Element masters = document.createElementNS(NS_XSLFO,
+                "fo:layout-master-set");
+        foRoot.appendChild(masters);
+
+        this.document = document;
+        this.root = foRoot;
+        this.layoutMasterSet = masters;
+    }
+
+    /**
+     * Creates an <code>fo:flow</code> element with the given
+     * <code>flow-name</code> and appends it to the page sequence.
+     *
+     * @return the new flow element
+     */
+    protected Element addFlowToPageSequence(final Element pageSequence,
+            String flowName) {
+        final Element newFlow = document.createElementNS(NS_XSLFO, "fo:flow");
+        newFlow.setAttribute("flow-name", flowName);
+        pageSequence.appendChild(newFlow);
+        return newFlow;
+    }
+
+    /** Creates a list item, appends it to <code>listBlock</code> and returns it. */
+    protected Element addListItem(Element listBlock) {
+        final Element item = createListItem();
+        listBlock.appendChild(item);
+        return item;
+    }
+
+    /** Creates a list item body, appends it to <code>listItem</code> and returns it. */
+    protected Element addListItemBody(Element listItem) {
+        final Element body = createListItemBody();
+        listItem.appendChild(body);
+        return body;
+    }
+
+    /**
+     * Creates a list item label showing <code>text</code>, appends it to
+     * <code>listItem</code> and returns it.
+     */
+    protected Element addListItemLabel(Element listItem, String text) {
+        final Element label = createListItemLabel(text);
+        listItem.appendChild(label);
+        return label;
+    }
+
+    /**
+     * Creates an <code>fo:page-sequence</code> whose
+     * <code>master-reference</code> is <code>pageMaster</code> and appends it
+     * to the root element.
+     *
+     * @return the new page sequence element
+     */
+    protected Element addPageSequence(String pageMaster) {
+        final Element sequence = document.createElementNS(NS_XSLFO,
+                "fo:page-sequence");
+        sequence.setAttribute("master-reference", pageMaster);
+        root.appendChild(sequence);
+        return sequence;
+    }
+
+    /**
+     * Creates an <code>fo:region-body</code> element and appends it to the
+     * given page master.
+     *
+     * @return the new region body element
+     */
+    protected Element addRegionBody(Element pageMaster) {
+        final Element body = document.createElementNS(NS_XSLFO,
+                "fo:region-body");
+        pageMaster.appendChild(body);
+        return body;
+    }
+
+    /**
+     * Creates an <code>fo:simple-page-master</code> with the given
+     * <code>master-name</code> and appends it to the layout master set.
+     *
+     * @return the new page master element
+     */
+    protected Element addSimplePageMaster(String masterName) {
+        final Element master = document.createElementNS(NS_XSLFO,
+                "fo:simple-page-master");
+        master.setAttribute("master-name", masterName);
+        layoutMasterSet.appendChild(master);
+        return master;
+    }
+
+    /** Creates an <code>fo:table</code> element, appends it to <code>flow</code> and returns it. */
+    protected Element addTable(Element flow) {
+        final Element foTable = document.createElementNS(NS_XSLFO, "fo:table");
+        flow.appendChild(foTable);
+        return foTable;
+    }
+
+    /** Creates a new, unattached <code>fo:block</code> element. */
+    protected Element createBlock() {
+        final Element block = document.createElementNS(NS_XSLFO, "fo:block");
+        return block;
+    }
+
+    /**
+     * Creates a new, unattached <code>fo:external-graphic</code> element
+     * whose <code>src</code> attribute wraps <code>source</code> in
+     * <code>url('...')</code>.
+     */
+    protected Element createExternalGraphic(String source) {
+        final String srcValue = "url('" + source + "')";
+        final Element graphic = document.createElementNS(NS_XSLFO,
+                "fo:external-graphic");
+        graphic.setAttribute("src", srcValue);
+        return graphic;
+    }
+
+    /** Creates a new, unattached <code>fo:inline</code> element. */
+    protected Element createInline() {
+        final Element inline = document.createElementNS(NS_XSLFO, "fo:inline");
+        return inline;
+    }
+
+    /** Creates a new, unattached <code>fo:leader</code> element. */
+    protected Element createLeader() {
+        final Element leader = document.createElementNS(NS_XSLFO, "fo:leader");
+        return leader;
+    }
+
+    /** Creates a new, unattached <code>fo:list-block</code> element. */
+    protected Element createListBlock() {
+        final Element listBlock = document.createElementNS(NS_XSLFO,
+                "fo:list-block");
+        return listBlock;
+    }
+
+    /** Creates a new, unattached <code>fo:list-item</code> element. */
+    protected Element createListItem() {
+        final Element listItem = document.createElementNS(NS_XSLFO,
+                "fo:list-item");
+        return listItem;
+    }
+
+    /** Creates a new, unattached <code>fo:list-item-body</code> element. */
+    protected Element createListItemBody() {
+        final Element itemBody = document.createElementNS(NS_XSLFO,
+                "fo:list-item-body");
+        return itemBody;
+    }
+
+    /**
+     * Creates a new, unattached <code>fo:list-item-label</code> element
+     * containing a single block with <code>text</code> as its only child.
+     */
+    protected Element createListItemLabel(String text) {
+        final Element label = document.createElementNS(NS_XSLFO,
+                "fo:list-item-label");
+
+        final Element labelBlock = createBlock();
+        labelBlock.appendChild(document.createTextNode(text));
+
+        label.appendChild(labelBlock);
+        return label;
+    }
+
+    /**
+     * Registers a simple page master describing the page geometry of one
+     * section. Page size and margins are read from <code>sep</code> and
+     * divided by {@link WordToFoUtils#TWIPS_PER_INCH} to obtain inches.
+     *
+     * @param sep     section properties supplying page size, margins and columns
+     * @param type    prefix of the generated master name
+     * @param section number appended to the generated master name
+     * @return the master name, <code>type + "-page" + section</code>
+     */
+    protected String createPageMaster(SectionProperties sep, String type,
+            int section) {
+        final float pageHeightIn = sep.getYaPage() / TWIPS_PER_INCH;
+        final float pageWidthIn = sep.getXaPage() / TWIPS_PER_INCH;
+        final float marginLeftIn = sep.getDxaLeft() / TWIPS_PER_INCH;
+        final float marginRightIn = sep.getDxaRight() / TWIPS_PER_INCH;
+        final float marginTopIn = sep.getDyaTop() / TWIPS_PER_INCH;
+        final float marginBottomIn = sep.getDyaBottom() / TWIPS_PER_INCH;
+
+        final String masterName = type + "-page" + section;
+
+        final Element master = addSimplePageMaster(masterName);
+        master.setAttribute("page-height", pageHeightIn + "in");
+        master.setAttribute("page-width", pageWidthIn + "in");
+
+        // Margin shorthand order: top, right, bottom, left.
+        final String margins = marginTopIn + "in " + marginRightIn + "in "
+                + marginBottomIn + "in " + marginLeftIn + "in";
+        final Element body = addRegionBody(master);
+        body.setAttribute("margin", margins);
+
+        /*
+         * 6.4.14 fo:region-body
+         *
+         * The values of the padding and border-width traits must be "0".
+         */
+        // WordToFoUtils.setBorder(regionBody, sep.getBrcTop(), "top");
+        // WordToFoUtils.setBorder(regionBody, sep.getBrcBottom(), "bottom");
+        // WordToFoUtils.setBorder(regionBody, sep.getBrcLeft(), "left");
+        // WordToFoUtils.setBorder(regionBody, sep.getBrcRight(), "right");
+
+        if (sep.getCcolM1() <= 0) {
+            // single column: no column attributes needed
+            return masterName;
+        }
+
+        // getCcolM1() is the column count minus one.
+        body.setAttribute("column-count", String.valueOf(sep.getCcolM1() + 1));
+        final String gap;
+        if (sep.getFEvenlySpaced()) {
+            gap = (sep.getDxaColumns() / TWIPS_PER_INCH) + "in";
+        } else {
+            gap = "0.25in";
+        }
+        body.setAttribute("column-gap", gap);
+
+        return masterName;
+    }
+
+    /** Creates a new, unattached <code>fo:table-body</code> element. */
+    protected Element createTableBody() {
+        final Element body = document.createElementNS(NS_XSLFO, "fo:table-body");
+        return body;
+    }
+
+    /** Creates a new, unattached <code>fo:table-cell</code> element. */
+    protected Element createTableCell() {
+        final Element cell = document.createElementNS(NS_XSLFO, "fo:table-cell");
+        return cell;
+    }
+
+    /** Creates a new, unattached <code>fo:table-header</code> element. */
+    protected Element createTableHeader() {
+        final Element header = document.createElementNS(NS_XSLFO,
+                "fo:table-header");
+        return header;
+    }
+
+    /** Creates a new, unattached <code>fo:table-row</code> element. */
+    protected Element createTableRow() {
+        final Element row = document.createElementNS(NS_XSLFO, "fo:table-row");
+        return row;
+    }
+
+    /** Creates a new, unattached text node holding <code>data</code>. */
+    protected Text createText(String data) {
+        final Text node = document.createTextNode(data);
+        return node;
+    }
+
+    /** Returns the DOM document this converter writes into. */
+    public Document getDocument() {
+        return this.document;
+    }
+
+    /**
+     * Converts every section of the document's overall range, in order,
+     * passing each section's zero-based index to
+     * {@link #processSection(HWPFDocument, Section, int)}.
+     */
+    public void processDocument(HWPFDocument hwpfDocument) {
+        final Range wholeDocument = hwpfDocument.getRange();
+
+        int index = 0;
+        while (index < wholeDocument.numSections()) {
+            processSection(hwpfDocument, wholeDocument.getSection(index), index);
+            index++;
+        }
+    }
+
+    /**
+     * Hook invoked by
+     * {@link #tryImageWithinField(HWPFDocument, Paragraph, int, Element)}
+     * when a picture is found inside a field. The default implementation
+     * does nothing.
+     *
+     * @param currentBlock block element of the paragraph being converted
+     * @param picture      the extracted picture
+     */
+    @SuppressWarnings("unused")
+    protected void processImage(Element currentBlock, Picture picture) {
+       // no default implementation -- skip
+    }
+
+    /**
+     * Converts one paragraph into an <code>fo:block</code> appended to
+     * <code>parentFopElement</code>.
+     *
+     * <p>
+     * The first character run supplies the block-level font family, size,
+     * bold and italic settings; each later run becomes an
+     * <code>fo:inline</code> that overrides only what differs. Field marks,
+     * special characters and embedded objects are skipped. A run starting
+     * with a field-begin mark is offered to
+     * {@link #tryImageWithinField(HWPFDocument, Paragraph, int, Element)}.
+     * </p>
+     *
+     * @param hwpfDocument      document being converted
+     * @param parentFopElement  element that receives the new block
+     * @param currentTableLevel table nesting level; when non-zero a trailing
+     *                          BEL character is stripped from run text
+     * @param paragraph         paragraph to convert
+     * @param bulletText        list label placed before the text; may be empty
+     */
+    protected void processParagraph(HWPFDocument hwpfDocument,
+            Element parentFopElement, int currentTableLevel,
+            Paragraph paragraph, String bulletText) {
+        final Element block = createBlock();
+        parentFopElement.appendChild(block);
+
+        WordToFoUtils.setParagraphProperties(paragraph, block);
+
+        final int charRuns = paragraph.numCharacterRuns();
+
+        if (charRuns == 0) {
+            return;
+        }
+
+        // Block-level defaults come from the first run.
+        final CharacterRun firstRun = paragraph.getCharacterRun(0);
+        final int pFontSize = firstRun.getFontSize() / 2;
+        final String pFontName = firstRun.getFontName();
+        final boolean pBold = firstRun.isBold();
+        final boolean pItalic = firstRun.isItalic();
+
+        WordToFoUtils.setFontFamily(block, pFontName);
+        WordToFoUtils.setFontSize(block, pFontSize);
+        WordToFoUtils.setBold(block, pBold);
+        WordToFoUtils.setItalic(block, pItalic);
+
+        // Everything emitted as text, used below to detect an empty line.
+        StringBuilder lineText = new StringBuilder();
+
+        if (WordToFoUtils.isNotEmpty(bulletText)) {
+            Element inline = createInline();
+            block.appendChild(inline);
+
+            Text textNode = createText(bulletText);
+            inline.appendChild(textNode);
+
+            lineText.append(bulletText);
+        }
+
+        for (int c = 0; c < charRuns; c++) {
+            CharacterRun characterRun = paragraph.getCharacterRun(c);
+
+            String text = characterRun.text();
+            if (text.length() == 0)
+                continue;
+
+            // Inspect the character itself rather than text.getBytes()[0]:
+            // getBytes() encodes with the platform charset, so the first
+            // byte is not guaranteed to equal the control character.
+            final char firstChar = text.charAt(0);
+
+            if (firstChar == FIELD_BEGIN_MARK) {
+                // Returns c unchanged when no image was found, otherwise
+                // the index of the field-end run, so later runs of an image
+                // field are skipped.
+                c = tryImageWithinField(hwpfDocument, paragraph, c, block);
+                continue;
+            }
+            if (firstChar == FIELD_SEPARATOR_MARK
+                    || firstChar == FIELD_END_MARK) {
+                continue;
+            }
+
+            if (characterRun.isSpecialCharacter() || characterRun.isObj()
+                    || characterRun.isOle2()) {
+                continue;
+            }
+
+            // Only emit attributes that differ from the block defaults.
+            Element inline = createInline();
+            if (characterRun.isBold() != pBold) {
+                WordToFoUtils.setBold(inline, characterRun.isBold());
+            }
+            if (characterRun.isItalic() != pItalic) {
+                WordToFoUtils.setItalic(inline, characterRun.isItalic());
+            }
+            if (!WordToFoUtils.equals(characterRun.getFontName(), pFontName)) {
+                WordToFoUtils.setFontFamily(inline, characterRun.getFontName());
+            }
+            if (characterRun.getFontSize() / 2 != pFontSize) {
+                WordToFoUtils.setFontSize(inline,
+                        characterRun.getFontSize() / 2);
+            }
+            WordToFoUtils.setCharactersProperties(characterRun, inline);
+            block.appendChild(inline);
+
+            // Drop the trailing paragraph mark, or the trailing BEL when
+            // inside a table.
+            if (text.endsWith("\r")
+                    || (text.charAt(text.length() - 1) == BEL_MARK && currentTableLevel != 0))
+                text = text.substring(0, text.length() - 1);
+
+            Text textNode = createText(text);
+            inline.appendChild(textNode);
+
+            lineText.append(text);
+        }
+
+        // A paragraph without visible text still gets a leader element.
+        if (lineText.toString().trim().length() == 0) {
+            Element leader = createLeader();
+            block.appendChild(leader);
+        }
+    }
+
+    /**
+     * Converts one section: registers a page master for it, opens a page
+     * sequence with an <code>xsl-region-body</code> flow, and converts the
+     * section's paragraphs into that flow at table level 0.
+     *
+     * @param sectionCounter index of the section, used in the page master name
+     */
+    protected void processSection(HWPFDocument hwpfDocument, Section section,
+            int sectionCounter) {
+        final SectionProperties properties = WordToFoUtils
+                .getSectionProperties(section);
+        final String masterName = createPageMaster(properties, "page",
+                sectionCounter);
+
+        final Element sequence = addPageSequence(masterName);
+        final Element bodyFlow = addFlowToPageSequence(sequence,
+                "xsl-region-body");
+
+        processSectionParagraphes(hwpfDocument, bodyFlow, section, 0);
+    }
+
+    /**
+     * Converts the paragraphs of <code>range</code> into children of
+     * <code>flow</code>.
+     *
+     * <p>
+     * Tables one level deeper than <code>currentTableLevel</code> are
+     * indexed by start offset first. A paragraph starting at such an offset
+     * is replaced by the converted table. Other in-table paragraphs whose
+     * level differs from <code>currentTableLevel</code> are skipped.
+     * Paragraphs with a non-zero list override index get a bullet label.
+     * </p>
+     *
+     * @param currentTableLevel table nesting level of <code>range</code>
+     */
+    protected void processSectionParagraphes(HWPFDocument hwpfDocument,
+            Element flow, Range range, int currentTableLevel) {
+        // Index the nested tables by the offset of their first paragraph.
+        final Map<Integer, Table> tablesByOffset = new HashMap<Integer, Table>();
+        final TableIterator tables = WordToFoUtils.newTableIterator(range,
+                currentTableLevel + 1);
+        while (tables.hasNext()) {
+            final Table found = tables.next();
+            tablesByOffset.put(Integer.valueOf(found.getStartOffset()), found);
+        }
+
+        final ListTables listTables = hwpfDocument.getListTables();
+
+        final int total = range.numParagraphs();
+        for (int index = 0; index < total; index++) {
+            final Paragraph paragraph = range.getParagraph(index);
+
+            final Integer offset = Integer.valueOf(paragraph.getStartOffset());
+            if (tablesByOffset.containsKey(offset)) {
+                processTable(hwpfDocument, flow, tablesByOffset.get(offset),
+                        currentTableLevel + 1);
+                continue;
+            }
+
+            final boolean atOtherLevel = paragraph.isInTable()
+                    && paragraph.getTableLevel() != currentTableLevel;
+            if (atOtherLevel) {
+                continue;
+            }
+
+            final int listInfo = paragraph.getIlfo();
+            if (listInfo == 0) {
+                // not part of a list: no label
+                processParagraph(hwpfDocument, flow, currentTableLevel,
+                        paragraph, WordToFoUtils.EMPTY);
+                continue;
+            }
+
+            final ListFormatOverride override = listTables
+                    .getOverride(paragraph.getIlfo());
+            final String label = WordToFoUtils.getBulletText(listTables,
+                    paragraph, override.getLsid());
+            processParagraph(hwpfDocument, flow, currentTableLevel, paragraph,
+                    label);
+        }
+    }
+
+    /**
+     * Converts a Word table into an <code>fo:table</code> appended to
+     * <code>flow</code>.
+     *
+     * <p>
+     * Rows flagged as table header go into an <code>fo:table-header</code>,
+     * all others into an <code>fo:table-body</code>. Cells that continue a
+     * horizontal or vertical merge are omitted; the first cell of a merge
+     * receives the matching <code>number-columns-spanned</code> or
+     * <code>number-rows-spanned</code> attribute. The last cell of a row
+     * shorter than the widest row is stretched to the table's right edge.
+     * </p>
+     *
+     * @param thisTableLevel nesting level of this table, passed on when the
+     *                       cell contents are converted
+     */
+    protected void processTable(HWPFDocument hwpfDocument, Element flow,
+            Table table, int thisTableLevel) {
+        Element tableElement = addTable(flow);
+
+        Element tableHeader = createTableHeader();
+        Element tableBody = createTableBody();
+
+        final int tableRows = table.numRows();
+
+        // The widest row decides how far a short row's last cell must span.
+        int maxColumns = Integer.MIN_VALUE;
+        for (int r = 0; r < tableRows; r++) {
+            maxColumns = Math.max(maxColumns, table.getRow(r).numCells());
+        }
+
+        for (int r = 0; r < tableRows; r++) {
+            TableRow tableRow = table.getRow(r);
+
+            Element tableRowElement = createTableRow();
+            WordToFoUtils.setTableRowProperties(tableRow, tableRowElement);
+
+            final int rowCells = tableRow.numCells();
+            for (int c = 0; c < rowCells; c++) {
+                TableCell tableCell = tableRow.getCell(c);
+
+                // Continuation cells are covered by the first merged cell.
+                if (tableCell.isMerged() && !tableCell.isFirstMerged())
+                    continue;
+
+                if (tableCell.isVerticallyMerged()
+                        && !tableCell.isFirstVerticallyMerged())
+                    continue;
+
+                Element tableCellElement = createTableCell();
+                WordToFoUtils.setTableCellProperties(tableRow, tableCell,
+                        tableCellElement, r == 0, r == tableRows - 1, c == 0,
+                        c == rowCells - 1);
+
+                if (tableCell.isFirstMerged()) {
+                    // Count this cell plus the merged cells right after it.
+                    int count = 0;
+                    for (int c1 = c; c1 < rowCells; c1++) {
+                        TableCell nextCell = tableRow.getCell(c1);
+                        if (!nextCell.isMerged())
+                            break;
+                        count++;
+                    }
+                    tableCellElement.setAttribute("number-columns-spanned", ""
+                            + count);
+                } else {
+                    if (c == rowCells - 1 && c != maxColumns - 1) {
+                        tableCellElement.setAttribute("number-columns-spanned",
+                                "" + (maxColumns - c));
+                    }
+                }
+
+                if (tableCell.isFirstVerticallyMerged()) {
+                    // Count this row plus the rows below whose cell in the
+                    // same column continues the vertical merge.
+                    int count = 0;
+                    for (int r1 = r; r1 < tableRows; r1++) {
+                        TableRow nextRow = table.getRow(r1);
+                        // Column c exists only when numCells() > c; the
+                        // former "< c" test let getCell(c) run out of
+                        // bounds when numCells() == c.
+                        if (nextRow.numCells() <= c)
+                            break;
+                        TableCell nextCell = nextRow.getCell(c);
+                        if (!nextCell.isVerticallyMerged())
+                            break;
+                        count++;
+                    }
+                    tableCellElement.setAttribute("number-rows-spanned", ""
+                            + count);
+                }
+
+                processSectionParagraphes(hwpfDocument, tableCellElement,
+                        tableCell, thisTableLevel);
+
+                // Make sure an empty cell still holds one block.
+                if (!tableCellElement.hasChildNodes()) {
+                    tableCellElement.appendChild(createBlock());
+                }
+
+                tableRowElement.appendChild(tableCellElement);
+            }
+
+            if (tableRow.isTableHeader()) {
+                tableHeader.appendChild(tableRowElement);
+            } else {
+                tableBody.appendChild(tableRowElement);
+            }
+        }
+
+        if (tableHeader.hasChildNodes()) {
+            tableElement.appendChild(tableHeader);
+        }
+        if (tableBody.hasChildNodes()) {
+            tableElement.appendChild(tableBody);
+        } else {
+            System.err.println("Table without body");
+        }
+    }
+
+    /**
+     * Checks whether the field opened by the run at <code>beginMark</code>
+     * contains exactly one picture and, if so, passes that picture to
+     * {@link #processImage(Element, Picture)}.
+     *
+     * <p>
+     * Runs after <code>beginMark</code> are scanned up to the first
+     * field-end mark. The field qualifies only when exactly one separator
+     * mark and exactly one picture run were seen before that end mark.
+     * </p>
+     *
+     * @param hwpfDocument document supplying the pictures table
+     * @param paragraph    paragraph containing the field
+     * @param beginMark    index of the run holding the field-begin mark
+     * @param currentBlock block element passed on to <code>processImage</code>
+     * @return the index of the field-end run when a picture was processed,
+     *         otherwise <code>beginMark</code> unchanged
+     */
+    protected int tryImageWithinField(HWPFDocument hwpfDocument,
+            Paragraph paragraph, int beginMark, Element currentBlock) {
+        int separatorMark = -1;
+        int pictureMark = -1;
+        int endMark = -1;
+
+        final int runCount = paragraph.numCharacterRuns();
+        for (int c = beginMark + 1; c < runCount; c++) {
+            CharacterRun characterRun = paragraph.getCharacterRun(c);
+
+            String text = characterRun.text();
+            if (text.length() == 0)
+                continue;
+
+            // Compare the character itself: getBytes() would encode with
+            // the platform charset, whose first byte need not match.
+            final char firstChar = text.charAt(0);
+
+            if (firstChar == FIELD_SEPARATOR_MARK) {
+                if (separatorMark != -1) {
+                    // second separator: not a simple image field
+                    return beginMark;
+                }
+
+                separatorMark = c;
+                continue;
+            }
+
+            if (firstChar == FIELD_END_MARK) {
+                // The scan stops at the first end mark, so it can never be
+                // seen twice.
+                endMark = c;
+                break;
+            }
+
+            if (hwpfDocument.getPicturesTable().hasPicture(characterRun)) {
+                if (pictureMark != -1) {
+                    // second picture: not a simple image field
+                    return beginMark;
+                }
+
+                pictureMark = c;
+                continue;
+            }
+        }
+
+        if (separatorMark == -1 || pictureMark == -1 || endMark == -1)
+            return beginMark;
+
+        final CharacterRun pictureRun = paragraph.getCharacterRun(pictureMark);
+        final Picture picture = hwpfDocument.getPicturesTable().extractPicture(
+                pictureRun, true);
+        processImage(currentBlock, picture);
+
+        return endMark;
+    }
+
+
+    /**
+     * Java main() interface to interact with WordToFoExtractor
+     *
+     * <p>
+     *     Usage: WordToFoExtractor infile outfile
+     * </p>
+     * Where infile is an input .doc file ( Word 97-2007)
+     * which will be rendered as XSL-FO into outfile
+     *
+     * <p>
+     * The output is written as UTF-8. Any failure is reported by printing
+     * the stack trace.
+     * </p>
+     *
+     * @param args input file path followed by output file path
+     */
+    public static void main(String[] args) {
+        if (args.length < 2) {
+            System.err.println("Usage: WordToFoExtractor <inputFile.doc> <saveTo.fo>");
+            return;
+        }
+
+        System.out.println("Converting " + args[0]);
+        System.out.println("Saving output to " + args[1]);
+        try {
+            Document doc = WordToFoExtractor.process(new File(args[0]));
+
+            // Hand the serializer a byte stream, not a Writer, so the bytes
+            // on disk match the declared encoding. A FileWriter would
+            // re-encode the text with the platform default charset while
+            // the XML declaration still says UTF-8.
+            final java.io.OutputStream out = new java.io.FileOutputStream(args[1]);
+            try {
+                DOMSource domSource = new DOMSource(doc);
+                StreamResult streamResult = new StreamResult(out);
+                TransformerFactory tf = TransformerFactory.newInstance();
+                Transformer serializer = tf.newTransformer();
+                serializer.setOutputProperty(OutputKeys.ENCODING, "UTF-8");  // TODO set encoding from a command argument
+                serializer.setOutputProperty(OutputKeys.INDENT, "yes");
+                serializer.transform(domSource, streamResult);
+            } finally {
+                // Close even when serialization fails, so the file handle
+                // is not leaked.
+                out.close();
+            }
+        } catch (Exception e) {
+            e.printStackTrace();
+        }
+    }
+
+}
diff --git a/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordToFoUtils.java b/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordToFoUtils.java
new file mode 100644 (file)
index 0000000..f973faa
--- /dev/null
@@ -0,0 +1,443 @@
+package org.apache.poi.hwpf.extractor;
+
+import java.lang.reflect.Constructor;
+import java.lang.reflect.Field;
+
+import org.apache.poi.hwpf.model.ListLevel;
+import org.apache.poi.hwpf.model.ListTables;
+import org.apache.poi.hwpf.usermodel.BorderCode;
+import org.apache.poi.hwpf.usermodel.CharacterProperties;
+import org.apache.poi.hwpf.usermodel.CharacterRun;
+import org.apache.poi.hwpf.usermodel.Paragraph;
+import org.apache.poi.hwpf.usermodel.Range;
+import org.apache.poi.hwpf.usermodel.Section;
+import org.apache.poi.hwpf.usermodel.SectionProperties;
+import org.apache.poi.hwpf.usermodel.TableCell;
+import org.apache.poi.hwpf.usermodel.TableIterator;
+import org.apache.poi.hwpf.usermodel.TableRow;
+import org.w3c.dom.Element;
+
+public class WordToFoUtils {
+    /** Shared empty string, returned or passed where no text applies. */
+    static final String EMPTY = "";
+
+    /** Number of twips in one inch; used to express table measurements in inches. */
+    public static final float TWIPS_PER_INCH = 1440.0f;
+
+    /** Number of twips in one point; used to express paragraph measurements in points. */
+    public static final int TWIPS_PER_PT = 20;
+
+    /**
+     * Null-safe string comparison.
+     *
+     * @param str1 first string, may be null
+     * @param str2 second string, may be null
+     * @return true when both are null or both hold equal content
+     */
+    static boolean equals(String str1, String str2) {
+        if (str1 == null) {
+            return str2 == null;
+        }
+        return str1.equals(str2);
+    }
+
+    public static String getBorderType(BorderCode borderCode) {
+       if (borderCode == null)
+           throw new IllegalArgumentException("borderCode is null");
+
+       switch (borderCode.getBorderType()) {
+       case 1:
+       case 2:
+           return "solid";
+       case 3:
+           return "double";
+       case 5:
+           return "solid";
+       case 6:
+           return "dotted";
+       case 7:
+       case 8:
+           return "dashed";
+       case 9:
+           return "dotted";
+       case 10:
+       case 11:
+       case 12:
+       case 13:
+       case 14:
+       case 15:
+       case 16:
+       case 17:
+       case 18:
+       case 19:
+           return "double";
+       case 20:
+           return "solid";
+       case 21:
+           return "double";
+       case 22:
+           return "dashed";
+       case 23:
+           return "dashed";
+       case 24:
+           return "ridge";
+       case 25:
+           return "grooved";
+       default:
+           return "solid";
+       }
+    }
+
+    public static String getBorderWidth(BorderCode borderCode) {
+       int lineWidth = borderCode.getLineWidth();
+       int pt = lineWidth / 8;
+       int pte = lineWidth - pt * 8;
+
+       StringBuilder stringBuilder = new StringBuilder();
+       stringBuilder.append(pt);
+       stringBuilder.append(".");
+       stringBuilder.append(1000 / 8 * pte);
+       stringBuilder.append("pt");
+       return stringBuilder.toString();
+    }
+
+    public static String getBulletText(ListTables listTables,
+           Paragraph paragraph, int listId) {
+       final ListLevel listLevel = listTables.getLevel(listId,
+               paragraph.getIlvl());
+
+       if (listLevel.getNumberText() == null)
+           return EMPTY;
+
+       StringBuffer bulletBuffer = new StringBuffer();
+       char[] xst = listLevel.getNumberText().toCharArray();
+       for (char element : xst) {
+           if (element < 9) {
+               ListLevel numLevel = listTables.getLevel(listId, element);
+
+               int num = numLevel.getStartAt();
+               bulletBuffer.append(NumberFormatter.getNumber(num,
+                       listLevel.getNumberFormat()));
+
+               if (numLevel == listLevel) {
+                   numLevel.setStartAt(numLevel.getStartAt() + 1);
+               }
+
+           } else {
+               bulletBuffer.append(element);
+           }
+       }
+
+       byte follow = getIxchFollow(listLevel);
+       switch (follow) {
+       case 0:
+           bulletBuffer.append("\t");
+           break;
+       case 1:
+           bulletBuffer.append(" ");
+           break;
+       default:
+           break;
+       }
+
+       return bulletBuffer.toString();
+    }
+
+    public static String getColor(int ico) {
+       switch (ico) {
+       case 1:
+           return "black";
+       case 2:
+           return "blue";
+       case 3:
+           return "cyan";
+       case 4:
+           return "green";
+       case 5:
+           return "magenta";
+       case 6:
+           return "red";
+       case 7:
+           return "yellow";
+       case 8:
+           return "white";
+       case 9:
+           return "darkblue";
+       case 10:
+           return "darkcyan";
+       case 11:
+           return "darkgreen";
+       case 12:
+           return "darkmagenta";
+       case 13:
+           return "darkred";
+       case 14:
+           return "darkyellow";
+       case 15:
+           return "darkgray";
+       case 16:
+           return "lightgray";
+       default:
+           return "black";
+       }
+    }
+
+    public static byte getIxchFollow(ListLevel listLevel) {
+       try {
+           Field field = ListLevel.class.getDeclaredField("_ixchFollow");
+           field.setAccessible(true);
+           return ((Byte) field.get(listLevel)).byteValue();
+       } catch (Exception exc) {
+           throw new Error(exc);
+       }
+    }
+
+    public static String getListItemNumberLabel(int number, int format) {
+
+       if (format != 0)
+           System.err.println("NYI: toListItemNumberLabel(): " + format);
+
+       return String.valueOf(number);
+    }
+
+    public static SectionProperties getSectionProperties(Section section) {
+       try {
+           Field field = Section.class.getDeclaredField("_props");
+           field.setAccessible(true);
+           return (SectionProperties) field.get(section);
+       } catch (Exception exc) {
+           throw new Error(exc);
+       }
+    }
+
+    /**
+     * @param str
+     *            string to test, may be null
+     * @return true when str is null or has no characters
+     */
+    static boolean isEmpty(String str) {
+        if (str == null) {
+            return true;
+        }
+        return str.length() == 0;
+    }
+
+    /**
+     * @param str
+     *            string to test, may be null
+     * @return true when str is non-null and has at least one character
+     */
+    static boolean isNotEmpty(String str) {
+        return str != null && str.length() > 0;
+    }
+
+    public static TableIterator newTableIterator(Range range, int level) {
+       try {
+           Constructor<TableIterator> constructor = TableIterator.class
+                   .getDeclaredConstructor(Range.class, int.class);
+           constructor.setAccessible(true);
+           return constructor.newInstance(range, Integer.valueOf(level));
+       } catch (Exception exc) {
+           throw new Error(exc);
+       }
+    }
+
+    public static void setBold(final Element element, final boolean bold) {
+       element.setAttribute("font-weight", bold ? "bold" : "normal");
+    }
+
+    public static void setBorder(Element element, BorderCode borderCode,
+           String where) {
+       if (element == null)
+           throw new IllegalArgumentException("element is null");
+
+       if (borderCode == null)
+           return;
+
+       if (isEmpty(where)) {
+           element.setAttribute("border-style", getBorderType(borderCode));
+           element.setAttribute("border-color",
+                   getColor(borderCode.getColor()));
+           element.setAttribute("border-width", getBorderWidth(borderCode));
+       } else {
+           element.setAttribute("border-" + where + "-style",
+                   getBorderType(borderCode));
+           element.setAttribute("border-" + where + "-color",
+                   getColor(borderCode.getColor()));
+           element.setAttribute("border-" + where + "-width",
+                   getBorderWidth(borderCode));
+       }
+    }
+
+    public static void setCharactersProperties(final CharacterRun characterRun,
+           final Element inline) {
+       final CharacterProperties clonedProperties = characterRun
+               .cloneProperties();
+       StringBuilder textDecorations = new StringBuilder();
+
+       setBorder(inline, clonedProperties.getBrc(), EMPTY);
+
+       if (characterRun.isCapitalized()) {
+           inline.setAttribute("text-transform", "uppercase");
+       }
+       if (characterRun.isHighlighted()) {
+           inline.setAttribute("background-color",
+                   getColor(clonedProperties.getIcoHighlight()));
+       }
+       if (characterRun.isStrikeThrough()) {
+           if (textDecorations.length() > 0)
+               textDecorations.append(" ");
+           textDecorations.append("line-through");
+       }
+       if (characterRun.isShadowed()) {
+           inline.setAttribute("text-shadow", characterRun.getFontSize() / 24
+                   + "pt");
+       }
+       if (characterRun.isSmallCaps()) {
+           inline.setAttribute("font-variant", "small-caps");
+       }
+       if (characterRun.getSubSuperScriptIndex() == 1) {
+           inline.setAttribute("baseline-shift", "super");
+           inline.setAttribute("font-size", "smaller");
+       }
+       if (characterRun.getSubSuperScriptIndex() == 2) {
+           inline.setAttribute("baseline-shift", "sub");
+           inline.setAttribute("font-size", "smaller");
+       }
+       if (characterRun.getUnderlineCode() > 0) {
+           if (textDecorations.length() > 0)
+               textDecorations.append(" ");
+           textDecorations.append("underline");
+       }
+       if (textDecorations.length() > 0) {
+           inline.setAttribute("text-decoration", textDecorations.toString());
+       }
+    }
+
+    public static void setFontFamily(final Element element,
+           final String fontFamily) {
+       element.setAttribute("font-family", fontFamily);
+    }
+
+    public static void setFontSize(final Element element, final int fontSize) {
+       element.setAttribute("font-size", String.valueOf(fontSize));
+    }
+
+    public static void setIndent(Paragraph paragraph, Element block) {
+       if (paragraph.getFirstLineIndent() != 0) {
+           block.setAttribute(
+                   "text-indent",
+                   String.valueOf(paragraph.getFirstLineIndent()
+                           / TWIPS_PER_PT)
+                           + "pt");
+       }
+       if (paragraph.getIndentFromLeft() != 0) {
+           block.setAttribute(
+                   "start-indent",
+                   String.valueOf(paragraph.getIndentFromLeft() / TWIPS_PER_PT)
+                           + "pt");
+       }
+       if (paragraph.getIndentFromRight() != 0) {
+           block.setAttribute(
+                   "end-indent",
+                   String.valueOf(paragraph.getIndentFromRight()
+                           / TWIPS_PER_PT)
+                           + "pt");
+       }
+       if (paragraph.getSpacingBefore() != 0) {
+           block.setAttribute("space-before",
+                   String.valueOf(paragraph.getSpacingBefore() / TWIPS_PER_PT)
+                           + "pt");
+       }
+       if (paragraph.getSpacingAfter() != 0) {
+           block.setAttribute("space-after",
+                   String.valueOf(paragraph.getSpacingAfter() / TWIPS_PER_PT)
+                           + "pt");
+       }
+    }
+
+    public static void setItalic(final Element element, final boolean italic) {
+       element.setAttribute("font-style", italic ? "italic" : "normal");
+    }
+
+    public static void setJustification(Paragraph paragraph,
+           final Element element) {
+       final int justification = paragraph.getJustification();
+       switch (justification) {
+       case 0:
+           element.setAttribute("text-align", "start");
+           break;
+       case 1:
+           element.setAttribute("text-align", "center");
+           break;
+       case 2:
+           element.setAttribute("text-align", "end");
+           break;
+       case 3:
+           element.setAttribute("text-align", "justify");
+           break;
+       case 4:
+           element.setAttribute("text-align", "justify");
+           break;
+       case 5:
+           element.setAttribute("text-align", "center");
+           break;
+       case 6:
+           element.setAttribute("text-align", "left");
+           break;
+       case 7:
+           element.setAttribute("text-align", "start");
+           break;
+       case 8:
+           element.setAttribute("text-align", "end");
+           break;
+       case 9:
+           element.setAttribute("text-align", "justify");
+           break;
+       }
+    }
+
+    public static void setParagraphProperties(Paragraph paragraph, Element block) {
+       setIndent(paragraph, block);
+       setJustification(paragraph, block);
+
+       setBorder(block, paragraph.getBottomBorder(), "bottom");
+       setBorder(block, paragraph.getLeftBorder(), "left");
+       setBorder(block, paragraph.getRightBorder(), "right");
+       setBorder(block, paragraph.getTopBorder(), "top");
+
+       if (paragraph.pageBreakBefore()) {
+           block.setAttribute("break-before", "page");
+       }
+
+       block.setAttribute("hyphenate",
+               String.valueOf(paragraph.isAutoHyphenated()));
+
+       if (paragraph.keepOnPage()) {
+           block.setAttribute("keep-together.within-page", "always");
+       }
+
+       if (paragraph.keepWithNext()) {
+           block.setAttribute("keep-with-next.within-page", "always");
+       }
+
+       block.setAttribute("linefeed-treatment", "preserve");
+       block.setAttribute("white-space-collapse", "false");
+    }
+
+    public static void setTableCellProperties(TableRow tableRow,
+           TableCell tableCell, Element element, boolean toppest,
+           boolean bottomest, boolean leftest, boolean rightest) {
+       element.setAttribute("width", (tableCell.getWidth() / TWIPS_PER_INCH)
+               + "in");
+       element.setAttribute("padding-start",
+               (tableRow.getGapHalf() / TWIPS_PER_INCH) + "in");
+       element.setAttribute("padding-end",
+               (tableRow.getGapHalf() / TWIPS_PER_INCH) + "in");
+
+       BorderCode top = tableCell.getBrcTop() != null ? tableCell.getBrcTop()
+               : toppest ? tableRow.getTopBorder() : tableRow
+                       .getHorizontalBorder();
+       BorderCode bottom = tableCell.getBrcBottom() != null ? tableCell
+               .getBrcBottom() : bottomest ? tableRow.getBottomBorder()
+               : tableRow.getHorizontalBorder();
+
+       BorderCode left = tableCell.getBrcLeft() != null ? tableCell
+               .getBrcLeft() : leftest ? tableRow.getLeftBorder() : tableRow
+               .getVerticalBorder();
+       BorderCode right = tableCell.getBrcRight() != null ? tableCell
+               .getBrcRight() : rightest ? tableRow.getRightBorder()
+               : tableRow.getVerticalBorder();
+
+       setBorder(element, bottom, "bottom");
+       setBorder(element, left, "left");
+       setBorder(element, right, "right");
+       setBorder(element, top, "top");
+    }
+
+    public static void setTableRowProperties(TableRow tableRow,
+           Element tableRowElement) {
+       if (tableRow.getRowHeight() > 0) {
+           tableRowElement.setAttribute("height",
+                   (tableRow.getRowHeight() / TWIPS_PER_INCH) + "in");
+       }
+       if (!tableRow.cantSplit()) {
+           tableRowElement.setAttribute("keep-together", "always");
+       }
+    }
+
+}