You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

Word6Extractor.java 4.3KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146
  1. /* ====================================================================
  2. Licensed to the Apache Software Foundation (ASF) under one or more
  3. contributor license agreements. See the NOTICE file distributed with
  4. this work for additional information regarding copyright ownership.
  5. The ASF licenses this file to You under the Apache License, Version 2.0
  6. (the "License"); you may not use this file except in compliance with
  7. the License. You may obtain a copy of the License at
  8. http://www.apache.org/licenses/LICENSE-2.0
  9. Unless required by applicable law or agreed to in writing, software
  10. distributed under the License is distributed on an "AS IS" BASIS,
  11. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. See the License for the specific language governing permissions and
  13. limitations under the License.
  14. ==================================================================== */
  15. package org.apache.poi.hwpf.extractor;
  16. import java.io.IOException;
  17. import java.io.InputStream;
  18. import org.apache.poi.extractor.POIOLE2TextExtractor;
  19. import org.apache.poi.hwpf.HWPFOldDocument;
  20. import org.apache.poi.hwpf.converter.WordToTextConverter;
  21. import org.apache.poi.hwpf.usermodel.Range;
  22. import org.apache.poi.poifs.filesystem.DirectoryNode;
  23. import org.apache.poi.poifs.filesystem.POIFSFileSystem;
  24. /**
  25. * Class to extract the text from old (Word 6 / Word 95) Word Documents.
  26. *
  27. * This should only be used on the older files, for most uses you
  28. * should call {@link WordExtractor} which deals properly
  29. * with HWPF.
  30. *
  31. * @author Nick Burch
  32. */
  33. public final class Word6Extractor implements POIOLE2TextExtractor {
  34. private HWPFOldDocument doc;
  35. private boolean doCloseFilesystem = true;
  36. /**
  37. * Create a new Word Extractor
  38. * @param is InputStream containing the word file
  39. */
  40. public Word6Extractor(InputStream is) throws IOException {
  41. this( new POIFSFileSystem(is) );
  42. }
  43. /**
  44. * Create a new Word Extractor
  45. *
  46. * @param fs
  47. * POIFSFileSystem containing the word file
  48. */
  49. public Word6Extractor( POIFSFileSystem fs ) throws IOException {
  50. this( fs.getRoot() );
  51. }
  52. /**
  53. * @deprecated Use {@link #Word6Extractor(DirectoryNode)} instead
  54. */
  55. @Deprecated
  56. public Word6Extractor( DirectoryNode dir, POIFSFileSystem fs ) throws IOException {
  57. this( dir );
  58. }
  59. public Word6Extractor( DirectoryNode dir ) throws IOException {
  60. this( new HWPFOldDocument( dir ) );
  61. }
  62. /**
  63. * Create a new Word Extractor
  64. * @param doc The HWPFOldDocument to extract from
  65. */
  66. public Word6Extractor(HWPFOldDocument doc) {
  67. this.doc = doc;
  68. }
  69. /**
  70. * Get the text from the word file, as an array with one String
  71. * per paragraph
  72. */
  73. @Deprecated
  74. public String[] getParagraphText() {
  75. String[] ret;
  76. // Extract using the model code
  77. try {
  78. Range r = doc.getRange();
  79. ret = WordExtractor.getParagraphText(r);
  80. } catch (Exception e) {
  81. // Something's up with turning the text pieces into paragraphs
  82. // Fall back to ripping out the text pieces
  83. ret = new String[doc.getTextTable().getTextPieces().size()];
  84. for(int i=0; i<ret.length; i++) {
  85. ret[i] = doc.getTextTable().getTextPieces().get(i).getStringBuilder().toString();
  86. // Fix the line endings
  87. ret[i] = ret[i].replaceAll("\r", "\ufffe");
  88. ret[i] = ret[i].replaceAll("\ufffe","\r\n");
  89. }
  90. }
  91. return ret;
  92. }
  93. public String getText() {
  94. try {
  95. WordToTextConverter wordToTextConverter = new WordToTextConverter();
  96. wordToTextConverter.processDocument( doc );
  97. return wordToTextConverter.getText();
  98. } catch ( Exception exc ) {
  99. // fall-back
  100. StringBuilder text = new StringBuilder();
  101. for ( String t : getParagraphText() ) {
  102. text.append( t );
  103. }
  104. return text.toString();
  105. }
  106. }
  107. @Override
  108. public HWPFOldDocument getDocument() {
  109. return doc;
  110. }
  111. @Override
  112. public void setCloseFilesystem(boolean doCloseFilesystem) {
  113. this.doCloseFilesystem = doCloseFilesystem;
  114. }
  115. @Override
  116. public boolean isCloseFilesystem() {
  117. return doCloseFilesystem;
  118. }
  119. @Override
  120. public HWPFOldDocument getFilesystem() {
  121. return doc;
  122. }
  123. }