You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

FOText.java 18KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496
  1. /*
  2. * $Id: FOText.java,v 1.43 2003/03/05 21:48:01 jeremias Exp $
  3. * ============================================================================
  4. * The Apache Software License, Version 1.1
  5. * ============================================================================
  6. *
  7. * Copyright (C) 1999-2003 The Apache Software Foundation. All rights reserved.
  8. *
  9. * Redistribution and use in source and binary forms, with or without modifica-
  10. * tion, are permitted provided that the following conditions are met:
  11. *
  12. * 1. Redistributions of source code must retain the above copyright notice,
  13. * this list of conditions and the following disclaimer.
  14. *
  15. * 2. Redistributions in binary form must reproduce the above copyright notice,
  16. * this list of conditions and the following disclaimer in the documentation
  17. * and/or other materials provided with the distribution.
  18. *
  19. * 3. The end-user documentation included with the redistribution, if any, must
  20. * include the following acknowledgment: "This product includes software
  21. * developed by the Apache Software Foundation (http://www.apache.org/)."
  22. * Alternately, this acknowledgment may appear in the software itself, if
  23. * and wherever such third-party acknowledgments normally appear.
  24. *
  25. * 4. The names "FOP" and "Apache Software Foundation" must not be used to
  26. * endorse or promote products derived from this software without prior
  27. * written permission. For written permission, please contact
  28. * apache@apache.org.
  29. *
  30. * 5. Products derived from this software may not be called "Apache", nor may
  31. * "Apache" appear in their name, without prior written permission of the
  32. * Apache Software Foundation.
  33. *
  34. * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED WARRANTIES,
  35. * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
  36. * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
  37. * APACHE SOFTWARE FOUNDATION OR ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
  38. * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLU-
  39. * DING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
  40. * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
  41. * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  42. * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  43. * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  44. * ============================================================================
  45. *
  46. * This software consists of voluntary contributions made by many individuals
  47. * on behalf of the Apache Software Foundation and was originally created by
  48. * James Tauber <jtauber@jtauber.com>. For more information on the Apache
  49. * Software Foundation, please see <http://www.apache.org/>.
  50. */
  51. package org.apache.fop.fo;
  52. // Java
  53. import java.util.NoSuchElementException;
  54. // FOP
  55. import org.apache.fop.fo.flow.Block;
  56. import org.apache.fop.fo.pagination.Root;
  57. /**
  58. * A text node in the formatting object tree.
  59. *
  60. * Unfortunately the BufferManager implementatation holds
  61. * onto references to the character data in this object
  62. * longer than the lifetime of the object itself, causing
  63. * excessive memory consumption and OOM errors.
  64. *
  65. * @author unascribed
  66. * @author <a href="mailto:mark-fop@inomial.com">Mark Lillywhite</a>
  67. */
  68. public class FOText extends FObj {
  69. /**
  70. * the character array containing the text
  71. */
  72. public char[] ca;
  73. /**
  74. * the length of the character array containing the text
  75. */
  76. public int length;
  77. /**
  78. * The TextInfo object attached to the text
  79. */
  80. public TextInfo textInfo;
  81. /**
  82. * Keeps track of the last FOText object created within the current
  83. * block. This is used to create pointers between such objects.
  84. * TODO: As soon as the control hierarchy is straightened out, this static
  85. * variable needs to become an instance variable in some parent object,
  86. * probably the page-sequence.
  87. */
  88. private static FOText lastFOTextProcessed = null;
  89. /**
  90. * Points to the previous FOText object created within the current
  91. * block. If this is "null", this is the first such object.
  92. */
  93. private FOText prevFOTextThisBlock = null;
  94. /**
  95. * Points to the next FOText object created within the current
  96. * block. If this is "null", this is the last such object.
  97. */
  98. private FOText nextFOTextThisBlock = null;
  99. /**
  100. * Points to the ancestor Block object. This is used to keep track of
  101. * which FOText nodes are descendants of the same block.
  102. */
  103. private Block ancestorBlock = null;
  104. private static final int IS_WORD_CHAR_FALSE = 0;
  105. private static final int IS_WORD_CHAR_TRUE = 1;
  106. private static final int IS_WORD_CHAR_MAYBE = 2;
  107. /**
  108. *
  109. * @param chars array of chars which contains the text in this object (may
  110. * be a superset of the text in this object
  111. * @param start starting index into char[] for the text in this object
  112. * @param end ending index into char[] for the text in this object
  113. * @param ti TextInfo object for the text in this object
  114. * @param parent FONode that is the parent of this object
  115. */
  116. public FOText(char[] chars, int start, int end, TextInfo ti, FONode parent) {
  117. super(parent);
  118. length = end - start;
  119. this.ca = new char[length];
  120. System.arraycopy(chars, start, ca, 0, length);
  121. textInfo = ti;
  122. createBlockPointers();
  123. textTransform();
  124. }
  125. /**
  126. * Check if this text node will create an area.
  127. * This means either there is non-whitespace or it is
  128. * preserved whitespace.
  129. * Maybe this just needs to check length > 0, since char iterators
  130. * handle whitespace.
  131. *
  132. * @return true if this will create an area in the output
  133. */
  134. public boolean willCreateArea() {
  135. if (textInfo.whiteSpaceCollapse == WhiteSpaceCollapse.FALSE
  136. && length > 0) {
  137. return true;
  138. }
  139. for (int i = 0; i < length; i++) {
  140. char ch = ca[i];
  141. if (!((ch == ' ')
  142. || (ch == '\n')
  143. || (ch == '\r')
  144. || (ch == '\t'))) { // whitespace
  145. return true;
  146. }
  147. }
  148. return false;
  149. }
  150. /**
  151. * @return a new TextCharIterator
  152. */
  153. public CharIterator charIterator() {
  154. return new TextCharIterator();
  155. }
  156. private class TextCharIterator extends AbstractCharIterator {
  157. private int curIndex = 0;
  158. public boolean hasNext() {
  159. return (curIndex < length);
  160. }
  161. public char nextChar() {
  162. if (curIndex < length) {
  163. // Just a char class? Don't actually care about the value!
  164. return ca[curIndex++];
  165. } else {
  166. throw new NoSuchElementException();
  167. }
  168. }
  169. public void remove() {
  170. if (curIndex > 0 && curIndex < length) {
  171. // copy from curIndex to end to curIndex-1
  172. System.arraycopy(ca, curIndex, ca, curIndex - 1,
  173. length - curIndex);
  174. length--;
  175. curIndex--;
  176. } else if (curIndex == length) {
  177. curIndex = --length;
  178. }
  179. }
  180. public void replaceChar(char c) {
  181. if (curIndex > 0 && curIndex <= length) {
  182. ca[curIndex - 1] = c;
  183. }
  184. }
  185. }
  186. /**
  187. * This method is run as part of the Constructor, to create xref pointers to
  188. * the previous FOText objects within the same Block
  189. */
  190. private void createBlockPointers() {
  191. // build pointers between the FOText objects withing the same Block
  192. //
  193. // find the ancestorBlock of the current node
  194. FONode ancestorFONode = this;
  195. while (this.ancestorBlock == null) {
  196. ancestorFONode = ancestorFONode.parent;
  197. Class myclass = ancestorFONode.getClass();
  198. if (ancestorFONode instanceof Root) {
  199. getLogger().warn("Unexpected: fo:text with no fo:block ancestor");
  200. }
  201. if (ancestorFONode instanceof Block) {
  202. this.ancestorBlock = (Block)ancestorFONode;
  203. }
  204. }
  205. // if the last FOText is a sibling, point to it, and have it point here
  206. if (lastFOTextProcessed != null) {
  207. if (lastFOTextProcessed.ancestorBlock == this.ancestorBlock) {
  208. prevFOTextThisBlock = lastFOTextProcessed;
  209. prevFOTextThisBlock.nextFOTextThisBlock = this;
  210. } else {
  211. prevFOTextThisBlock = null;
  212. }
  213. }
  214. // save the current node in static field so the next guy knows where
  215. // to look
  216. lastFOTextProcessed = this;
  217. return;
  218. }
  219. /**
  220. * This method is run as part of the Constructor, to handle the
  221. * text-transform property.
  222. */
  223. private void textTransform() {
  224. if (textInfo.textTransform == TextTransform.NONE) {
  225. return;
  226. }
  227. for (int i = 0; i < ca.length; i++) {
  228. ca[i] = charTransform(i);
  229. }
  230. }
  231. /**
  232. * Determines whether a particular location in an FOText object's text is
  233. * the start of a new "word". The use of "word" here is specifically for
  234. * the text-transform property, but may be useful for other things as
  235. * well, such as word-spacing. The definition of "word" is somewhat ambiguous
  236. * and appears to be definable by the user agent.
  237. *
  238. * @param i index into ca[]
  239. *
  240. * @return True if the character at this location is the start of a new
  241. * word.
  242. */
  243. public boolean isStartOfWord (int i) {
  244. char prevChar = getRelativeCharInBlock(i, -1);
  245. /* All we are really concerned about here is of what type prevChar
  246. is. If inputChar is not part of a word, then the Java
  247. conversions will (we hope) simply return inputChar.
  248. */
  249. switch (isWordChar(prevChar)) {
  250. case IS_WORD_CHAR_TRUE:
  251. return false;
  252. case IS_WORD_CHAR_FALSE:
  253. return true;
  254. /* "MAYBE" implies that additional context is needed. An example is a
  255. * single-quote, either straight or closing, which might be interpreted
  256. * as a possessive or a contraction, or might be a closing quote.
  257. */
  258. case IS_WORD_CHAR_MAYBE:
  259. char prevPrevChar = getRelativeCharInBlock(i, -2);
  260. switch (isWordChar(prevPrevChar)) {
  261. case IS_WORD_CHAR_TRUE:
  262. return false;
  263. case IS_WORD_CHAR_FALSE:
  264. return true;
  265. case IS_WORD_CHAR_MAYBE:
  266. return true;
  267. default:
  268. return false;
  269. }
  270. default:
  271. return false;
  272. }
  273. }
  274. /**
  275. * Finds a character within the current Block that is relative in location
  276. * to a character in the current FOText. Treats all FOText objects within a
  277. * block as one unit, allowing text in adjoining FOText objects to be
  278. * returned if the parameters are outside of the current object.
  279. *
  280. * @param i index into ca[]
  281. * @param offset signed integer with relative position within the
  282. * block of the character to return. To return the character immediately
  283. * preceding i, pass -1. To return the character immediately after i,
  284. * pass 1.
  285. * @return the character in the offset position within the block; \u0000 if
  286. * the offset points to an area outside of the block.
  287. */
  288. public char getRelativeCharInBlock(int i, int offset) {
  289. // The easy case is where the desired character is in the same FOText
  290. if (((i + offset) >= 0) && ((i + offset) <= this.length)) {
  291. return ca[i + offset];
  292. }
  293. // For now, we can't look at following FOText nodes
  294. if (offset > 0) {
  295. return '\u0000';
  296. }
  297. // Remaining case has the text in some previous FOText node
  298. boolean foundChar = false;
  299. char charToReturn = '\u0000';
  300. FOText nodeToTest = this;
  301. int remainingOffset = offset + i;
  302. while (!foundChar) {
  303. if (nodeToTest.prevFOTextThisBlock == null) {
  304. foundChar = true;
  305. break;
  306. }
  307. nodeToTest = nodeToTest.prevFOTextThisBlock;
  308. if ((nodeToTest.ca.length + remainingOffset) >= 0) {
  309. charToReturn = nodeToTest.ca[nodeToTest.ca.length + remainingOffset];
  310. foundChar = true;
  311. } else {
  312. remainingOffset = remainingOffset + nodeToTest.ca.length;
  313. }
  314. }
  315. return charToReturn;
  316. }
  317. /**
  318. * @return The previous FOText node in this Block; null, if this is the
  319. * first FOText in this Block.
  320. */
  321. public FOText getPrevFOTextThisBlock () {
  322. return prevFOTextThisBlock;
  323. }
  324. /**
  325. * @return The next FOText node in this Block; null if this is the last
  326. * FOText in this Block; null if subsequent FOText nodes have not yet been
  327. * processed.
  328. */
  329. public FOText getNextFOTextThisBlock () {
  330. return nextFOTextThisBlock;
  331. }
  332. /**
  333. * @return The nearest ancestor block object which contains this FOText.
  334. */
  335. public Block getAncestorBlock () {
  336. return ancestorBlock;
  337. }
  338. /**
  339. * Transforms one character in ca[] using the text-transform property.
  340. *
  341. * @param i the index into ca[]
  342. * @return char with transformed value
  343. */
  344. public char charTransform(int i) {
  345. switch (textInfo.textTransform) {
  346. /* put NONE first, as this is probably the common case */
  347. case TextTransform.NONE:
  348. return ca[i];
  349. case TextTransform.UPPERCASE:
  350. return Character.toUpperCase(ca[i]);
  351. case TextTransform.LOWERCASE:
  352. return Character.toLowerCase(ca[i]);
  353. case TextTransform.CAPITALIZE:
  354. if (isStartOfWord(i)) {
  355. /*
  356. Use toTitleCase here. Apparently, some languages use
  357. a different character to represent a letter when using
  358. initial caps than when all of the letters in the word
  359. are capitalized. We will try to let Java handle this.
  360. */
  361. return Character.toTitleCase(ca[i]);
  362. } else {
  363. return Character.toLowerCase(ca[i]);
  364. }
  365. default:
  366. getLogger().warn("Invalid text-tranform value: "
  367. + textInfo.textTransform);
  368. return ca[i];
  369. }
  370. }
  371. /**
  372. * Determines whether the input char should be considered part of a
  373. * "word". This is used primarily to determine whether the character
  374. * immediately following starts a new word, but may have other uses.
  375. * We have not found a definition of "word" in the standard (1.0), so the
  376. * logic used here is based on the programmer's best guess.
  377. *
  378. * @param inputChar the character to be tested.
  379. * @return int IS_WORD_CHAR_TRUE, IS_WORD_CHAR_FALSE, or IS_WORD_CHAR_MAYBE,
  380. * depending on whether the character should be considered part of a word
  381. * or not.
  382. */
  383. public static int isWordChar(char inputChar) {
  384. switch (Character.getType(inputChar)) {
  385. case Character.COMBINING_SPACING_MARK:
  386. return IS_WORD_CHAR_TRUE;
  387. case Character.CONNECTOR_PUNCTUATION:
  388. return IS_WORD_CHAR_TRUE;
  389. case Character.CONTROL:
  390. return IS_WORD_CHAR_FALSE;
  391. case Character.CURRENCY_SYMBOL:
  392. return IS_WORD_CHAR_TRUE;
  393. case Character.DASH_PUNCTUATION:
  394. if (inputChar == '-') {
  395. return IS_WORD_CHAR_TRUE; //hyphen
  396. }
  397. return IS_WORD_CHAR_FALSE;
  398. case Character.DECIMAL_DIGIT_NUMBER:
  399. return IS_WORD_CHAR_TRUE;
  400. case Character.ENCLOSING_MARK:
  401. return IS_WORD_CHAR_FALSE;
  402. case Character.END_PUNCTUATION:
  403. if (inputChar == '\u2019') {
  404. return IS_WORD_CHAR_MAYBE; //apostrophe, right single quote
  405. }
  406. return IS_WORD_CHAR_FALSE;
  407. case Character.FORMAT:
  408. return IS_WORD_CHAR_FALSE;
  409. case Character.LETTER_NUMBER:
  410. return IS_WORD_CHAR_TRUE;
  411. case Character.LINE_SEPARATOR:
  412. return IS_WORD_CHAR_FALSE;
  413. case Character.LOWERCASE_LETTER:
  414. return IS_WORD_CHAR_TRUE;
  415. case Character.MATH_SYMBOL:
  416. return IS_WORD_CHAR_FALSE;
  417. case Character.MODIFIER_LETTER:
  418. return IS_WORD_CHAR_TRUE;
  419. case Character.MODIFIER_SYMBOL:
  420. return IS_WORD_CHAR_TRUE;
  421. case Character.NON_SPACING_MARK:
  422. return IS_WORD_CHAR_TRUE;
  423. case Character.OTHER_LETTER:
  424. return IS_WORD_CHAR_TRUE;
  425. case Character.OTHER_NUMBER:
  426. return IS_WORD_CHAR_TRUE;
  427. case Character.OTHER_PUNCTUATION:
  428. if (inputChar == '\'') {
  429. return IS_WORD_CHAR_MAYBE; //ASCII apostrophe
  430. }
  431. return IS_WORD_CHAR_FALSE;
  432. case Character.OTHER_SYMBOL:
  433. return IS_WORD_CHAR_TRUE;
  434. case Character.PARAGRAPH_SEPARATOR:
  435. return IS_WORD_CHAR_FALSE;
  436. case Character.PRIVATE_USE:
  437. return IS_WORD_CHAR_FALSE;
  438. case Character.SPACE_SEPARATOR:
  439. return IS_WORD_CHAR_FALSE;
  440. case Character.START_PUNCTUATION:
  441. return IS_WORD_CHAR_FALSE;
  442. case Character.SURROGATE:
  443. return IS_WORD_CHAR_FALSE;
  444. case Character.TITLECASE_LETTER:
  445. return IS_WORD_CHAR_TRUE;
  446. case Character.UNASSIGNED:
  447. return IS_WORD_CHAR_FALSE;
  448. case Character.UPPERCASE_LETTER:
  449. return IS_WORD_CHAR_TRUE;
  450. default:
  451. return IS_WORD_CHAR_FALSE;
  452. }
  453. }
  454. /**
  455. * This is a hook for an FOTreeVisitor subclass to be able to access
  456. * this object.
  457. * @param fotv the FOTreeVisitor subclass that can access this object.
  458. * @see org.apache.fop.fo.FOTreeVisitor
  459. */
  460. public void acceptVisitor(FOTreeVisitor fotv) {
  461. fotv.serveFOText(this);
  462. }
  463. }