diff options
Diffstat (limited to 'server/src/org/jsoup/nodes/Entities.java')
-rw-r--r-- | server/src/org/jsoup/nodes/Entities.java | 111 |
1 files changed, 72 insertions, 39 deletions
diff --git a/server/src/org/jsoup/nodes/Entities.java b/server/src/org/jsoup/nodes/Entities.java index 0ae83e1fc0..24b50d7344 100644 --- a/server/src/org/jsoup/nodes/Entities.java +++ b/server/src/org/jsoup/nodes/Entities.java @@ -3,18 +3,24 @@ package org.jsoup.nodes; import java.io.IOException; import java.io.InputStream; import java.nio.charset.CharsetEncoder; -import java.util.*; +import java.util.HashMap; +import java.util.Map; +import java.util.MissingResourceException; +import java.util.Properties; import java.util.regex.Matcher; import java.util.regex.Pattern; /** - * HTML entities, and escape routines. - * Source: <a href="http://www.w3.org/TR/html5/named-character-references.html#named-character-references">W3C HTML - * named character references</a>. + * HTML entities, and escape routines. Source: <a href= + * "http://www.w3.org/TR/html5/named-character-references.html#named-character-references" + * >W3C HTML named character references</a>. */ public class Entities { public enum EscapeMode { - /** Restricted entities suitable for XHTML output: lt, gt, amp, apos, and quot only. */ + /** + * Restricted entities suitable for XHTML output: lt, gt, amp, apos, and + * quot only. + */ xhtml(xhtmlByVal), /** Default HTML output entities. */ base(baseByVal), @@ -36,21 +42,26 @@ public class Entities { private static final Map<Character, String> xhtmlByVal; private static final Map<Character, String> baseByVal; private static final Map<Character, String> fullByVal; - private static final Pattern unescapePattern = Pattern.compile("&(#(x|X)?([0-9a-fA-F]+)|[a-zA-Z]+\\d*);?"); - private static final Pattern strictUnescapePattern = Pattern.compile("&(#(x|X)?([0-9a-fA-F]+)|[a-zA-Z]+\\d*);"); + private static final Pattern unescapePattern = Pattern + .compile("&(#(x|X)?([0-9a-fA-F]+)|[a-zA-Z]+\\d*);?"); + private static final Pattern strictUnescapePattern = Pattern + .compile("&(#(x|X)?([0-9a-fA-F]+)|[a-zA-Z]+\\d*);"); - private Entities() {} + private Entities() { + } /** * Check if the input is a known named entity - * @param name the possible entity name (e.g. "lt" or "amp" + * + * @param name + * the possible entity name (e.g. "lt" or "amp" * @return true if a known named entity */ public static boolean isNamedEntity(String name) { return full.containsKey(name); } - /** +/** * Get the Character value of the named entity * @param name named entity (e.g. "lt" or "amp") * @return the Character value of the named entity (e.g. '<' or '&') @@ -58,23 +69,25 @@ public class Entities { public static Character getCharacterByName(String name) { return full.get(name); } - + static String escape(String string, Document.OutputSettings out) { return escape(string, out.encoder(), out.escapeMode()); } - static String escape(String string, CharsetEncoder encoder, EscapeMode escapeMode) { + static String escape(String string, CharsetEncoder encoder, + EscapeMode escapeMode) { StringBuilder accum = new StringBuilder(string.length() * 2); Map<Character, String> map = escapeMode.getMap(); for (int pos = 0; pos < string.length(); pos++) { Character c = string.charAt(pos); - if (map.containsKey(c)) + if (map.containsKey(c)) { accum.append('&').append(map.get(c)).append(';'); - else if (encoder.canEncode(c)) + } else if (encoder.canEncode(c)) { accum.append(c.charValue()); - else + } else { accum.append("&#").append((int) c).append(';'); + } } return accum.toString(); @@ -86,39 +99,53 @@ public class Entities { /** * Unescape the input string. + * * @param string - * @param strict if "strict" (that is, requires trailing ';' char, otherwise that's optional) + * @param strict + * if "strict" (that is, requires trailing ';' char, otherwise + * that's optional) * @return */ static String unescape(String string, boolean strict) { // todo: change this method to use Tokeniser.consumeCharacterReference - if (!string.contains("&")) + if (!string.contains("&")) { return string; + } - Matcher m = strict? strictUnescapePattern.matcher(string) : unescapePattern.matcher(string); // &(#(x|X)?([0-9a-fA-F]+)|[a-zA-Z]\\d*);? - StringBuffer accum = new StringBuffer(string.length()); // pity matcher can't use stringbuilder, avoid syncs - // todo: replace m.appendReplacement with own impl, so StringBuilder and quoteReplacement not required + Matcher m = strict ? strictUnescapePattern.matcher(string) + : unescapePattern.matcher(string); // &(#(x|X)?([0-9a-fA-F]+)|[a-zA-Z]\\d*);? + StringBuffer accum = new StringBuffer(string.length()); // pity matcher + // can't use + // stringbuilder, + // avoid syncs + // todo: replace m.appendReplacement with own impl, so StringBuilder and + // quoteReplacement not required while (m.find()) { int charval = -1; String num = m.group(3); if (num != null) { try { - int base = m.group(2) != null ? 16 : 10; // 2 is hex indicator + int base = m.group(2) != null ? 16 : 10; // 2 is hex + // indicator charval = Integer.valueOf(num, base); } catch (NumberFormatException e) { } // skip } else { String name = m.group(1); - if (full.containsKey(name)) + if (full.containsKey(name)) { charval = full.get(name); + } } if (charval != -1 || charval > 0xFFFF) { // out of range String c = Character.toString((char) charval); m.appendReplacement(accum, Matcher.quoteReplacement(c)); } else { - m.appendReplacement(accum, Matcher.quoteReplacement(m.group(0))); // replace with original string + m.appendReplacement(accum, Matcher.quoteReplacement(m.group(0))); // replace + // with + // original + // string } } m.appendTail(accum); @@ -126,22 +153,23 @@ public class Entities { } // xhtml has restricted entities - private static final Object[][] xhtmlArray = { - {"quot", 0x00022}, - {"amp", 0x00026}, - {"apos", 0x00027}, - {"lt", 0x0003C}, - {"gt", 0x0003E} - }; + private static final Object[][] xhtmlArray = { { "quot", 0x00022 }, + { "amp", 0x00026 }, { "apos", 0x00027 }, { "lt", 0x0003C }, + { "gt", 0x0003E } }; static { xhtmlByVal = new HashMap<Character, String>(); - baseByVal = toCharacterKey(loadEntities("entities-base.properties")); // most common / default - full = loadEntities("entities-full.properties"); // extended and overblown. + baseByVal = toCharacterKey(loadEntities("entities-base.properties")); // most + // common + // / + // default + full = loadEntities("entities-full.properties"); // extended and + // overblown. fullByVal = toCharacterKey(full); for (Object[] entity : xhtmlArray) { - Character c = Character.valueOf((char) ((Integer) entity[1]).intValue()); + Character c = Character.valueOf((char) ((Integer) entity[1]) + .intValue()); xhtmlByVal.put(c, ((String) entity[0])); } } @@ -154,27 +182,32 @@ public class Entities { properties.load(in); in.close(); } catch (IOException e) { - throw new MissingResourceException("Error loading entities resource: " + e.getMessage(), "Entities", filename); + throw new MissingResourceException( + "Error loading entities resource: " + e.getMessage(), + "Entities", filename); } - for (Map.Entry entry: properties.entrySet()) { - Character val = Character.valueOf((char) Integer.parseInt((String) entry.getValue(), 16)); + for (Map.Entry entry : properties.entrySet()) { + Character val = Character.valueOf((char) Integer.parseInt( + (String) entry.getValue(), 16)); String name = (String) entry.getKey(); entities.put(name, val); } return entities; } - private static Map<Character, String> toCharacterKey(Map<String, Character> inMap) { + private static Map<Character, String> toCharacterKey( + Map<String, Character> inMap) { Map<Character, String> outMap = new HashMap<Character, String>(); - for (Map.Entry<String, Character> entry: inMap.entrySet()) { + for (Map.Entry<String, Character> entry : inMap.entrySet()) { Character character = entry.getValue(); String name = entry.getKey(); if (outMap.containsKey(character)) { // dupe, prefer the lower case version - if (name.toLowerCase().equals(name)) + if (name.toLowerCase().equals(name)) { outMap.put(character, name); + } } else { outMap.put(character, name); } |