diff options
Diffstat (limited to 'server/src/org/jsoup/nodes/Entities.java')
-rw-r--r-- | server/src/org/jsoup/nodes/Entities.java | 184 |
1 files changed, 184 insertions, 0 deletions
diff --git a/server/src/org/jsoup/nodes/Entities.java b/server/src/org/jsoup/nodes/Entities.java new file mode 100644 index 0000000000..0ae83e1fc0 --- /dev/null +++ b/server/src/org/jsoup/nodes/Entities.java @@ -0,0 +1,184 @@ +package org.jsoup.nodes; + +import java.io.IOException; +import java.io.InputStream; +import java.nio.charset.CharsetEncoder; +import java.util.*; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +/** + * HTML entities, and escape routines. + * Source: <a href="http://www.w3.org/TR/html5/named-character-references.html#named-character-references">W3C HTML + * named character references</a>. + */ +public class Entities { + public enum EscapeMode { + /** Restricted entities suitable for XHTML output: lt, gt, amp, apos, and quot only. */ + xhtml(xhtmlByVal), + /** Default HTML output entities. */ + base(baseByVal), + /** Complete HTML entities. */ + extended(fullByVal); + + private Map<Character, String> map; + + EscapeMode(Map<Character, String> map) { + this.map = map; + } + + public Map<Character, String> getMap() { + return map; + } + } + + private static final Map<String, Character> full; + private static final Map<Character, String> xhtmlByVal; + private static final Map<Character, String> baseByVal; + private static final Map<Character, String> fullByVal; + private static final Pattern unescapePattern = Pattern.compile("&(#(x|X)?([0-9a-fA-F]+)|[a-zA-Z]+\\d*);?"); + private static final Pattern strictUnescapePattern = Pattern.compile("&(#(x|X)?([0-9a-fA-F]+)|[a-zA-Z]+\\d*);"); + + private Entities() {} + + /** + * Check if the input is a known named entity + * @param name the possible entity name (e.g. "lt" or "amp" + * @return true if a known named entity + */ + public static boolean isNamedEntity(String name) { + return full.containsKey(name); + } + + /** + * Get the Character value of the named entity + * @param name named entity (e.g. "lt" or "amp") + * @return the Character value of the named entity (e.g. '<' or '&') + */ + public static Character getCharacterByName(String name) { + return full.get(name); + } + + static String escape(String string, Document.OutputSettings out) { + return escape(string, out.encoder(), out.escapeMode()); + } + + static String escape(String string, CharsetEncoder encoder, EscapeMode escapeMode) { + StringBuilder accum = new StringBuilder(string.length() * 2); + Map<Character, String> map = escapeMode.getMap(); + + for (int pos = 0; pos < string.length(); pos++) { + Character c = string.charAt(pos); + if (map.containsKey(c)) + accum.append('&').append(map.get(c)).append(';'); + else if (encoder.canEncode(c)) + accum.append(c.charValue()); + else + accum.append("&#").append((int) c).append(';'); + } + + return accum.toString(); + } + + static String unescape(String string) { + return unescape(string, false); + } + + /** + * Unescape the input string. + * @param string + * @param strict if "strict" (that is, requires trailing ';' char, otherwise that's optional) + * @return + */ + static String unescape(String string, boolean strict) { + // todo: change this method to use Tokeniser.consumeCharacterReference + if (!string.contains("&")) + return string; + + Matcher m = strict? strictUnescapePattern.matcher(string) : unescapePattern.matcher(string); // &(#(x|X)?([0-9a-fA-F]+)|[a-zA-Z]\\d*);? + StringBuffer accum = new StringBuffer(string.length()); // pity matcher can't use stringbuilder, avoid syncs + // todo: replace m.appendReplacement with own impl, so StringBuilder and quoteReplacement not required + + while (m.find()) { + int charval = -1; + String num = m.group(3); + if (num != null) { + try { + int base = m.group(2) != null ? 16 : 10; // 2 is hex indicator + charval = Integer.valueOf(num, base); + } catch (NumberFormatException e) { + } // skip + } else { + String name = m.group(1); + if (full.containsKey(name)) + charval = full.get(name); + } + + if (charval != -1 || charval > 0xFFFF) { // out of range + String c = Character.toString((char) charval); + m.appendReplacement(accum, Matcher.quoteReplacement(c)); + } else { + m.appendReplacement(accum, Matcher.quoteReplacement(m.group(0))); // replace with original string + } + } + m.appendTail(accum); + return accum.toString(); + } + + // xhtml has restricted entities + private static final Object[][] xhtmlArray = { + {"quot", 0x00022}, + {"amp", 0x00026}, + {"apos", 0x00027}, + {"lt", 0x0003C}, + {"gt", 0x0003E} + }; + + static { + xhtmlByVal = new HashMap<Character, String>(); + baseByVal = toCharacterKey(loadEntities("entities-base.properties")); // most common / default + full = loadEntities("entities-full.properties"); // extended and overblown. + fullByVal = toCharacterKey(full); + + for (Object[] entity : xhtmlArray) { + Character c = Character.valueOf((char) ((Integer) entity[1]).intValue()); + xhtmlByVal.put(c, ((String) entity[0])); + } + } + + private static Map<String, Character> loadEntities(String filename) { + Properties properties = new Properties(); + Map<String, Character> entities = new HashMap<String, Character>(); + try { + InputStream in = Entities.class.getResourceAsStream(filename); + properties.load(in); + in.close(); + } catch (IOException e) { + throw new MissingResourceException("Error loading entities resource: " + e.getMessage(), "Entities", filename); + } + + for (Map.Entry entry: properties.entrySet()) { + Character val = Character.valueOf((char) Integer.parseInt((String) entry.getValue(), 16)); + String name = (String) entry.getKey(); + entities.put(name, val); + } + return entities; + } + + private static Map<Character, String> toCharacterKey(Map<String, Character> inMap) { + Map<Character, String> outMap = new HashMap<Character, String>(); + for (Map.Entry<String, Character> entry: inMap.entrySet()) { + Character character = entry.getValue(); + String name = entry.getKey(); + + if (outMap.containsKey(character)) { + // dupe, prefer the lower case version + if (name.toLowerCase().equals(name)) + outMap.put(character, name); + } else { + outMap.put(character, name); + } + } + return outMap; + } +} |