package org.jsoup.nodes; import java.io.IOException; import java.io.InputStream; import java.nio.charset.CharsetEncoder; import java.util.*; import java.util.regex.Matcher; import java.util.regex.Pattern; /** * HTML entities, and escape routines. * Source: W3C HTML * named character references. */ public class Entities { public enum EscapeMode { /** Restricted entities suitable for XHTML output: lt, gt, amp, apos, and quot only. */ xhtml(xhtmlByVal), /** Default HTML output entities. */ base(baseByVal), /** Complete HTML entities. */ extended(fullByVal); private Map map; EscapeMode(Map map) { this.map = map; } public Map getMap() { return map; } } private static final Map full; private static final Map xhtmlByVal; private static final Map baseByVal; private static final Map fullByVal; private static final Pattern unescapePattern = Pattern.compile("&(#(x|X)?([0-9a-fA-F]+)|[a-zA-Z]+\\d*);?"); private static final Pattern strictUnescapePattern = Pattern.compile("&(#(x|X)?([0-9a-fA-F]+)|[a-zA-Z]+\\d*);"); private Entities() {} /** * Check if the input is a known named entity * @param name the possible entity name (e.g. "lt" or "amp" * @return true if a known named entity */ public static boolean isNamedEntity(String name) { return full.containsKey(name); } /** * Get the Character value of the named entity * @param name named entity (e.g. "lt" or "amp") * @return the Character value of the named entity (e.g. '<' or '&') */ public static Character getCharacterByName(String name) { return full.get(name); } static String escape(String string, Document.OutputSettings out) { return escape(string, out.encoder(), out.escapeMode()); } static String escape(String string, CharsetEncoder encoder, EscapeMode escapeMode) { StringBuilder accum = new StringBuilder(string.length() * 2); Map map = escapeMode.getMap(); for (int pos = 0; pos < string.length(); pos++) { Character c = string.charAt(pos); if (map.containsKey(c)) accum.append('&').append(map.get(c)).append(';'); else if (encoder.canEncode(c)) accum.append(c.charValue()); else accum.append("&#").append((int) c).append(';'); } return accum.toString(); } static String unescape(String string) { return unescape(string, false); } /** * Unescape the input string. * @param string * @param strict if "strict" (that is, requires trailing ';' char, otherwise that's optional) * @return */ static String unescape(String string, boolean strict) { // todo: change this method to use Tokeniser.consumeCharacterReference if (!string.contains("&")) return string; Matcher m = strict? strictUnescapePattern.matcher(string) : unescapePattern.matcher(string); // &(#(x|X)?([0-9a-fA-F]+)|[a-zA-Z]\\d*);? StringBuffer accum = new StringBuffer(string.length()); // pity matcher can't use stringbuilder, avoid syncs // todo: replace m.appendReplacement with own impl, so StringBuilder and quoteReplacement not required while (m.find()) { int charval = -1; String num = m.group(3); if (num != null) { try { int base = m.group(2) != null ? 16 : 10; // 2 is hex indicator charval = Integer.valueOf(num, base); } catch (NumberFormatException e) { } // skip } else { String name = m.group(1); if (full.containsKey(name)) charval = full.get(name); } if (charval != -1 || charval > 0xFFFF) { // out of range String c = Character.toString((char) charval); m.appendReplacement(accum, Matcher.quoteReplacement(c)); } else { m.appendReplacement(accum, Matcher.quoteReplacement(m.group(0))); // replace with original string } } m.appendTail(accum); return accum.toString(); } // xhtml has restricted entities private static final Object[][] xhtmlArray = { {"quot", 0x00022}, {"amp", 0x00026}, {"apos", 0x00027}, {"lt", 0x0003C}, {"gt", 0x0003E} }; static { xhtmlByVal = new HashMap(); baseByVal = toCharacterKey(loadEntities("entities-base.properties")); // most common / default full = loadEntities("entities-full.properties"); // extended and overblown. fullByVal = toCharacterKey(full); for (Object[] entity : xhtmlArray) { Character c = Character.valueOf((char) ((Integer) entity[1]).intValue()); xhtmlByVal.put(c, ((String) entity[0])); } } private static Map loadEntities(String filename) { Properties properties = new Properties(); Map entities = new HashMap(); try { InputStream in = Entities.class.getResourceAsStream(filename); properties.load(in); in.close(); } catch (IOException e) { throw new MissingResourceException("Error loading entities resource: " + e.getMessage(), "Entities", filename); } for (Map.Entry entry: properties.entrySet()) { Character val = Character.valueOf((char) Integer.parseInt((String) entry.getValue(), 16)); String name = (String) entry.getKey(); entities.put(name, val); } return entities; } private static Map toCharacterKey(Map inMap) { Map outMap = new HashMap(); for (Map.Entry entry: inMap.entrySet()) { Character character = entry.getValue(); String name = entry.getKey(); if (outMap.containsKey(character)) { // dupe, prefer the lower case version if (name.toLowerCase().equals(name)) outMap.put(character, name); } else { outMap.put(character, name); } } return outMap; } }