package org.jsoup.nodes;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.CharsetEncoder;
import java.util.HashMap;
import java.util.Map;
import java.util.MissingResourceException;
import java.util.Properties;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* HTML entities, and escape routines. Source: W3C HTML named character references.
*/
public class Entities {
public enum EscapeMode {
/**
* Restricted entities suitable for XHTML output: lt, gt, amp, apos, and
* quot only.
*/
xhtml(xhtmlByVal),
/** Default HTML output entities. */
base(baseByVal),
/** Complete HTML entities. */
extended(fullByVal);
private Map map;
EscapeMode(Map map) {
this.map = map;
}
public Map getMap() {
return map;
}
}
private static final Map full;
private static final Map xhtmlByVal;
private static final Map baseByVal;
private static final Map fullByVal;
private static final Pattern unescapePattern = Pattern
.compile("&(#(x|X)?([0-9a-fA-F]+)|[a-zA-Z]+\\d*);?");
private static final Pattern strictUnescapePattern = Pattern
.compile("&(#(x|X)?([0-9a-fA-F]+)|[a-zA-Z]+\\d*);");
private Entities() {
}
/**
* Check if the input is a known named entity
*
* @param name
* the possible entity name (e.g. "lt" or "amp"
* @return true if a known named entity
*/
public static boolean isNamedEntity(String name) {
return full.containsKey(name);
}
/**
* Get the Character value of the named entity
* @param name named entity (e.g. "lt" or "amp")
* @return the Character value of the named entity (e.g. '<' or '&')
*/
public static Character getCharacterByName(String name) {
return full.get(name);
}
static String escape(String string, Document.OutputSettings out) {
return escape(string, out.encoder(), out.escapeMode());
}
static String escape(String string, CharsetEncoder encoder,
EscapeMode escapeMode) {
StringBuilder accum = new StringBuilder(string.length() * 2);
Map map = escapeMode.getMap();
for (int pos = 0; pos < string.length(); pos++) {
Character c = string.charAt(pos);
if (map.containsKey(c)) {
accum.append('&').append(map.get(c)).append(';');
} else if (encoder.canEncode(c)) {
accum.append(c.charValue());
} else {
accum.append("").append((int) c).append(';');
}
}
return accum.toString();
}
static String unescape(String string) {
return unescape(string, false);
}
/**
* Unescape the input string.
*
* @param string
* @param strict
* if "strict" (that is, requires trailing ';' char, otherwise
* that's optional)
* @return
*/
static String unescape(String string, boolean strict) {
// todo: change this method to use Tokeniser.consumeCharacterReference
if (!string.contains("&")) {
return string;
}
Matcher m = strict ? strictUnescapePattern.matcher(string)
: unescapePattern.matcher(string); // &(#(x|X)?([0-9a-fA-F]+)|[a-zA-Z]\\d*);?
StringBuffer accum = new StringBuffer(string.length()); // pity matcher
// can't use
// stringbuilder,
// avoid syncs
// todo: replace m.appendReplacement with own impl, so StringBuilder and
// quoteReplacement not required
while (m.find()) {
int charval = -1;
String num = m.group(3);
if (num != null) {
try {
int base = m.group(2) != null ? 16 : 10; // 2 is hex
// indicator
charval = Integer.valueOf(num, base);
} catch (NumberFormatException e) {
} // skip
} else {
String name = m.group(1);
if (full.containsKey(name)) {
charval = full.get(name);
}
}
if (charval != -1 || charval > 0xFFFF) { // out of range
String c = Character.toString((char) charval);
m.appendReplacement(accum, Matcher.quoteReplacement(c));
} else {
m.appendReplacement(accum, Matcher.quoteReplacement(m.group(0))); // replace
// with
// original
// string
}
}
m.appendTail(accum);
return accum.toString();
}
// xhtml has restricted entities
private static final Object[][] xhtmlArray = { { "quot", 0x00022 },
{ "amp", 0x00026 }, { "apos", 0x00027 }, { "lt", 0x0003C },
{ "gt", 0x0003E } };
static {
xhtmlByVal = new HashMap();
baseByVal = toCharacterKey(loadEntities("entities-base.properties")); // most
// common
// /
// default
full = loadEntities("entities-full.properties"); // extended and
// overblown.
fullByVal = toCharacterKey(full);
for (Object[] entity : xhtmlArray) {
Character c = Character.valueOf((char) ((Integer) entity[1])
.intValue());
xhtmlByVal.put(c, ((String) entity[0]));
}
}
private static Map loadEntities(String filename) {
Properties properties = new Properties();
Map entities = new HashMap();
try {
InputStream in = Entities.class.getResourceAsStream(filename);
properties.load(in);
in.close();
} catch (IOException e) {
throw new MissingResourceException(
"Error loading entities resource: " + e.getMessage(),
"Entities", filename);
}
for (Map.Entry entry : properties.entrySet()) {
Character val = Character.valueOf((char) Integer.parseInt(
(String) entry.getValue(), 16));
String name = (String) entry.getKey();
entities.put(name, val);
}
return entities;
}
private static Map toCharacterKey(
Map inMap) {
Map outMap = new HashMap();
for (Map.Entry entry : inMap.entrySet()) {
Character character = entry.getValue();
String name = entry.getKey();
if (outMap.containsKey(character)) {
// dupe, prefer the lower case version
if (name.toLowerCase().equals(name)) {
outMap.put(character, name);
}
} else {
outMap.put(character, name);
}
}
return outMap;
}
}