summary | refs | log | tree | commit | diff | stats
path: root/server/src/org/jsoup/nodes/Entities.java
diff options
context:
space:
mode:
Diffstat (limited to 'server/src/org/jsoup/nodes/Entities.java')
-rw-r--r-- server/src/org/jsoup/nodes/Entities.java 184
1 files changed, 184 insertions, 0 deletions
diff --git a/server/src/org/jsoup/nodes/Entities.java b/server/src/org/jsoup/nodes/Entities.java
new file mode 100644
index 0000000000..0ae83e1fc0
--- /dev/null
+++ b/server/src/org/jsoup/nodes/Entities.java
@@ -0,0 +1,184 @@
+package org.jsoup.nodes;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.charset.CharsetEncoder;
+import java.util.*;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * HTML entities, and escape routines.
+ * Source: <a href="http://www.w3.org/TR/html5/named-character-references.html#named-character-references">W3C HTML
+ * named character references</a>.
+ */
+public class Entities {
+ public enum EscapeMode {
+ /** Restricted entities suitable for XHTML output: lt, gt, amp, apos, and quot only. */
+ xhtml(xhtmlByVal),
+ /** Default HTML output entities. */
+ base(baseByVal),
+ /** Complete HTML entities. */
+ extended(fullByVal);
+
+ private Map<Character, String> map;
+
+ EscapeMode(Map<Character, String> map) {
+ this.map = map;
+ }
+
+ public Map<Character, String> getMap() {
+ return map;
+ }
+ }
+
+ private static final Map<String, Character> full;
+ private static final Map<Character, String> xhtmlByVal;
+ private static final Map<Character, String> baseByVal;
+ private static final Map<Character, String> fullByVal;
+ private static final Pattern unescapePattern = Pattern.compile("&(#(x|X)?([0-9a-fA-F]+)|[a-zA-Z]+\\d*);?");
+ private static final Pattern strictUnescapePattern = Pattern.compile("&(#(x|X)?([0-9a-fA-F]+)|[a-zA-Z]+\\d*);");
+
+ private Entities() {}
+
+ /**
+ * Check if the input is a known named entity
+ * @param name the possible entity name (e.g. "lt" or "amp"
+ * @return true if a known named entity
+ */
+ public static boolean isNamedEntity(String name) {
+ return full.containsKey(name);
+ }
+
+ /**
+ * Get the Character value of the named entity
+ * @param name named entity (e.g. "lt" or "amp")
+ * @return the Character value of the named entity (e.g. '<' or '&')
+ */
+ public static Character getCharacterByName(String name) {
+ return full.get(name);
+ }
+
+ static String escape(String string, Document.OutputSettings out) {
+ return escape(string, out.encoder(), out.escapeMode());
+ }
+
+ static String escape(String string, CharsetEncoder encoder, EscapeMode escapeMode) {
+ StringBuilder accum = new StringBuilder(string.length() * 2);
+ Map<Character, String> map = escapeMode.getMap();
+
+ for (int pos = 0; pos < string.length(); pos++) {
+ Character c = string.charAt(pos);
+ if (map.containsKey(c))
+ accum.append('&').append(map.get(c)).append(';');
+ else if (encoder.canEncode(c))
+ accum.append(c.charValue());
+ else
+ accum.append("&#").append((int) c).append(';');
+ }
+
+ return accum.toString();
+ }
+
+ static String unescape(String string) {
+ return unescape(string, false);
+ }
+
+ /**
+ * Unescape the input string.
+ * @param string
+ * @param strict if "strict" (that is, requires trailing ';' char, otherwise that's optional)
+ * @return
+ */
+ static String unescape(String string, boolean strict) {
+ // todo: change this method to use Tokeniser.consumeCharacterReference
+ if (!string.contains("&"))
+ return string;
+
+ Matcher m = strict? strictUnescapePattern.matcher(string) : unescapePattern.matcher(string); // &(#(x|X)?([0-9a-fA-F]+)|[a-zA-Z]\\d*);?
+ StringBuffer accum = new StringBuffer(string.length()); // pity matcher can't use stringbuilder, avoid syncs
+ // todo: replace m.appendReplacement with own impl, so StringBuilder and quoteReplacement not required
+
+ while (m.find()) {
+ int charval = -1;
+ String num = m.group(3);
+ if (num != null) {
+ try {
+ int base = m.group(2) != null ? 16 : 10; // 2 is hex indicator
+ charval = Integer.valueOf(num, base);
+ } catch (NumberFormatException e) {
+ } // skip
+ } else {
+ String name = m.group(1);
+ if (full.containsKey(name))
+ charval = full.get(name);
+ }
+
+ if (charval != -1 || charval > 0xFFFF) { // out of range
+ String c = Character.toString((char) charval);
+ m.appendReplacement(accum, Matcher.quoteReplacement(c));
+ } else {
+ m.appendReplacement(accum, Matcher.quoteReplacement(m.group(0))); // replace with original string
+ }
+ }
+ m.appendTail(accum);
+ return accum.toString();
+ }
+
+ // xhtml has restricted entities
+ private static final Object[][] xhtmlArray = {
+ {"quot", 0x00022},
+ {"amp", 0x00026},
+ {"apos", 0x00027},
+ {"lt", 0x0003C},
+ {"gt", 0x0003E}
+ };
+
+ static {
+ xhtmlByVal = new HashMap<Character, String>();
+ baseByVal = toCharacterKey(loadEntities("entities-base.properties")); // most common / default
+ full = loadEntities("entities-full.properties"); // extended and overblown.
+ fullByVal = toCharacterKey(full);
+
+ for (Object[] entity : xhtmlArray) {
+ Character c = Character.valueOf((char) ((Integer) entity[1]).intValue());
+ xhtmlByVal.put(c, ((String) entity[0]));
+ }
+ }
+
+ private static Map<String, Character> loadEntities(String filename) {
+ Properties properties = new Properties();
+ Map<String, Character> entities = new HashMap<String, Character>();
+ try {
+ InputStream in = Entities.class.getResourceAsStream(filename);
+ properties.load(in);
+ in.close();
+ } catch (IOException e) {
+ throw new MissingResourceException("Error loading entities resource: " + e.getMessage(), "Entities", filename);
+ }
+
+ for (Map.Entry entry: properties.entrySet()) {
+ Character val = Character.valueOf((char) Integer.parseInt((String) entry.getValue(), 16));
+ String name = (String) entry.getKey();
+ entities.put(name, val);
+ }
+ return entities;
+ }
+
+ private static Map<Character, String> toCharacterKey(Map<String, Character> inMap) {
+ Map<Character, String> outMap = new HashMap<Character, String>();
+ for (Map.Entry<String, Character> entry: inMap.entrySet()) {
+ Character character = entry.getValue();
+ String name = entry.getKey();
+
+ if (outMap.containsKey(character)) {
+ // dupe, prefer the lower case version
+ if (name.toLowerCase().equals(name))
+ outMap.put(character, name);
+ } else {
+ outMap.put(character, name);
+ }
+ }
+ return outMap;
+ }
+}