diff options
Diffstat (limited to 'server/src/org/jsoup/examples/HtmlToPlainText.java')
-rw-r--r-- | server/src/org/jsoup/examples/HtmlToPlainText.java | 128 |
1 files changed, 0 insertions, 128 deletions
diff --git a/server/src/org/jsoup/examples/HtmlToPlainText.java b/server/src/org/jsoup/examples/HtmlToPlainText.java deleted file mode 100644 index 53e485be34..0000000000 --- a/server/src/org/jsoup/examples/HtmlToPlainText.java +++ /dev/null @@ -1,128 +0,0 @@ -package org.jsoup.examples; - -import java.io.IOException; - -import org.jsoup.Jsoup; -import org.jsoup.helper.StringUtil; -import org.jsoup.helper.Validate; -import org.jsoup.nodes.Document; -import org.jsoup.nodes.Element; -import org.jsoup.nodes.Node; -import org.jsoup.nodes.TextNode; -import org.jsoup.select.NodeTraversor; -import org.jsoup.select.NodeVisitor; - -/** - * HTML to plain-text. This example program demonstrates the use of jsoup to - * convert HTML input to lightly-formatted plain-text. That is divergent from - * the general goal of jsoup's .text() methods, which is to get clean data from - * a scrape. - * <p/> - * Note that this is a fairly simplistic formatter -- for real world use you'll - * want to embrace and extend. - * - * @author Jonathan Hedley, jonathan@hedley.net - */ -public class HtmlToPlainText { - public static void main(String... args) throws IOException { - Validate.isTrue(args.length == 1, "usage: supply url to fetch"); - String url = args[0]; - - // fetch the specified URL and parse to a HTML DOM - Document doc = Jsoup.connect(url).get(); - - HtmlToPlainText formatter = new HtmlToPlainText(); - String plainText = formatter.getPlainText(doc); - System.out.println(plainText); - } - - /** - * Format an Element to plain-text - * - * @param element - * the root element to format - * @return formatted text - */ - public String getPlainText(Element element) { - FormattingVisitor formatter = new FormattingVisitor(); - NodeTraversor traversor = new NodeTraversor(formatter); - traversor.traverse(element); // walk the DOM, and call .head() and - // .tail() for each node - - return formatter.toString(); - } - - // the formatting rules, implemented in a breadth-first DOM traverse - private class FormattingVisitor implements NodeVisitor { - private static final int maxWidth = 80; - private int width = 0; - private StringBuilder accum = new StringBuilder(); // holds the - // accumulated text - - // hit when the node is first seen - @Override - public void head(Node node, int depth) { - String name = node.nodeName(); - if (node instanceof TextNode) { - append(((TextNode) node).text()); // TextNodes carry all - // user-readable text in the - // DOM. - } else if (name.equals("li")) { - append("\n * "); - } - } - - // hit when all of the node's children (if any) have been visited - @Override - public void tail(Node node, int depth) { - String name = node.nodeName(); - if (name.equals("br")) { - append("\n"); - } else if (StringUtil.in(name, "p", "h1", "h2", "h3", "h4", "h5")) { - append("\n\n"); - } else if (name.equals("a")) { - append(String.format(" <%s>", node.absUrl("href"))); - } - } - - // appends text to the string builder with a simple word wrap method - private void append(String text) { - if (text.startsWith("\n")) { - width = 0; // reset counter if starts with a newline. only from - // formats above, not in natural text - } - if (text.equals(" ") - && (accum.length() == 0 || StringUtil.in( - accum.substring(accum.length() - 1), " ", "\n"))) { - return; // don't accumulate long runs of empty spaces - } - - if (text.length() + width > maxWidth) { // won't fit, needs to wrap - String words[] = text.split("\\s+"); - for (int i = 0; i < words.length; i++) { - String word = words[i]; - boolean last = i == words.length - 1; - if (!last) { - word = word + " "; - } - if (word.length() + width > maxWidth) { // wrap and reset - // counter - accum.append("\n").append(word); - width = word.length(); - } else { - accum.append(word); - width += word.length(); - } - } - } else { // fits as is, without need to wrap text - accum.append(text); - width += text.length(); - } - } - - @Override - public String toString() { - return accum.toString(); - } - } -} |