summaryrefslogtreecommitdiffstats
path: root/vendor/gems/ruby-openid-2.1.4/lib/openid/yadis/htmltokenizer.rb
diff options
context:
space:
mode:
Diffstat (limited to 'vendor/gems/ruby-openid-2.1.4/lib/openid/yadis/htmltokenizer.rb')
-rw-r--r--vendor/gems/ruby-openid-2.1.4/lib/openid/yadis/htmltokenizer.rb305
1 files changed, 305 insertions, 0 deletions
diff --git a/vendor/gems/ruby-openid-2.1.4/lib/openid/yadis/htmltokenizer.rb b/vendor/gems/ruby-openid-2.1.4/lib/openid/yadis/htmltokenizer.rb
new file mode 100644
index 000000000..b20810975
--- /dev/null
+++ b/vendor/gems/ruby-openid-2.1.4/lib/openid/yadis/htmltokenizer.rb
@@ -0,0 +1,305 @@
+# = HTMLTokenizer
+#
+# Author:: Ben Giddings (mailto:bg-rubyforge@infofiend.com)
+# Copyright:: Copyright (c) 2004 Ben Giddings
+# License:: Distributes under the same terms as Ruby
+#
+#
+# This is a partial port of the functionality behind Perl's TokeParser
+# Provided a page it progressively returns tokens from that page
+#
+# $Id: htmltokenizer.rb,v 1.7 2005/06/07 21:05:53 merc Exp $
+
+#
+# A class to tokenize HTML.
+#
+# Example:
+#
+# page = "<HTML>
+# <HEAD>
+# <TITLE>This is the title</TITLE>
+# </HEAD>
+# <!-- Here comes the <a href=\"missing.link\">blah</a>
+# comment body
+# -->
+# <BODY>
+# <H1>This is the header</H1>
+# <P>
+# This is the paragraph, it contains
+# <a href=\"link.html\">links</a>,
+# <img src=\"blah.gif\" optional alt='images
+# are
+# really cool'>. Ok, here is some more text and
+# <A href=\"http://another.link.com/\" target=\"_blank\">another link</A>.
+# </P>
+# </body>
+# </HTML>
+# "
+# toke = HTMLTokenizer.new(page)
+#
+# assert("<h1>" == toke.getTag("h1", "h2", "h3").to_s.downcase)
+# assert(HTMLTag.new("<a href=\"link.html\">") == toke.getTag("IMG", "A"))
+# assert("links" == toke.getTrimmedText)
+# assert(toke.getTag("IMG", "A").attr_hash['optional'])
+# assert("_blank" == toke.getTag("IMG", "A").attr_hash['target'])
+#
+class HTMLTokenizer
+ @@version = 1.0
+
+ # Get version of HTMLTokenizer lib
+ def self.version
+ @@version
+ end
+
+ attr_reader :page
+
+ # Create a new tokenizer, based on the content, used as a string.
+ def initialize(content)
+ @page = content.to_s
+ @cur_pos = 0
+ end
+
+ # Reset the parser, setting the current position back at the stop
+ def reset
+ @cur_pos = 0
+ end
+
+ # Look at the next token, but don't actually grab it
+ def peekNextToken
+ if @cur_pos == @page.length then return nil end
+
+ if ?< == @page[@cur_pos]
+ # Next token is a tag of some kind
+ if '!--' == @page[(@cur_pos + 1), 3]
+ # Token is a comment
+ tag_end = @page.index('-->', (@cur_pos + 1))
+ if tag_end.nil?
+ raise HTMLTokenizerError, "No end found to started comment:\n#{@page[@cur_pos,80]}"
+ end
+ # p @page[@cur_pos .. (tag_end+2)]
+ HTMLComment.new(@page[@cur_pos .. (tag_end + 2)])
+ else
+ # Token is a html tag
+ tag_end = @page.index('>', (@cur_pos + 1))
+ if tag_end.nil?
+ raise HTMLTokenizerError, "No end found to started tag:\n#{@page[@cur_pos,80]}"
+ end
+ # p @page[@cur_pos .. tag_end]
+ HTMLTag.new(@page[@cur_pos .. tag_end])
+ end
+ else
+ # Next token is text
+ text_end = @page.index('<', @cur_pos)
+ text_end = text_end.nil? ? -1 : (text_end - 1)
+ # p @page[@cur_pos .. text_end]
+ HTMLText.new(@page[@cur_pos .. text_end])
+ end
+ end
+
+ # Get the next token, returns an instance of
+ # * HTMLText
+ # * HTMLToken
+ # * HTMLTag
+ def getNextToken
+ token = peekNextToken
+ if token
+ # @page = @page[token.raw.length .. -1]
+ # @page.slice!(0, token.raw.length)
+ @cur_pos += token.raw.length
+ end
+ #p token
+ #print token.raw
+ return token
+ end
+
+ # Get a tag from the specified set of desired tags.
+ # For example:
+ # <tt>foo = toke.getTag("h1", "h2", "h3")</tt>
+ # Will return the next header tag encountered.
+ def getTag(*sought_tags)
+ sought_tags.collect! {|elm| elm.downcase}
+
+ while (tag = getNextToken)
+ if tag.kind_of?(HTMLTag) and
+ (0 == sought_tags.length or sought_tags.include?(tag.tag_name))
+ break
+ end
+ end
+ tag
+ end
+
+ # Get all the text between the current position and the next tag
+ # (if specified) or a specific later tag
+ def getText(until_tag = nil)
+ if until_tag.nil?
+ if ?< == @page[@cur_pos]
+ # Next token is a tag, not text
+ ""
+ else
+ # Next token is text
+ getNextToken.text
+ end
+ else
+ ret_str = ""
+
+ while (tag = peekNextToken)
+ if tag.kind_of?(HTMLTag) and tag.tag_name == until_tag
+ break
+ end
+
+ if ("" != tag.text)
+ ret_str << (tag.text + " ")
+ end
+ getNextToken
+ end
+
+ ret_str
+ end
+ end
+
+ # Like getText, but squeeze all whitespace, getting rid of
+ # leading and trailing whitespace, and squeezing multiple
+ # spaces into a single space.
+ def getTrimmedText(until_tag = nil)
+ getText(until_tag).strip.gsub(/\s+/m, " ")
+ end
+
+end
+
+class HTMLTokenizerError < Exception
+end
+
+# The parent class for all three types of HTML tokens
+class HTMLToken
+ attr_accessor :raw
+
+ # Initialize the token based on the raw text
+ def initialize(text)
+ @raw = text
+ end
+
+ # By default, return exactly the string used to create the text
+ def to_s
+ raw
+ end
+
+ # By default tokens have no text representation
+ def text
+ ""
+ end
+
+ def trimmed_text
+ text.strip.gsub(/\s+/m, " ")
+ end
+
+ # Compare to another based on the raw source
+ def ==(other)
+ raw == other.to_s
+ end
+end
+
+# Class representing text that isn't inside a tag
+class HTMLText < HTMLToken
+ def text
+ raw
+ end
+end
+
+# Class representing an HTML comment
+class HTMLComment < HTMLToken
+ attr_accessor :contents
+ def initialize(text)
+ super(text)
+ temp_arr = text.scan(/^<!--\s*(.*?)\s*-->$/m)
+ if temp_arr[0].nil?
+ raise HTMLTokenizerError, "Text passed to HTMLComment.initialize is not a comment"
+ end
+
+ @contents = temp_arr[0][0]
+ end
+end
+
+# Class representing an HTML tag
+class HTMLTag < HTMLToken
+ attr_reader :end_tag, :tag_name
+ def initialize(text)
+ super(text)
+ if ?< != text[0] or ?> != text[-1]
+ raise HTMLTokenizerError, "Text passed to HTMLComment.initialize is not a comment"
+ end
+
+ @attr_hash = Hash.new
+ @raw = text
+
+ tag_name = text.scan(/[\w:-]+/)[0]
+ if tag_name.nil?
+ raise HTMLTokenizerError, "Error, tag is nil: #{tag_name}"
+ end
+
+ if ?/ == text[1]
+ # It's an end tag
+ @end_tag = true
+ @tag_name = '/' + tag_name.downcase
+ else
+ @end_tag = false
+ @tag_name = tag_name.downcase
+ end
+
+ @hashed = false
+ end
+
+ # Retrieve a hash of all the tag's attributes.
+ # Lazily done, so that if you don't look at a tag's attributes
+ # things go quicker
+ def attr_hash
+ # Lazy initialize == don't build the hash until it's needed
+ if !@hashed
+ if !@end_tag
+ # Get the attributes
+ attr_arr = @raw.scan(/<[\w:-]+\s+(.*?)\/?>/m)[0]
+ if attr_arr.kind_of?(Array)
+ # Attributes found, parse them
+ attrs = attr_arr[0]
+ attr_arr = attrs.scan(/\s*([\w:-]+)(?:\s*=\s*("[^"]*"|'[^']*'|([^"'>][^\s>]*)))?/m)
+ # clean up the array by:
+ # * setting all nil elements to true
+ # * removing enclosing quotes
+ attr_arr.each {
+ |item|
+ val = if item[1].nil?
+ item[0]
+ elsif '"'[0] == item[1][0] or '\''[0] == item[1][0]
+ item[1][1 .. -2]
+ else
+ item[1]
+ end
+ @attr_hash[item[0].downcase] = val
+ }
+ end
+ end
+ @hashed = true
+ end
+
+ #p self
+
+ @attr_hash
+ end
+
+ # Get the 'alt' text for a tag, if it exists, or an empty string otherwise
+ def text
+ if !end_tag
+ case tag_name
+ when 'img'
+ if !attr_hash['alt'].nil?
+ return attr_hash['alt']
+ end
+ when 'applet'
+ if !attr_hash['alt'].nil?
+ return attr_hash['alt']
+ end
+ end
+ end
+ return ''
+ end
+end
+