diff options
Diffstat (limited to 'vendor/gems/ruby-openid-2.1.4/lib/openid/yadis/htmltokenizer.rb')
-rw-r--r-- | vendor/gems/ruby-openid-2.1.4/lib/openid/yadis/htmltokenizer.rb | 305 |
1 files changed, 305 insertions, 0 deletions
diff --git a/vendor/gems/ruby-openid-2.1.4/lib/openid/yadis/htmltokenizer.rb b/vendor/gems/ruby-openid-2.1.4/lib/openid/yadis/htmltokenizer.rb new file mode 100644 index 000000000..b20810975 --- /dev/null +++ b/vendor/gems/ruby-openid-2.1.4/lib/openid/yadis/htmltokenizer.rb @@ -0,0 +1,305 @@ +# = HTMLTokenizer +# +# Author:: Ben Giddings (mailto:bg-rubyforge@infofiend.com) +# Copyright:: Copyright (c) 2004 Ben Giddings +# License:: Distributes under the same terms as Ruby +# +# +# This is a partial port of the functionality behind Perl's TokeParser +# Provided a page it progressively returns tokens from that page +# +# $Id: htmltokenizer.rb,v 1.7 2005/06/07 21:05:53 merc Exp $ + +# +# A class to tokenize HTML. +# +# Example: +# +# page = "<HTML> +# <HEAD> +# <TITLE>This is the title</TITLE> +# </HEAD> +# <!-- Here comes the <a href=\"missing.link\">blah</a> +# comment body +# --> +# <BODY> +# <H1>This is the header</H1> +# <P> +# This is the paragraph, it contains +# <a href=\"link.html\">links</a>, +# <img src=\"blah.gif\" optional alt='images +# are +# really cool'>. Ok, here is some more text and +# <A href=\"http://another.link.com/\" target=\"_blank\">another link</A>. +# </P> +# </body> +# </HTML> +# " +# toke = HTMLTokenizer.new(page) +# +# assert("<h1>" == toke.getTag("h1", "h2", "h3").to_s.downcase) +# assert(HTMLTag.new("<a href=\"link.html\">") == toke.getTag("IMG", "A")) +# assert("links" == toke.getTrimmedText) +# assert(toke.getTag("IMG", "A").attr_hash['optional']) +# assert("_blank" == toke.getTag("IMG", "A").attr_hash['target']) +# +class HTMLTokenizer + @@version = 1.0 + + # Get version of HTMLTokenizer lib + def self.version + @@version + end + + attr_reader :page + + # Create a new tokenizer, based on the content, used as a string. + def initialize(content) + @page = content.to_s + @cur_pos = 0 + end + + # Reset the parser, setting the current position back at the stop + def reset + @cur_pos = 0 + end + + # Look at the next token, but don't actually grab it + def peekNextToken + if @cur_pos == @page.length then return nil end + + if ?< == @page[@cur_pos] + # Next token is a tag of some kind + if '!--' == @page[(@cur_pos + 1), 3] + # Token is a comment + tag_end = @page.index('-->', (@cur_pos + 1)) + if tag_end.nil? + raise HTMLTokenizerError, "No end found to started comment:\n#{@page[@cur_pos,80]}" + end + # p @page[@cur_pos .. (tag_end+2)] + HTMLComment.new(@page[@cur_pos .. (tag_end + 2)]) + else + # Token is a html tag + tag_end = @page.index('>', (@cur_pos + 1)) + if tag_end.nil? + raise HTMLTokenizerError, "No end found to started tag:\n#{@page[@cur_pos,80]}" + end + # p @page[@cur_pos .. tag_end] + HTMLTag.new(@page[@cur_pos .. tag_end]) + end + else + # Next token is text + text_end = @page.index('<', @cur_pos) + text_end = text_end.nil? ? -1 : (text_end - 1) + # p @page[@cur_pos .. text_end] + HTMLText.new(@page[@cur_pos .. text_end]) + end + end + + # Get the next token, returns an instance of + # * HTMLText + # * HTMLToken + # * HTMLTag + def getNextToken + token = peekNextToken + if token + # @page = @page[token.raw.length .. -1] + # @page.slice!(0, token.raw.length) + @cur_pos += token.raw.length + end + #p token + #print token.raw + return token + end + + # Get a tag from the specified set of desired tags. + # For example: + # <tt>foo = toke.getTag("h1", "h2", "h3")</tt> + # Will return the next header tag encountered. + def getTag(*sought_tags) + sought_tags.collect! {|elm| elm.downcase} + + while (tag = getNextToken) + if tag.kind_of?(HTMLTag) and + (0 == sought_tags.length or sought_tags.include?(tag.tag_name)) + break + end + end + tag + end + + # Get all the text between the current position and the next tag + # (if specified) or a specific later tag + def getText(until_tag = nil) + if until_tag.nil? + if ?< == @page[@cur_pos] + # Next token is a tag, not text + "" + else + # Next token is text + getNextToken.text + end + else + ret_str = "" + + while (tag = peekNextToken) + if tag.kind_of?(HTMLTag) and tag.tag_name == until_tag + break + end + + if ("" != tag.text) + ret_str << (tag.text + " ") + end + getNextToken + end + + ret_str + end + end + + # Like getText, but squeeze all whitespace, getting rid of + # leading and trailing whitespace, and squeezing multiple + # spaces into a single space. + def getTrimmedText(until_tag = nil) + getText(until_tag).strip.gsub(/\s+/m, " ") + end + +end + +class HTMLTokenizerError < Exception +end + +# The parent class for all three types of HTML tokens +class HTMLToken + attr_accessor :raw + + # Initialize the token based on the raw text + def initialize(text) + @raw = text + end + + # By default, return exactly the string used to create the text + def to_s + raw + end + + # By default tokens have no text representation + def text + "" + end + + def trimmed_text + text.strip.gsub(/\s+/m, " ") + end + + # Compare to another based on the raw source + def ==(other) + raw == other.to_s + end +end + +# Class representing text that isn't inside a tag +class HTMLText < HTMLToken + def text + raw + end +end + +# Class representing an HTML comment +class HTMLComment < HTMLToken + attr_accessor :contents + def initialize(text) + super(text) + temp_arr = text.scan(/^<!--\s*(.*?)\s*-->$/m) + if temp_arr[0].nil? + raise HTMLTokenizerError, "Text passed to HTMLComment.initialize is not a comment" + end + + @contents = temp_arr[0][0] + end +end + +# Class representing an HTML tag +class HTMLTag < HTMLToken + attr_reader :end_tag, :tag_name + def initialize(text) + super(text) + if ?< != text[0] or ?> != text[-1] + raise HTMLTokenizerError, "Text passed to HTMLComment.initialize is not a comment" + end + + @attr_hash = Hash.new + @raw = text + + tag_name = text.scan(/[\w:-]+/)[0] + if tag_name.nil? + raise HTMLTokenizerError, "Error, tag is nil: #{tag_name}" + end + + if ?/ == text[1] + # It's an end tag + @end_tag = true + @tag_name = '/' + tag_name.downcase + else + @end_tag = false + @tag_name = tag_name.downcase + end + + @hashed = false + end + + # Retrieve a hash of all the tag's attributes. + # Lazily done, so that if you don't look at a tag's attributes + # things go quicker + def attr_hash + # Lazy initialize == don't build the hash until it's needed + if !@hashed + if !@end_tag + # Get the attributes + attr_arr = @raw.scan(/<[\w:-]+\s+(.*?)\/?>/m)[0] + if attr_arr.kind_of?(Array) + # Attributes found, parse them + attrs = attr_arr[0] + attr_arr = attrs.scan(/\s*([\w:-]+)(?:\s*=\s*("[^"]*"|'[^']*'|([^"'>][^\s>]*)))?/m) + # clean up the array by: + # * setting all nil elements to true + # * removing enclosing quotes + attr_arr.each { + |item| + val = if item[1].nil? + item[0] + elsif '"'[0] == item[1][0] or '\''[0] == item[1][0] + item[1][1 .. -2] + else + item[1] + end + @attr_hash[item[0].downcase] = val + } + end + end + @hashed = true + end + + #p self + + @attr_hash + end + + # Get the 'alt' text for a tag, if it exists, or an empty string otherwise + def text + if !end_tag + case tag_name + when 'img' + if !attr_hash['alt'].nil? + return attr_hash['alt'] + end + when 'applet' + if !attr_hash['alt'].nil? + return attr_hash['alt'] + end + end + end + return '' + end +end + |