Class | String |
In: |
lib/feedparser/textconverters.rb
lib/feedparser/text-output.rb |
Parent: | Object |
This class provides various text converters.
MY_ENTITIES | = | {} |
# File lib/feedparser/textconverters.rb, line 17
# Escape the HTML special characters of the text.
#
# Replaces '&' with '&amp;', '<' with '&lt;' and '>' with '&gt;' in one pass,
# so the ampersands introduced by the replacement are never escaped twice.
# Quotes are left untouched.
#
# Returns a new String; the receiver is not modified.
def escape_html
  gsub(/[&<>]/, { '&' => '&amp;', '<' => '&lt;', '>' => '&gt;' })
end
Returns true if the text contains escaped HTML (with HTML entities). Used by String#text2html.
# File lib/feedparser/textconverters.rb, line 13
# Does the text contain escaped HTML (tags written with HTML entities)?
#
# Looks for an entity-escaped <img src=, <a href=, <br>, <br/>, <br /> or <p>,
# case-insensitively.
#
# Returns the offset of the earliest such occurrence (truthy), or nil when
# there is none. A single alternation is used so that the offset is the
# position of the first escaped tag in the text, whichever tag it is.
def escaped_html?
  self =~ /&lt;(?:img src=|a href=|br(?:\/| \/|)&gt;|p&gt;)/i
end
Convert an HTML text to plain text
# File lib/feedparser/text-output.rb, line 7
# Convert an HTML text to plain text.
#
# A copy of the receiver is fed to a FeedParser::HTML2TextParser; the text the
# parser saved is then tidied up in place and returned.
def html2text
  parser = FeedParser::HTML2TextParser.new(true)
  parser.feed(self.clone)
  parser.close
  plain = parser.savedata
  # Whitespace clean-up rules, applied in this order.
  cleanup = [
    [/\A\s*/m, ''],       # leading whitespace
    [/\s*\Z/m, ''],       # trailing whitespace
    [/ *\n/m, "\n"],      # spaces before a newline
    [/\n */m, "\n"],      # spaces after a newline
    [/\n\n+/m, "\n\n"],   # runs of empty lines
    [/[ \t]+/, ' ']       # runs of spaces and tabs
  ]
  cleanup.each { |pattern, replacement| plain.gsub!(pattern, replacement) }
  plain
end
Is this text HTML? Searches for tags. Used by String#text2html.
# File lib/feedparser/textconverters.rb, line 8
# Is this text HTML? Searches for a few common tags.
#
# Recognised, case-insensitively: <p>, </p>, <br> (with optional whitespace
# and slash), </a> and <img ...>.
#
# Returns the offset of the earliest recognised tag (truthy), or nil when
# none is found. A single alternation is used so that the offset is the
# position of the first tag in the text, whichever tag it is.
def html?
  self =~ /<\/?p>|<br\s*\/?\s*>|<\/a>|<img.*>/i
end
Remove white space around the text
# File lib/feedparser/textconverters.rb, line 95
# Remove white space around the text, modifying the receiver in place.
#
# Returns the receiver itself, whether or not anything was removed.
def rmWhiteSpace!
  # sub! returns nil when nothing matched, so the two calls are kept separate
  # instead of being chained, and self is returned explicitly.
  sub!(/\A\s+/, '')
  sub!(/\s+\z/, '')
  self
end
Convert text to HTML.
# File lib/feedparser/textconverters.rb, line 40
# Convert the text to HTML.
#
# * Text that html? recognises as HTML is kept as is.
# * Text that escaped_html? recognises as escaped HTML is passed through
#   unescape_html.
# * Anything else is treated as plain text: it is wrapped in <p>...</p>,
#   blank lines become paragraph breaks and http/ftp/https URIs become links.
#
# Finally, when feed and feed.link are set, relative src= and href=
# attribute values are made absolute using feed.link as the base.
#
# feed:: object responding to #link (the feed URL); may be nil.
#
# Returns the converted String. The receiver is not modified: the work is
# done on an unfrozen copy, so frozen receivers are accepted too.
def text2html(feed)
  text = dup
  realhtml = text.html?
  eschtml = text.escaped_html?
  # Fix for RSS feeds with both real and escaped HTML (crazy!): the two
  # values are compared and the kind of markup with the smaller one wins.
  if realhtml && eschtml
    if realhtml < eschtml
      eschtml = nil
    else
      realhtml = nil
    end
  end
  if realhtml
    # already HTML: nothing to convert
  elsif eschtml
    text = text.unescape_html
  else
    # paragraphs
    text.gsub!(/\A\s*(.*)\Z/m, '<p>\1</p>')
    text.gsub!(/\s*\n(\s*\n)+\s*/, "</p>\n<p>")
    # uris
    text.gsub!(/(#{URI::regexp(['http','ftp','https'])})/, '<a href="\1">\1</a>')
  end
  # Handle broken (relative) hrefs in <a> and <img>
  if feed && feed.link
    text.gsub!(/(\s(src|href)=['"])([^'"]*)(['"])/) do |m|
      begin
        # Save the captures first: the =~ tests below overwrite $1..$4.
        first, url, last = $1, $3, $4
        base = feed.link
        parts = base.split(/\//)
        if url =~ /^\s*\w+:\/\// || url =~ /^\s*\w+:\w/
          # already absolute (scheme://...) or scheme:something (e.g. mailto:)
          m
        elsif url =~ /^\/\//
          # scheme-relative URL (//host/path): only the scheme is missing,
          # so borrow it from the feed link rather than prepending its host
          if parts[0] =~ /\A\w+:\z/
            first + parts[0] + url + last
          else
            m
          end
        elsif url =~ /^\//
          # host-absolute path: prepend scheme://host of the feed link
          first + parts[0..2].join('/') + url + last
        elsif parts.length == 3
          # feed link like http://toto with no trailing /
          first + base + '/' + url + last
        elsif base =~ /\/$/
          # feed link is a directory
          first + base + url + last
        else
          # feed link is a file: resolve against its directory
          first + parts[0...-1].join('/') + '/' + url + last
        end
      rescue
        # best effort: leave the attribute untouched on any error
        m
      end
    end
  end
  text
end
Convert a text in inputenc to a text in UTF-8. Must take care of wrong input locales.
# File lib/feedparser/textconverters.rb, line 101
# Convert a text declared to be in inputenc to a text in UTF-8, taking care
# of wrongly declared input encodings.
#
# inputenc:: name of the declared encoding (e.g. 'iso-8859-1'); compared
#            case-insensitively, nil is treated as "not UTF-8".
#
# Returns
# * the receiver itself when inputenc is 'utf-8';
# * otherwise, when the bytes are in fact valid UTF-8, the receiver itself
#   if it compares equal to its UTF-8 reading, else a UTF-8-tagged copy;
# * otherwise a new UTF-8 String in which every input byte is mapped to
#   the code point of the same value (i.e. the bytes are read as ISO-8859-1).
def toUTF8(inputenc)
  return self if inputenc.to_s.downcase == 'utf-8'

  # It is said not to be UTF-8: ensure it is REALLY not UTF-8. The check is
  # done on the bytes, not on the encoding tag, because String#== is false
  # for non-ASCII strings whose encoding tags differ.
  candidate = dup.force_encoding(Encoding::UTF_8)
  if candidate.valid_encoding?
    return candidate == self ? self : candidate
  end

  unpack('C*').pack('U*')
end
Un-escape HTML in the text. Used by String#text2html.
# File lib/feedparser/textconverters.rb, line 31
# Un-escape HTML in the text: every key of MY_ENTITIES found in the text is
# replaced with the corresponding value.
#
# All entities are replaced in a single pass, so the output of one
# replacement is never un-escaped again ("&amp;lt;" gives "&lt;", not "<")
# and the result does not depend on the order of MY_ENTITIES.
#
# NOTE(review): assumes the keys of MY_ENTITIES are literal Strings such as
# "&lt;" — confirm against the code that fills the table.
#
# Returns the receiver itself when MY_ENTITIES is empty, a new String
# otherwise.
def unescape_html
  return self if MY_ENTITIES.empty?

  # Longest keys first, so that a key that is a prefix of another one cannot
  # shadow it in the alternation.
  keys = MY_ENTITIES.keys.sort_by { |key| -key.length }
  gsub(Regexp.union(keys)) { |entity| MY_ENTITIES.fetch(entity, entity) }
end