# frozen_string_literal: true

require 'net/http'

# Downloads the print view of one issue's article index from the configured
# host and turns it into a list of [title, link] pairs.
class Article
  def initialize
    @host = 'www.linux-magazin.de'
    @baseurl = '/Artikel/ausgabe/'
  end

  # Cuts the article entries out of the page markup.
  #
  # NOTE(review): the split patterns below look as if their HTML tag names
  # were lost at some point (the middle one is an empty pattern, which splits
  # the string into single characters). They are kept exactly as found;
  # confirm them against the markup this parser is meant to handle.
  #
  # @param text [String] raw page content
  # @return [Array<String>, nil] the entry fragments, or nil when +text+ is
  #   not a String or no entries can be isolated
  def parse_articles(text)
    return nil unless text.is_a?(String)

    section = text.split(%r{]*>}i)[1]
    return nil unless section

    # An empty section yields no pieces at all; bail out instead of calling
    # split on nil.
    head = section.split(%r{}i)[0]
    return nil unless head

    entries = head.split(%r{]*>}i)
    return nil unless entries.length > 1

    # The piece before the first separator is not an entry.
    entries[1..-1]
  end

  # Fetches the index page for the given issue.
  #
  # @param year [Integer]
  # @param month [Integer]
  # @return [String, nil] the response body
  def load_article_page(year, month)
    address = get_address(year, month, '/index_html?print=y')
    # Very old Net::HTTP versions returned [response, body]; current ones
    # return only the response, which leaves +data+ nil after destructuring.
    resp, data = createConnection.get(address)
    data || resp.body
  end

  # Loads and parses the article list of one issue.
  #
  # @param year [Integer]
  # @param month [Integer]
  # @return [Array<Array>, String] [text, link] pairs (link may be nil), or
  #   a message string ('Netzwerkfehler' / 'keine Artikel gefunden') when the
  #   page could not be fetched or contained no articles
  def read_articles(year, month)
    begin
      html = load_article_page(year, month)
    rescue SocketError, SystemCallError, Timeout::Error
      # Name resolution, connection and timeout failures are all reported
      # the same way to the caller.
      return 'Netzwerkfehler'
    end

    articles = parse_articles(html)
    return 'keine Artikel gefunden' unless articles

    base = get_address(year, month)
    articles.map do |fragment|
      [extract_article_text(fragment), extract_article_link(fragment, base)]
    end
  end

  private

  # Builds the HTTP client; kept as a separate method so it can be replaced.
  def createConnection
    Net::HTTP.new(@host)
  end

  # Builds the URL of an issue, optionally followed by +suffix+.
  def get_address(year, month, suffix = nil)
    sprintf("http://%s%s%s/%02d%s", @host, @baseurl, year, month, suffix)
  end

  # Removes all tags from +text+ and trims surrounding whitespace.
  def extract_article_text(text)
    text.gsub(/<[^>]+>/, '').strip
  end

  # Returns +base+ joined with the first href value found in +text+, or nil
  # when there is none.
  def extract_article_link(text, base)
    # Stop at the closing quote so later attributes are not swallowed.
    url = %r{href="([^"]*)"}i.match(text)
    return base + '/' + url[1] if url.is_a?(MatchData)
  end
end

if __FILE__ == $0
  article = Article.new
  results = article.read_articles(2004, 5)
  if results.is_a?(String)
    # read_articles reports failures as a plain message string.
    puts results
  else
    results.each do |result|
      puts "---\n" + result[0]
      puts result[1] + "\n" if result[1]
    end
  end
end