########################################### package RssMaker; ########################################### # Mike Schilli, 2004 (m@perlmeister.com) ########################################### use warnings; use strict; use LWP::UserAgent; use HTTP::Request::Common; use XML::RSS; use HTML::Entities qw(decode_entities); use URI::URL; use HTTP::Date; use DateTime; use HTML::TreeBuilder; use Log::Log4perl qw(:easy); ########################################### sub make { ########################################### my(%o) = @_; $o{url} || LOGDIE "url missing"; $o{title} || LOGDIE "title missing"; $o{output} ||= "out.rdf"; $o{filter} ||= sub { 1 }; $o{encoding} ||= 'utf-8'; my $ua = LWP::UserAgent->new(); INFO "Fetching $o{url}"; my $resp = $ua->request(GET $o{url}); LOGDIE "Fetching $o{url} failed" if $resp->is_error(); my $http_time = $resp->header('last-modified'); INFO "Last modified: $http_time"; my $mtime = str2time($http_time); my $isotime = DateTime->from_epoch( epoch => $mtime); DEBUG "Last modified: $isotime"; my $rss = XML::RSS->new( encoding => $o{encoding}); $rss->channel( title => $o{title}, link => $o{url}, dc => { date => $isotime . "Z"}, ); foreach(exlinks($resp->content(), $o{url})) { my($lurl, $text) = @$_; $text = decode_entities($text); if($o{filter}->($lurl, $text)) { INFO "Adding rss entry: $text $lurl"; $rss->add_item( title => $text, link => $lurl, ); } } INFO "Saving output in $o{output}"; $rss->save($o{output}) or die "Cannot write to $o{output}"; } ########################################### sub exlinks { ########################################### my($html, $base_url) = @_; my @links = (); my $tree = HTML::TreeBuilder->new(); $tree->parse($html) or return (); for(@{$tree->extract_links('a')}) { my($link, $element, $attr, $tag) = @$_; next unless $attr eq "href"; my $uri = URI->new_abs($link, $base_url); next unless length $element->as_trimmed_text(); push @links, [URI->new_abs($link, $base_url), $element->as_trimmed_text()]; } return @links; } 1;