#!/usr/bin/perl
###########################################
# googledrill - Explore and follow Google
#               results
# Mike Schilli, 2002 (m@perlmeister.com)
###########################################
# Pages through Google results for $GOOGLE_SEARCH, fetches every hit
# (except those matching $HIT_EXCL_PATTERN), extracts the links on each
# hit page, and tallies which pages link to URLs matching $LINK_PATTERN.
# Finally prints the linked-to URLs, most-referenced first, with the
# referrer pages indented below each one.
use warnings;
use strict;

use Net::Google;
use HTML::TreeBuilder;
use LWP::Simple;
use URI::URL;

# --- Configuration ------------------------------------------------------
my $GOOGLE_SEARCH    = 'Schilli';            # query string sent to Google
my $HIT_EXCL_PATTERN = qr(perlmeister\.com); # skip hits on these hosts
my $LINK_PATTERN     = qr(perlmeister\.com); # tally only links matching this
my $RESULTS_PER_PAGE = 100;                  # hits fetched per API call
my $RESULTS_TOTAL    = 500;                  # stop after this many hits

use constant LOCAL_GOOGLE_KEY =>
    "XXX_INSERT_YOUR_OWN_GOOGLE_KEY_HERE_XXX";

my $service = Net::Google->new(
    key => LOCAL_GOOGLE_KEY,
);

# Maps each interesting link URL to the list of pages that referenced it.
my %links_seen      = ();
my $hits_seen_total = 0;

while ($hits_seen_total < $RESULTS_TOTAL) {

    # Init search
    my $session = $service->search(
        max_results => $RESULTS_PER_PAGE,
        starts_at   => $hits_seen_total,
    );
    $session->query($GOOGLE_SEARCH);

    # Contact Google for results
    my @hits = @{ ($session->results())[0] };

    # Iterate over results
    for my $hit (@hits) {
        my $url = norm_url($hit->URL());

        # Eliminate unwanted sites
        next if $url =~ $HIT_EXCL_PATTERN;

        # Follow hit, retrieve site
        print "Getting $url\n";

        for my $link (get_links($url)) {
            # Ignore self-links and links to unrelated sites
            next if $link !~ $LINK_PATTERN;

            # Count link and remember referrer
            push @{ $links_seen{$link} }, $url;
        }
    }

    # Not enough results to continue?
    last if @hits < $RESULTS_PER_PAGE;

    $hits_seen_total += $RESULTS_PER_PAGE;
}

# Print results, highest counts first
for my $link (sort { @{ $links_seen{$b} } <=>
                     @{ $links_seen{$a} } } keys %links_seen) {

    print "$link (" . scalar @{ $links_seen{$link} }, ")\n";

    for my $site (@{ $links_seen{$link} }) {
        print "    $site\n";
    }
}

###########################################
sub get_links {
###########################################
    # Fetch the document at $url and return a de-duplicated list of all
    # <a href=...> targets, normalized to absolute canonical URLs.
    # Returns an empty list (with a warning) if the fetch fails.
    my ($url) = @_;

    my @links = ();

    # Retrieve remote document
    my $data = get($url);

    if (!defined $data) {
        warn "Cannot retrieve $url\n";
        return @links;
    }

    # Extract links
    my $tree = HTML::TreeBuilder->new();
    $tree->parse($data);

    my $ref = $tree->extract_links(qw/a/);

    if ($ref) {
        # Resolve each href against the page it came from, so that
        # relative links ("/foo.html") become absolute and can be
        # matched against $LINK_PATTERN by the caller.
        @links = map { norm_url($_->[0], $url) } @$ref;
    }

    $tree->delete();    # free the parse tree's circular structures

    # Kick out dupes and return the list
    my %dupes;
    return grep { !$dupes{$_}++ } @links;
}

###########################################
sub norm_url {
###########################################
    # Return the canonical string form of $url_string. If a $base URL is
    # given, relative URLs are first resolved against it; without a base
    # the string is canonicalized as-is (previous behavior, so existing
    # callers are unaffected).
    my ($url_string, $base) = @_;

    my $url = defined $base
        ? URI::URL->new($url_string, $base)->abs()
        : URI::URL->new($url_string);

    return $url->canonical()->as_string();
}