***** infoCopter.com/perl *****
Parse HTML
HTML::LinkExtor
#!/usr/bin/perl -w
# Usage: {SCRIPT_NAME} {URL}
use strict;
use LWP::UserAgent;
use HTML::LinkExtor;
use URI::URL;
# Fetch the page named on the command line, collect its image and anchor
# links with HTML::LinkExtor, and print both lists.

# Refuse to run without a URL instead of issuing a request for undef.
my $url = $ARGV[0];
unless (defined $url && length $url) {
    print STDERR "Usage: $0 URL\n";
    exit(1);
}

my $ua = LWP::UserAgent->new(agent => "my agent V1.00");
my $request = HTTP::Request->new('GET', $url);
my $response = $ua->request($request);
unless ($response->is_success) {
    print $response->error_as_HTML . "\n";
    exit(1);
}
my $res = $response->content(); # content without HTTP header

# Filled in by callback() while the document is parsed.
my @imgs = ();
my @hrefs = ();

# No base URL is handed to the parser, so the callback receives the URLs
# exactly as they are written in the document.
my $p = HTML::LinkExtor->new(\&callback);
$p->parse($res);
$p->eof;    # signal end of document so any buffered markup is processed

# Expand all image URLs to absolute ones, relative to the base reported by
# the response (which may differ from the URL that was requested).
my $base = $response->base;
@imgs = map { url($_, $base)->abs } @imgs;

print "Images:\n";
foreach my $img (@imgs) { print "----> $img\n" }

# Links are printed as found in the document (they are not made absolute).
print "\nLinks:\n";
foreach my $href (@hrefs) { print "----> $href\n" }
# Link callback for HTML::LinkExtor; called once per element that carries
# link attributes.
#   $tag  - element name (compared here against 'img' and 'a')
#   %attr - the element's link attributes as name => URL pairs
# Appends image URLs to the file-level @imgs and anchor targets to @hrefs.
# Other tags are ignored. Returns nothing.
sub callback {
    my ($tag, %attr) = @_;

    if ($tag eq 'img') {
        # Take only the attributes that name an image file. <img> may also
        # report usemap/longdesc as links, and those are not images; a
        # fixed attribute order also keeps the output deterministic.
        push @imgs, grep { defined } @attr{qw(src lowsrc)};
    }
    elsif ($tag eq 'a') {
        push @hrefs, $attr{href} if defined $attr{href};
    }

    return;
}
|