***** infoCopter.com/perl *****

Parse HTML

HTML::LinkExtor


#!/usr/bin/perl -w

#	Usage:	{SCRIPT_NAME} {URL}

use strict;
use LWP::UserAgent;
use HTML::LinkExtor;
use URI::URL;

# Fetch the page named on the command line, extract the URLs of its images
# and hyperlinks, and print both lists as absolute URLs.

# The URL is the only (and mandatory) argument; without it there is nothing
# to fetch, so show the usage line instead of issuing a request for undef.
my $url = $ARGV[0];
unless (defined $url && length $url) {
	print STDERR "Usage: $0 {URL}\n";
	exit(1);
}

my $ua = LWP::UserAgent->new(agent => "my agent V1.00");

my $request  = HTTP::Request->new('GET', $url);
my $response = $ua->request($request);

unless ($response->is_success) {
	print $response->error_as_HTML . "\n";
	exit(1);
}

my $res = $response->content(); # content without HTTP header

# Filled in by callback() while the document is being parsed.
my @imgs  = ();
my @hrefs = ();

# The complete response has already been fetched, so the base URL (which may
# differ from $url, e.g. after a redirect) is known before parsing starts.
# Passing it to the parser makes it hand absolute URLs to the callback, so
# image and link URLs are both expanded the same way.
my $base = $response->base;
my $p    = HTML::LinkExtor->new(\&callback, $base);

$p->parse($res);
$p->eof;    # signal end of document so buffered trailing input is processed

print "Images:\n";
foreach my $img (@imgs) { print "----> $img\n" }

print "\nLinks:\n";
foreach my $href (@hrefs) { print "----> $href\n" }

# Link callback handed to HTML::LinkExtor; called once for every tag that
# carries at least one link attribute.
#
#   $tag  - name of the tag as reported by the parser (compared against
#           'img' and 'a' here)
#   %attr - the tag's link attributes as (attribute name => URL) pairs
#
# Appends image sources to the file-level @imgs and anchor targets to the
# file-level @hrefs.  Only the 'src' / 'href' attribute is recorded: an <img>
# can carry further link attributes (such as 'usemap' or 'lowsrc') that are
# not the image itself and must not show up in the image list.
sub callback {
    my ($tag, %attr) = @_;

    if ($tag eq 'img') {
        push @imgs, $attr{src} if defined $attr{src};
    }
    elsif ($tag eq 'a') {
        push @hrefs, $attr{href} if defined $attr{href};
    }

    return;
}

© reto :)