Perl Basics
Introduction to Perl
FAQ's
CGI
Regular Expressions

PC Overview
Cool Stuff
My Modules
Success Stories
Links
Perl in the News
Logos

PC Internals
About
Contact
Handy Logos
What's new

Parse HTML

HTML::LinkExtor
P-friendly

[ home ] - [ search ] - [ sitemap ]



#!/usr/bin/perl -w

#	Usage:	{SCRIPT_NAME} {URL}

use strict;
use LWP::UserAgent;
use HTML::LinkExtor;
use URI::URL;

# Fetch the page named on the command line and list the image and link
# URLs it contains, each resolved to an absolute URL.

# Without a URL there is nothing to fetch; say so rather than building a
# request from an undefined value.
unless (@ARGV && defined $ARGV[0] && length $ARGV[0]) {
    print STDERR "Usage: $0 {URL}\n";
    exit(2);
}

my $ua = LWP::UserAgent->new(agent => "my agent V1.00");

my $request  = HTTP::Request->new('GET', $ARGV[0]);
my $response = $ua->request($request);

unless ($response->is_success) {
    # Diagnostics go to STDERR so they never mix with the URL listing.
    print STDERR $response->error_as_HTML . "\n";
    exit(1);
}

my $res = $response->content();    # body only, without the HTTP headers

# Filled in by callback() while the parser runs.
my @imgs  = ();
my @hrefs = ();

# The complete response has already been received, so its base URL is
# known before parsing starts.  Handing it to the parser makes it resolve
# every extracted URL (images and links alike) to an absolute one before
# callback() sees it.
my $base = $response->base;
my $p    = HTML::LinkExtor->new(\&callback, $base);

$p->parse($res);
$p->eof;    # tell the parser the document is complete

print "Images:\n";
print "----> $_\n" for @imgs;

print "\nLinks:\n";
print "----> $_\n" for @hrefs;

# Link-extractor callback, registered with HTML::LinkExtor->new.
#   $tag  - name of the tag being reported (matched against 'img' and 'a')
#   %attr - that tag's link attributes as name => URL pairs
# Appends image sources to @imgs and anchor targets to @hrefs; every other
# tag is ignored.  Returns nothing.
sub callback {
    my ($tag, %attr) = @_;

    if ($tag eq 'img') {
        # Only src/lowsrc name image files; an <img> may also report
        # usemap and longdesc, which are not images.  Reading the
        # attributes in a fixed order also keeps the output order stable,
        # which "values %attr" does not guarantee.
        push(@imgs, grep { defined } @attr{qw(src lowsrc)});
    }
    elsif ($tag eq 'a') {
        push(@hrefs, $attr{href}) if defined $attr{href};
    }

    return;
}

home - feedback - search

$Id: parsehtml.htm,v 1.6 2004/03/23 01:13:00 reto Exp $
© 1998-2004 reto :)