package IMDB; # Usage: # # use IMDB; # # my $imdb = IMDB->new( 'debug' => 1 ); # # %res = $imdb->query('title' => 'star trek'); # print "--> $_ = $res{$_}\n" foreach sort keys %res; # # %res = $imdb->query('title' => 'pulp fiction'); # print "--> $_ = $res{$_}\n" foreach sort keys %res; # ############################################################ # Lates release & documentation available on # # http://www.infocopter.com/perl_corner/retos-imdb.htm ############################################################ use strict; my $VERSION = '0.81.02'; use LWP::UserAgent; my $package = __PACKAGE__; ##### GLOBAL VARIABLES my $IMDB_Server = "us.imdb.com"; my $Genre_Err = '* Genre not available *'; my %Hash = (); my $contentType = ""; my $debug = 0; $| = 1; #----- FORWARD DECLARATIONS & PROTOTYPING sub Error($); sub Debug($); sub int_req (%); sub http_req (%); sub new { my $type = shift; my %params = @_; my $self = {}; $self->{'debug'} = $debug = $params{'debug'}; Debug "$package V$VERSION" if $self->{'debug'}; bless $self, $type; } sub query { my $self = shift; my %args = @_; Debug "$_ = $args{$_}" foreach keys %args; Debug "N1002: Expanding information for title '$args{'title'}'"; %Hash = (); # clean-up global hash my $title_encoded = $args{'title'}; $title_encoded =~ tr/ /+/; ##### check for title matches # my $pagedata = http_req url => "http://$IMDB_Server/Find?select=all&for=$title_encoded"; my @res_lines = split /\n/, $$pagedata; my $check = (grep { $_ =~ "'title' matches" } @res_lines)[0]; $Hash{'count'} = $check || 0; $Hash{'count'} =~ s/.*(\d+)?<\/B> 'title' matches.+/$1/; $Hash{'imdb_ref'} = $check || ""; $Hash{'imdb_ref'} =~ s/.*
  1. .+/$1/; my $first_page_title = $$pagedata || ""; $first_page_title =~ s/.*\(.+)?\<\/TITLE>.*/$1/si; $Hash{'genre'} = ""; ##### Transform common numbers to words to double the change my $title_num2word = "---"; if ($args{'title'} =~ /\d+/) { $title_num2word = $args{'title'}; $title_num2word =~ s/^7 /Seven /; $title_num2word =~ s/^12 /Twelve /; } $first_page_title =~ s/There's/There is/i; Debug "N1003: Comparing '$first_page_title' with '$args{'title'}' and '$title_num2word'..."; if ($first_page_title =~ /$args{'title'}/i or $first_page_title =~ /$title_num2word/i) { ##### title matched at first try!! $Hash{'imdb_ref'} = (grep { $_ =~ /^cache=const=/ } @res_lines)[0]; $Hash{'imdb_ref'} =~ s/^cache=const=(\d+)?/$1/; # leading zeros are necessary $Hash{'imdb_ref'} = substr("0000000", 0, 7 - length($Hash{'imdb_ref'}) ) . $Hash{'imdb_ref'}; Debug "N1009: Matched at first try: " . $Hash{'imdb_ref'}; ##### genre $Hash{'genre'} = (grep { $_ =~ /Genre/i } @res_lines)[0]; $Hash{'genre'} =~ s/<[^>]+>//g; # remove remaining HTML tags $Hash{'genre'} =~ s/\(more\)//; $Hash{'genre'} =~ s/Genre://; $Hash{'genre'} =~ s/ *(.+).*/$1/; # remove leading blanks ##### directed by my $i = 0; for (0 .. $#res_lines) { last if $res_lines[$i++] =~ /directed by/i; } $Hash{'directed_by'} = $res_lines[$i]; $Hash{'directed_by'} =~ s/<[^>]+>//g; # remove remaining HTML tags $Hash{'directed_by'} =~ s/\ //; $Hash{'directed_by'} =~ s/ *(.+).*/$1/; # remove leading blanks # optimize result $Hash{'imdb_title'} = $first_page_title; $Hash{'count' } = 1; # result is unique! } else { Debug "N1008: Not matched."; if ($first_page_title =~ /title search/) {; Debug "N1005: Received a list from IMDB..."; my $i = 0; for ($i = 0; $i < $#res_lines; $i++) { last if $res_lines[$i] =~ /found the following results/; } my $index = 0; for ($i = $i; $i < $#res_lines; $i++) { last if $res_lines[$i] =~ /\<\/OL>/i; my $line = $res_lines[$i]; $line =~ s/.*<LI><A HREF="\/Title\?(\d+)">(.+)?<\/A>.*/$1\t$2/; $Hash{"best_ref_" . ++$index} = $1; $Hash{"best_title_" . $index} = $2; Debug "N1011: $line"; } $Hash{'imdb_ref' } = $Hash{'best_ref_1' }; # assign best match $Hash{'imdb_title'} = $Hash{'best_title_1'}; # assign best match } } unless ($Hash{'genre'}) { ##### check for genre on the follow-up page # Debug "N1004: Check for genre on follow-up page..."; $Hash{'imdb_ref'} ||= ""; # prevent warnings $pagedata = http_req url => "http://$IMDB_Server/Title?$Hash{'imdb_ref'}"; @res_lines = split /\n/, $$pagedata; ##### genre $check = (grep { $_ =~ "Genre" } @res_lines)[0]; $Hash{'genre'} = $check || $Genre_Err; $Hash{'genre'} =~ s/<[^>]+>//g; # remove remaining HTML tags $Hash{'genre'} =~ s/\(more\)//; $Hash{'genre'} =~ s/Genre://; $Hash{'genre'} =~ s/ *(.+).*/$1/; # remove leading blanks $Hash{'genre'} = "<i>$Hash{'genre'}</i>"; Debug "N1010: $Hash{'genre'}"; ##### directed by my $i = 0; for (0 .. $#res_lines) { last if $res_lines[$i++] =~ /directed by/i; } $Hash{'directed_by'} = $res_lines[$i] || ""; $Hash{'directed_by'} =~ s/<[^>]+>//g; # remove remaining HTML tags $Hash{'directed_by'} =~ s/\ //; $Hash{'directed_by'} =~ s/ *(.+).*/$1/; # remove leading blanks } unless ($Hash{'imdb_title'}) { $Hash{'page_title_2'} = $$pagedata || ""; $Hash{'page_title_2'} =~ s/.*\<TITLE>(.+)?\<\/TITLE>.*/$1/si; } $Hash{'rc'} = $Hash{'genre'} eq $Genre_Err ? -1 : 0; ##### trim values $Hash{'genre' } = join(' ', grep({$_;} split(/ /, $Hash{'genre' }, 0))); $Hash{'directed_by'} = join(' ', grep({$_;} split(/ /, $Hash{'directed_by'}, 0))); %Hash; } sub http_req (%) { my %args = @_; my $r = HTTP::Request->new('GET', $args{'url'}); my $ua = LWP::UserAgent->new(agent => "Mozilla/4.04 [en] (Win2K; I ;Nav)"); # ;-) my $resp = $ua->request($r); Debug "N1000: http code = " . $resp->code() . " -> $args{'url'}"; Error $resp->error_as_HTML unless $resp->is_success; my $res = $resp->content(); # content without HTTP header return \$res; } sub Error ($) { print "Content-type: text/html\n\n" unless $contentType; $contentType = 1; print STDERR "*** ERROR at $package: $_[0]\n"; # do not exit, it's a Module! } sub Debug ($) { print "[ $package ] $_[0]\n" if $debug; } #### Used Warning / Error Codes ########################## # Next free W Code: 1000 # Next free E Code: 1000 # Next free N Code: 1012 1; <meta name="title2" content="IMDB.pm - Access to Media information"> <meta name="keywords" content="IMDB,DVD,Media"> # {%WRAPPER _|EQ|_ "_c_doc__w_plain.txt"%} # {%PREVENT_LAYOUT_PROCESS _|EQ|_ "0"%}