#!/usr/bin/perl -w use strict; $| = 1; ################################ # rss_2_0.pl # # URL: http://www.infocopter.com/perl/rss.html # Location: quartus # # Usage: # rss_2_0.pl [Options] rssinput.xml "3" >output_top3.html # my $VERSION = '0.05.02'; ################################ use XML::GDOME; use Unicode::String qw(latin1 utf8); # For Umlaute problems -> web_enc / toISO use Getopt::Long; # -- GLOBAL my $DEBUG = 0; my $do = my $i = 0; my %META = (); # -- Don't make a target _blank to this domains: my @HOME_DOMAINS = qw(infocopter.com pgate.net); my @getopt_args = ( 'd', # debug 'debug' , # debug mode for development support 'omit_tags=s' , # e.g. img,a,h1 'h', # help 'proxy=s', # proxy host 'v', # Verbose mode ); my %Options; Getopt::Long::config("noignorecase", "bundling"); print "\n"; &Usage() unless GetOptions(\%Options, @getopt_args); if ($Options{'d'} or $Options{'debug'}) { foreach (keys %Options) { print "- $_ = \"$Options{$_}\"\n"; } } $Options{'omit_tags'} ||= ''; my $doc = XML::GDOME->createDocFromURI($ARGV[0] || '/var/www/html/rss/xml/linuxjournal.xml'); my @nodes = $doc->findnodes("//*"); &main(); ############################################### sub main() { ############################################### my @row_color = (); $row_color[0] = 'white'; $row_color[1] = '#e9e9e9'; #print ''; print ''; my $i = 0; my $max = $ARGV[1] || 0; foreach (&parseRSS()) { my $hash_ref = $_; next if $hash_ref->{'title'} =~ /^ADV:/; # Skip advertisting last if (++$i > $max) && $max; (my $description = $hash_ref->{'description'}) =~ s/^

//gi; $description =~ s/<\/p>$//gi; $description =~ s/<\/p>/

/gi; $description =~ s/

//gi; $description =~ s/’/'/g; $description =~ s/é/\é/g; $description =~ s/è/\è/g; if (&isISO($description)) { $description = &web_enc($description); } else { $description = &web_enc(&toISO($description)); } my @omit_tag_arr = split /,/, $Options{'omit_tags'}; foreach (@omit_tag_arr) { $description =~ s/\<$_/\<$_/g; } if ($hash_ref->{'__nodeName'} !~ /description/) { # Something else, probably the disclaimer section but to be generic we catch all ;-) print '

', ''; $i--; next; } else { print '', "\n\n"; } my $meta_row = ''; if (defined $META{'copyright'}) { $meta_row = $META{'copyright'}; } if (defined $META{'managingEditor'}) { $meta_row .= "
Managing Editor"; } if ($meta_row) { print ''; } #print '
', scalar localtime, '
'; if (defined $hash_ref->{'url'}) { print '', '',
					$hash_ref->{'title'}, ''; } else { foreach (keys %{$hash_ref}) { print "$_ = ", $hash_ref->{$_}, '
'; } } print '
'; } my $title = $hash_ref->{'title'}; $title =~ s/’/'/g; $title =~ s/é/\é/g; $title =~ s/è/\è/g; if (&isISO($title)) { $title = &web_enc($title); } else { $title = &web_enc(&toISO($title)); } print '', $title, '
', '', $hash_ref->{'pubDate'}, '
', $description, '

', "\n"; if ($hash_ref->{'category'}) { print '› ', $hash_ref->{'category'}, ' '; } (my $link_disp = $hash_ref->{'link'}) =~ s/^https?:\/\///i; $link_disp =~ s/^www\.//i; if (length($link_disp) > 60) { $link_disp = substr($link_disp, 0, 60) . '...'; } my $target = ' target="_blank"'; # default linking my $uarr = ' ↑'; foreach (@HOME_DOMAINS) { if ($hash_ref->{'link'} =~ /$_/) { $target = $uarr = ''; last; } } print "› {'link'}, '">', $link_disp, "$uarr
"; print '
'; # foreach (keys %META) { print "$_ = $META{$_}
"; } print '
', $meta_row, ''; print '
'; } ############################################### sub parseRSS() { ############################################### my @results = (); foreach my $node (@nodes) { my @childs = $node->childNodes; my %hash = (); foreach my $child (@childs) { next unless $child->nodeType == ELEMENT_NODE; my $data = defined $child->firstChild() ? $child->firstChild()->data : 'NULL'; $META{$child->nodeName} = $data; if ($child->nodeName eq 'title') { $i++; $do = 1 if $i > 1; } next unless $do; chomp $data; print "[DEBUG] node = '", $child->nodeName, "'\n" if $DEBUG; $hash{$child->nodeName} = $data; $hash{'__nodeName'} .= $child->nodeName . '
'; if (defined $child->getAttributeNode("domain")) { # -- nodeName . "\tdomain"} = $href->getValue(); } } # -- Put an atomic RSS entry to the array and empty the hash my %foohash = %hash; $foohash{'pubDate'} ||= ''; # * No date * $foohash{'title' } ||= '* No title *'; $foohash{'description'} ||= '* No description *'; $foohash{'category'} ||= ''; # empty! my $categ_url = $foohash{'category'} ? 'http://groups.google.ch/groups?hl=de&q=' . $foohash{'category'} : ''; $foohash{"category\tdomain"} ||= $categ_url; push(@results, \%foohash) if %hash; %hash = (); } @results; } sub web_enc ($) { my $enc = ''; for (my $i = 0; $i < length($_[0]); $i++) { my $ordno = ord substr($_[0], $i, 1); $enc .= $ordno > 127 ? sprintf("&#%d;", $ordno) : substr($_[0], $i, 1); } $enc =~ s/ $//;; $enc; } sub toISO($) { my $text = $_[0]; # if this host was UTF-8 encoded: my $text_iso = (utf8($text))->latin1; my $text_utf8 = (latin1($text_iso))->utf8; # reverse check if ($text ne $text_utf8) { #print STDERR "Unequal reverse check! It seems your input data \"$text\" is ", # "ISO encoded already, so you don't need the latin1 encoding stuff here!\n"; return $text; } $text_iso; } sub isISO ($) { my $text = $_[0]; local $SIG{'__WARN__'} = \&alarm_handler; # install signal handler my $text_iso = (utf8($text))->latin1; my $text_utf8 = (latin1($text_iso))->utf8; # reverse check return $text eq $text_utf8 ? 0 : 1; } sub alarm_handler () { #print STDERR "alarm catched!\n"; return; } sub Usage() { print "Wrong usage.\n"; exit(0); } __END__
#
# ----------------------------------------------------------------------
# Maintainer notes for rss_2_0.pl
# (placed after __END__, so perl never compiles this part)
# ----------------------------------------------------------------------
#
# The notes describe the statements as written above, read as the script
# they are evidently meant to form.
#
# Purpose
#   Loads an RSS document with XML::GDOME->createDocFromURI and prints the
#   entries found in it to STDOUT.
#     $ARGV[0]  document to load; when empty,
#               '/var/www/html/rss/xml/linuxjournal.xml' is used.
#     $ARGV[1]  maximum number of entries; 0 or missing means no limit.
#
# Options (Getopt::Long with "noignorecase" and "bundling")
#   d, debug      print every parsed option as: - name = "value"
#   omit_tags=s   comma-separated tag names; main() runs one substitution
#                 on each description per name.
#   h, proxy=s, v are accepted by GetOptions but never read in this file.
#   A failed GetOptions call goes to Usage().
#
# File-level state
#   $DEBUG        always 0 here; enables the "[DEBUG] node = ..." print.
#   $i, $do       counters used by parseRSS(); main() declares its own
#                 "my $i", so the two counters are independent.
#   %META         filled by parseRSS() for every child element it visits
#                 (a later element with the same name overwrites an earlier
#                 one); main() reads 'copyright' and 'managingEditor'.
#   @nodes        result of $doc->findnodes("//*").
#   @HOME_DOMAINS links matching one of these get neither the
#                 ' target="_blank"' text nor the arrow suffix.
#   $VERSION and @row_color are assigned but not read by any visible
#   statement.
#
# main()
#   Walks the hash refs returned by parseRSS().
#   - Entries whose title starts with "ADV:" are skipped.
#   - The loop ends once the counter exceeds $max, when $max is non-zero.
#   - Description and title get a few literal substitutions and are then
#     passed through web_enc(); text for which isISO() returns 0 goes
#     through toISO() first.
#   - Entries whose '__nodeName' does not match /description/ take a separate
#     branch that ends with "$i--; next;", so they do not count against $max.
#   - The link text shown is the link without its http(s):// prefix and
#     without a leading "www.", cut to 60 characters plus '...'.
#
# parseRSS()
#   Returns a list of hash refs, one per node in @nodes that produced at
#   least one key.
#   - Only child nodes whose nodeType equals ELEMENT_NODE are considered.
#   - A child's value is the data of its first child node, or the string
#     'NULL' when it has none.
#   - Every such value is stored in %META. Values are added to the entry
#     hash only after the second 'title' element has been seen ($i > 1).
#   - '__nodeName' collects the names of the children that were stored.
#   - Defaults applied to the copy that is returned:
#       pubDate ''   title '* No title *'
#       description '* No description *'   category ''
#       "category\tdomain": a groups.google.ch search URL built from the
#       category, or '' when there is no category.
#
# web_enc($text)
#   Returns $text with every character whose ord() is above 127 replaced
#   by a decimal "&#N;" reference; one trailing space is removed.
#
# toISO($text)
#   Converts $text with utf8(...)->latin1 and back with latin1(...)->utf8.
#   Returns $text unchanged when the round trip differs from the input,
#   otherwise the latin1 result.
#
# isISO($text)
#   Same round trip as toISO(), run with $SIG{'__WARN__'} pointed at
#   alarm_handler(). Returns 1 when the round trip differs from the input
#   and 0 when it is identical.
#
# alarm_handler()
#   Empty handler; returns nothing.
#
# Usage()
#   Prints "Wrong usage." and calls exit(0).
#
# Review notes
#   NOTE(review): this copy looks damaged and should be restored from the
#     original source before it is run or edited:
#     * everything on the first line follows "#!", so perl treats the
#       whole line as a comment;
#     * many print arguments are empty strings, and several string literals
#       and regexes contain raw line breaks; presumably HTML markup and
#       entities were stripped - TODO confirm against the original;
#     * the statement beginning  print "› {'link'}  appears to have lost
#       its start, and $target is assigned but never printed;
#     * in parseRSS() the getAttributeNode("domain") branch has collapsed
#       into a comment and refers to $href, which is never declared;
#     * in main() the entry loop's closing brace comes before statements
#       that still use $hash_ref and $description.
#   NOTE(review): @HOME_DOMAINS entries and the omit_tags names are
#     interpolated into regexes unquoted, so the "." in a domain name
#     matches any character (\Q...\E would make the match literal).
#   NOTE(review): Usage() reports a usage error but exits with status 0.
#   NOTE(review): toISO() does not install the __WARN__ handler that
#     isISO() installs for the same conversion calls.
#   NOTE(review): the file-level $i and $do are never reset, so a second
#     call to parseRSS() would start with $do already set.