rss_2_0_simple.pl
Download: → /perl/ files/rss 2 0 simple pl.txt
#!/usr/bin/perl -w
use strict;
$| = 1;
################################
# rss_2_0.pl
################################
use XML::GDOME;
# -- GLOBAL
my $DEBUG = 0;
my $doc = XML::GDOME->createDocFromURI($ARGV[0]);
my @nodes = $doc->findnodes("//*");
my $do = my $i = 0;
foreach my $node (@nodes) {
my @childs = $node->childNodes;
foreach my $child (@childs) {
if($child->nodeType == ELEMENT_NODE) {
my $data = defined $child->firstChild() ?
$child->firstChild()->data : 'NULL';
if ($child->nodeName eq 'title') {
$i++;
$do = 1 if $i > 1;
}
next unless $do;
chomp $data;
print "[DEBUG] node = '", $child->nodeName, "'\n" if $DEBUG;
if ($child->nodeName eq 'title') {
print '<b>', $data, '</b>', "\n";
}
elsif ($child->nodeName eq 'link') {
print '<a href="', $data, '">', $data, '"</a>', "\n";
}
elsif ($child->nodeName eq 'description') {
print $data, '<br />', "\n";
}
elsif ($child->nodeName eq 'pubDate') {
print '<i>(', $data, ')<br /></i>';
print "<hr noshade>\n\n";
}
else {
print $data, '<br />';
}
}
}
}
__END__
rss_2_0.pl
Download: → /perl/ files/rss 2 0 pl.txt
#!/usr/bin/perl -w
use strict;
$| = 1;
################################
# rss_2_0.pl
#
# URL: http://www.infocopter.com/perl/rss.html
# Location: quartus
#
# Usage:
# rss_2_0.pl [Options] rssinput.xml "3" >output_top3.html
#
my $VERSION = '0.05.02';
################################
use XML::GDOME;
use Unicode::String qw(latin1 utf8); # For Umlaute problems -> web_enc / toISO
use Getopt::Long;
# -- GLOBAL
my $DEBUG = 0;
my $do = my $i = 0;
my %META = ();
# -- Don't make a target _blank to this domains:
my @HOME_DOMAINS = qw(infocopter.com pgate.net);
my @getopt_args = (
'd', # debug
'debug' , # debug mode for development support
'omit_tags=s' , # e.g. img,a,h1
'h', # help
'proxy=s', # proxy host
'v', # Verbose mode
);
my %Options;
Getopt::Long::config("noignorecase", "bundling");
print "\n";
&Usage() unless GetOptions(\%Options, @getopt_args);
if ($Options{'d'} or $Options{'debug'}) {
foreach (keys %Options) { print "- $_ = \"$Options{$_}\"\n"; }
}
$Options{'omit_tags'} ||= '';
my $doc = XML::GDOME->createDocFromURI($ARGV[0] || '/var/www/html/rss/xml/linuxjournal.xml');
my @nodes = $doc->findnodes("//*");
&main();
###############################################
sub main() {
###############################################
my @row_color = ();
$row_color[0] = 'white';
$row_color[1] = '#e9e9e9';
#print '<table cellspacing="0" cellpadding="3" border="0" style="border-color:black;">';
print '<tr><td>', scalar localtime, '<br /></td></tr>';
my $i = 0;
my $max = $ARGV[1] || 0;
foreach (&parseRSS()) {
my $hash_ref = $_;
next if $hash_ref->{'title'} =~ /^ADV:/; # Skip advertisting
last if (++$i > $max) && $max;
(my $description = $hash_ref->{'description'}) =~ s/^<p>//gi;
$description =~ s/<\/p>$//gi;
$description =~ s/<\/p>/<br \/><br \/>/gi;
$description =~ s/<p>//gi;
$description =~ s/’/'/g;
$description =~ s/é/\é/g;
$description =~ s/è/\è/g;
if (&isISO($description)) {
$description = &web_enc($description);
}
else {
$description = &web_enc(&toISO($description));
}
my @omit_tag_arr = split /,/, $Options{'omit_tags'};
foreach (@omit_tag_arr) {
$description =~ s/\<$_/\<$_/g;
}
if ($hash_ref->{'__nodeName'} !~ /description/) {
# Something else, probably the disclaimer section but to be generic we catch all ;-)
print '<tr bgcolor="white">',
'<td style="border-bottom-style:dashed; ',
'border-bottom-color:#cccccc; border-bottom-width:1px;">';
if (defined $hash_ref->{'url'}) {
print '<a target="_blank" href="', $hash_ref->{'link'}, '">',
'<img border="0" src="', $hash_ref->{'url'}, '" alt="',
$hash_ref->{'title'}, '"></a>';
}
else {
foreach (keys %{$hash_ref}) {
print "$_ = ", $hash_ref->{$_}, '<br />';
}
}
print '</td></tr>';
$i--;
next;
}
else {
print '<tr bgcolor="', $row_color[$i % 2],
'"><td style="border-bottom-style:dashed; ',
'border-bottom-color:#cccccc; border-bottom-width:1px;">';
}
my $title = $hash_ref->{'title'};
$title =~ s/’/'/g;
$title =~ s/é/\é/g;
$title =~ s/è/\è/g;
if (&isISO($title)) {
$title = &web_enc($title);
}
else {
$title = &web_enc(&toISO($title));
}
print '<span style="font-size:16px;font-weight:bold">', $title, '</span><br />',
'<span style="font-size:10px">', $hash_ref->{'pubDate'}, '</span><br />',
$description, '<br /> <br />', "\n";
if ($hash_ref->{'category'}) {
print '› <a href="', $hash_ref->{"category\tdomain"}, '">',
$hash_ref->{'category'}, '</a> ';
}
(my $link_disp = $hash_ref->{'link'}) =~ s/^https?:\/\///i;
$link_disp =~ s/^www\.//i;
if (length($link_disp) > 60) {
$link_disp = substr($link_disp, 0, 60) . '...';
}
my $target = ' target="_blank"'; # default linking
my $uarr = ' ↑';
foreach (@HOME_DOMAINS) {
if ($hash_ref->{'link'} =~ /$_/) {
$target = $uarr = ''; last;
}
}
print "› <a$target href=\"", $hash_ref->{'link'}, '">', $link_disp, "</a>$uarr<br />";
print '</td></tr>', "\n\n";
}
my $meta_row = '';
if (defined $META{'copyright'}) {
$meta_row = $META{'copyright'};
}
if (defined $META{'managingEditor'}) {
$meta_row .= "<br><a href=\"mailto:$META{'managingEditor'}\">Managing Editor</a>";
}
if ($meta_row) {
print '<tr bgcolor="white"><td style="border-bottom-style:dashed; border-width:0;">';
# foreach (keys %META) { print "$_ = $META{$_}<br />"; }
print '<br /><i>', $meta_row, '</i>';
print '</td></tr>';
}
#print '</table>';
}
###############################################
sub parseRSS() {
###############################################
my @results = ();
foreach my $node (@nodes) {
my @childs = $node->childNodes;
my %hash = ();
foreach my $child (@childs) {
next unless $child->nodeType == ELEMENT_NODE;
my $data = defined $child->firstChild() ?
$child->firstChild()->data : 'NULL';
$META{$child->nodeName} = $data;
if ($child->nodeName eq 'title') {
$i++;
$do = 1 if $i > 1;
}
next unless $do;
chomp $data;
print "[DEBUG] node = '", $child->nodeName, "'\n" if $DEBUG;
$hash{$child->nodeName} = $data;
$hash{'__nodeName'} .= $child->nodeName . '<br />';
if (defined $child->getAttributeNode("domain")) {
# -- <category domain="http://...
my $href = $child->getAttributeNode("domain");
$hash{$child->nodeName . "\tdomain"} = $href->getValue();
}
}
# -- Put an atomic RSS entry to the array and empty the hash
my %foohash = %hash;
$foohash{'pubDate'} ||= ''; # * No date *
$foohash{'title' } ||= '* No title *';
$foohash{'description'} ||= '* No description *';
$foohash{'category'} ||= ''; # empty!
my $categ_url = $foohash{'category'} ?
'http://groups.google.ch/groups?hl=de&q=' . $foohash{'category'} : '';
$foohash{"category\tdomain"} ||= $categ_url;
push(@results, \%foohash) if %hash;
%hash = ();
}
@results;
}
sub web_enc ($) {
my $enc = '';
for (my $i = 0; $i < length($_[0]); $i++) {
my $ordno = ord substr($_[0], $i, 1);
$enc .= $ordno > 127 ? sprintf("&#%d;", $ordno) : substr($_[0], $i, 1);
}
$enc =~ s/ $//;;
$enc;
}
sub toISO($) {
my $text = $_[0];
# if this host was UTF-8 encoded:
my $text_iso = (utf8($text))->latin1;
my $text_utf8 = (latin1($text_iso))->utf8; # reverse check
if ($text ne $text_utf8) {
#print STDERR "Unequal reverse check! It seems your input data \"$text\" is ",
# "ISO encoded already, so you don't need the latin1 encoding stuff here!\n";
return $text;
}
$text_iso;
}
sub isISO ($) {
my $text = $_[0];
local $SIG{'__WARN__'} = \&alarm_handler; # install signal handler
my $text_iso = (utf8($text))->latin1;
my $text_utf8 = (latin1($text_iso))->utf8; # reverse check
return $text eq $text_utf8 ? 0 : 1;
}
sub alarm_handler () {
#print STDERR "alarm catched!\n";
return;
}
sub Usage() {
print "Wrong usage.\n";
exit(0);
}
__END__