r442 - scripts/trunk

lizardo at linuxfromscratch.org lizardo at linuxfromscratch.org
Tue Jul 12 20:04:17 PDT 2005


Author: lizardo
Date: 2005-07-12 21:04:17 -0600 (Tue, 12 Jul 2005)
New Revision: 442

Added:
   scripts/trunk/lfs2rss.pl
Log:
Added original lfs2rss.pl from www.


Added: scripts/trunk/lfs2rss.pl
===================================================================
--- scripts/trunk/lfs2rss.pl	2005-07-13 03:01:52 UTC (rev 441)
+++ scripts/trunk/lfs2rss.pl	2005-07-13 03:04:17 UTC (rev 442)
@@ -0,0 +1,146 @@
+#!/usr/bin/perl -w
+# Author: Rob Park <rbpark at ualberta.ca>
+# License: GNU General Public License
+
+# Modified by Anderson Lizardo <lizardo at linuxfromscratch.org>
+# 2004-12-29: changed encoding to ISO-8859-1
+# 2004-12-12: added BUGS section
+# 2004-03-09: fixed news item URL creation
+
+use XML::RSS;
+use Getopt::Long;
+use Pod::Usage;
+use strict;
+
+# make a new XML::RSS object (direct method call instead of the ambiguous indirect "new Class" form)
+my $RSS = XML::RSS->new(version => '2.0', encoding => 'ISO-8859-1');
+
+# default arguments for commandline switches
+my %args =
+(
+	"help" => undef,
+	"man" => undef,
+	"news-file" => "news.html",
+	"rss-file" => "feed.rss"
+);
+
+# parse commandline options; print usage and exit with status 2 on an invalid option
+GetOptions(\%args, 'help|?', 'man', 'news-file=s', 'rss-file=s') or pod2usage(2);
+pod2usage(1) if $args{help};
+pod2usage(-exitstatus => 0, -verbose => 2) if $args{man};
+
+my @items;	# one hash reference per news item, filled in by the parsing loop
+my %channel =	# channel metadata; the page path is appended to link during parsing
+(
+	link => "http://www.linuxfromscratch.org/",
+	language => "en-us"
+);
+
+sub sanitize	# returns a copy of its argument with &, < and > escaped as XML entities
+{
+	my $string = shift;
+	$string =~ s/&(?!#?\w+;)/&amp;/g;	# leave existing entities (&copy;, &#160;) untouched
+	$string =~ s/</&lt;/g;
+	$string =~ s/>/&gt;/g;
+	return $string;
+}
+
+# open the HTML file (3-arg form: the name cannot be read as a mode); die if it cannot be opened.
+#print "Opening $args{'news-file'} for parsing...\n";
+open my $html, '<', $args{'news-file'} or die "Cannot open $args{'news-file'}: $!\n";
+
+# parse the HTML file line by line; each tag of interest must sit on a single line
+while (<$html>)
+{
+	# figure out what page we're working on: the body's class value becomes a path component of the link
+	m/<body[^>]*\bclass="([^"]+)"/
+		&& ($channel{link} .= "$1/$args{'news-file'}");
+
+	# extract the title of the page
+	m/<title>(.+)<\/title>/
+		&& ($channel{title} = sanitize($1));
+
+	# extract the description from the meta tags
+	m/<meta name="description" content="(.+)"\s?\/>/
+		&& ($channel{description} = sanitize($1));
+
+	# extract the news items
+	if (m/<h3 id="([^"]+)"><a[^>]+>(.+)<\/a><\/h3>/)
+	{
+		my %item;
+		$item{link} = "$channel{link}#$1";
+		$item{title} = sanitize($2);
+
+		# base URL for relative links: $channel{link} without $args{'news-file'} on the end of it
+		(my $cwdir = $channel{link}) =~ s/\Q$args{'news-file'}\E\z//;
+
+		# skip two lines down, to the first <p> tag (empty description if the file ends first)
+		$item{description} = <$html>;
+		$item{description} = <$html>;
+		$item{description} = '' unless defined $item{description};
+		$item{description} =~ s/^.*<p>(.+)<\/p>.*$/sanitize($1)/e;
+		$item{description} =~ s/(href|src)="((?!http|ftp|mailto)[^"]+)"/$1="$cwdir$2"/g;
+		chomp $item{description};
+
+		# add the newly parsed news item to the list of news items
+		push @items, \%item;
+	}
+}
+
+# copy the collected channel metadata into the RSS object
+$RSS->channel(%channel);
+
+# append every parsed news item, in the order it was found
+foreach my $entry (@items)
+{
+	$RSS->add_item(%$entry);
+}
+
+# write the feed to the requested output file (the progress message is commented out).
+#print "Saving RSS feed to $args{'rss-file'}...\n";
+$RSS->save($args{'rss-file'});
+
+__END__
+
+=head1 NAME
+
+lfs2rss.pl - parse the LFS website and convert it into an RSS feed
+
+=head1 SYNOPSIS
+
+lfs2rss.pl [options]
+
+=head1 OPTIONS
+
+=over 8
+
+=item B<-h, --help>
+
+Print this help message.
+
+=item B<-m, --man>
+
+Output more verbose help in the form of a man page.
+
+=item B<-n, --news-file>
+
+Define the location of the HTML file to parse. Defaults to C<./news.html>.
+
+=item B<-r, --rss-file>
+
+Define the location of the RSS file to output. Defaults to C<./feed.rss>.
+
+=back
+
+=head1 BUGS
+
+Due to an inherent limitation of the lfs2rss.pl script, news items should not
+have newlines inside <p>...</p> tags (at least not in the first paragraph).
+Such limitation can be avoided by using an XML/HTML parser instead of reading
+the HTML file line by line.
+
+=head1 REPORTING BUGS
+
+Report bugs to <rbpark at ualberta.ca>.
+
+=cut


Property changes on: scripts/trunk/lfs2rss.pl
___________________________________________________________________
Name: svn:executable
   + *




More information about the website mailing list