cvs commit: www/test/lfs lfs2rss.pl

jeroen at linuxfromscratch.org jeroen at linuxfromscratch.org
Tue Aug 5 09:39:25 PDT 2003


jeroen      03/08/05 10:39:25

  Added:       test     lfs2rss.pl
               test/images rss.png
  Removed:     test/blfs blfs2rss.pl
               test/lfs lfs2rss.pl
  Log:
  Script changes, now for real.
  
  Revision  Changes    Path
  1.1                  www/test/lfs2rss.pl
  
  Index: lfs2rss.pl
  ===================================================================
  #!/usr/bin/perl -w
  # Author: Rob Park <rbpark at ualberta.ca>
  # License: GNU General Public License
  
  use XML::RSS;
  use Getopt::Long;
  use Pod::Usage;
  use strict;
  
  # make a new XML::RSS object, requesting version 2.0 of the format
  my $RSS = XML::RSS->new(version => '2.0');
  
  # default arguments for commandline switches; GetOptions overwrites
  # these entries with whatever the user supplies
  my %args = 
  ( 
  	"help" => undef,
  	"man" => undef,
  	"news-file" => "news.html",
  	"rss-file" => "feed.rss"
  );
  
  # parse commandline options, display help if needed; GetOptions returns
  # false on an unknown or malformed option, in which case we print the
  # usage message and exit with status 2 instead of carrying on
  GetOptions(\%args, 'help|?', 'man', 'news-file=s', 'rss-file=s')
  	or pod2usage(2);
  pod2usage(1) if $args{help};
  pod2usage(-exitstatus => 0, -verbose => 2) if $args{man};
  
  # news items collected while parsing, one hash reference per item
  my @items;
  
  # channel metadata; link is only the site root here and is extended
  # once the page has been identified during parsing
  my %channel = 
  (
  	link => "http://test.linuxfromscratch.org/",
  	language => "en-us"
  );
  
  # Escape the XML-special characters &, < and > in a string.
  #
  # Takes a single string and returns an escaped copy; the caller's value
  # is not modified. An ampersand that already starts an entity reference
  # (named like &amp;, decimal like &#38; or hexadecimal like &#x26;) is
  # left alone so that existing entities are not escaped twice.
  # An undefined argument is returned unchanged.
  sub sanitize
  {
  	my $string = shift;
  	return $string unless defined $string;
  
  	# the ampersand must be handled first, otherwise the entities
  	# produced for < and > below would themselves be re-escaped
  	$string =~ s/&(?!(?:\w+|#\d+|#x[0-9a-fA-F]+);)/&amp;/g;
  	$string =~ s/</&lt;/g;
  	$string =~ s/>/&gt;/g;
  	return $string;
  }
  
  # open the HTML file and tell the user what's going on; a lexical handle
  # and three-argument open keep the file name from being interpreted as a
  # mode, and a failure to open aborts instead of yielding an empty feed
  print "Opening $args{'news-file'} for parsing...\n";
  open my $html_fh, '<', $args{'news-file'}
  	or die "Cannot open $args{'news-file'} for reading: $!\n";
  
  # parse the HTML file one line at a time; every pattern below has to
  # match within a single input line
  while (my $line = <$html_fh>)
  {
  	# figure out what page we're working on: the class of the <body> tag
  	# becomes the directory part of the channel link
  	if ($line =~ m/<body id="body" class="(.+)">/)
  	{
  		$channel{link} .= "$1/$args{'news-file'}";
  	}
  
  	# extract the title of the page
  	if ($line =~ m/<title>(.+)<\/title>/)
  	{
  		$channel{title} = sanitize($1);
  	}
  	
  	# extract the description from the meta tags
  	if ($line =~ m/<meta name="description" content="(.+)"\s?\/>/)
  	{
  		$channel{description} = sanitize($1);
  	}
  	
  	# extract the news items: each <h3 id="..."> heading starts one item
  	if (my ($anchor, $heading) = $line =~ m/<h3 id="(.+)">(.+)<\/h3>/)
  	{
  		my %item;
  		$item{link} = "$channel{link}#$anchor";
  		$item{title} = sanitize($heading);
  		
  		# base URL for relative links: $channel{link} without
  		# $args{'news-file'} on the end of it
  		(my $cwdir = $channel{link}) =~ s/\Q$args{'news-file'}\E\z//;
  	
  		# skip two lines down, to the first <p> tag
  		my $description = <$html_fh>;
  		$description = <$html_fh>;
  
  		# the heading may be the last thing in the file
  		$description = '' unless defined $description;
  
  		# keep only the paragraph content, escaped for the feed
  		$description =~ s/^.*<p>(.+)<\/p>.*$/sanitize($1)/e;
  
  		# make relative href/src targets absolute; [^"]+ stops at the
  		# closing quote so several attributes on one line are each
  		# rewritten separately
  		$description =~ s/(href|src)="((?!http|ftp|mailto)[^"]+)"/$1="$cwdir$2"/g;
  		chomp $description;
  		$item{description} = $description;
  
  		# add the newly parsed news item to the list of news items
  		push @items, \%item;
  	}
  }
  
  close $html_fh;
  
  # hand the collected channel metadata over to the RSS object
  $RSS->channel(%channel);
  
  # append each collected news item to the feed, in the order collected
  foreach my $news_item (@items)
  {
  	$RSS->add_item(%$news_item);
  }
  
  # tell the user what's going on, then write the feed to the target file
  print "Saving RSS feed to $args{'rss-file'}...\n";
  $RSS->save($args{'rss-file'});
  
  __END__
  
  =head1 NAME
  
  lfs2rss.pl - parse the LFS website and convert it into an RSS feed
  
  =head1 SYNOPSIS
  
  lfs2rss.pl [options]
  
  =head1 OPTIONS
  
  =over 8
  
  =item B<-h, --help>
  
  Print this help message.
  
  =item B<-m, --man>
  
  Output more verbose help in the form of a man page.
  
  =item B<-n, --news-file>
  
  Define the location of the HTML file to parse. Defaults to C<./news.html>.
  
  =item B<-r, --rss-file>
  
  Define the location of the RSS file to output. Defaults to C<./feed.rss>.
  
  =back
  
  =head1 VERSION
  
  $Id: lfs2rss.pl,v 1.1 2003/08/05 16:39:25 jeroen Exp $
  
  =head1 REPORTING BUGS
  
  Report bugs to <rbpark at ualberta.ca>.
  
  =cut
  
  
  
  1.1                  www/test/images/rss.png
  
  	<<Binary file>>
  
  



More information about the website mailing list