cvs commit: www/test manage_news.pl

jeroen at linuxfromscratch.org jeroen at linuxfromscratch.org
Mon Sep 1 12:28:59 PDT 2003


jeroen      03/09/01 13:28:59

  Added:       test     manage_news.pl
  Log:
  Add Anderson Lizardo's news archiving script
  
  Revision  Changes    Path
  1.1                  www/test/manage_news.pl
  
  Index: manage_news.pl
  ===================================================================
  #!/usr/bin/perl
  # manage_news.pl - Script for news management: archiving, last news publishing
  
  # Copyright (C) 2003 Anderson Lizardo
  # 
  # This program is free software; you can redistribute it and/or modify
  # it under the terms of the GNU General Public License as published by
  # the Free Software Foundation; either version 2 of the License, or
  # (at your option) any later version.
  #
  # This program is distributed in the hope that it will be useful,
  # but WITHOUT ANY WARRANTY; without even the implied warranty of
  # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  # GNU General Public License for more details.
  #
  # You should have received a copy of the GNU General Public License
  # along with this program; if not, write to the Free Software
  # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
  
  use warnings;
  use strict;
  
  use File::Path qw(mkpath);
  use File::Spec qw(splitpath);
  use Getopt::Long;
  use MIME::Parser;
  use HTML::Parser;
  use URI;
  use Pod::Usage;
  
  my $my_version = "0.1";
  
  my $help = 0;
  my $man = 0;
  my $version = 0;
  my $infile = "";
  my $archive_under = "";
  my $top = "";
  my $bottom = "";
  
  GetOptions(
      "help" => \$help,
      "man" => \$man,
      "version" => \$version,
      "infile|i=s" => \$infile,
      "archive-under|a=s" => \$archive_under,
      "top|t=s" => \$top,
      "bottom|b=s" => \$bottom,
  ) or pod2usage(1);
  
  pod2usage(1) if $help;
  pod2usage(-exitstatus => 0, -verbose => 2) if $man;
  
  if ($version) {
      print "manage_news.pl $my_version\n" .
      "Copyright (C) 2003 Anderson Lizardo\n";
      exit 0;
  }
  
  sub Get_header {
      my ($header, $name) = @_;
      # Extract the field from the header
      my $hdr_content = $header->get($name);
      # Die if the field is not found
      if (!defined($hdr_content)) {
          die "Could not find header field " . $name . "\n" .
          "Header contents:\n" . $header->as_string . "\n";
      }
      $hdr_content =~ s/^\s*//;
      $hdr_content =~ s/\s*$//;
      $hdr_content =~ s/\n$//;
      return $hdr_content;
  }
  
  my $parser = new MIME::Parser;
  # These options disable the use of temporary files
  $parser->output_to_core(1);
  $parser->tmp_to_core(1);
  $parser->use_inner_files(1);
  
  my $buffer = "";
  my $entity;
  
  # Read the input file if one is given; otherwise, read from STDIN
  if ($infile) {
      eval { $entity = $parser->parse_open($infile) } or pod2usage("$0: $@");
  }
  else {
      $entity = $parser->parse(\*STDIN);
  }
  
  my $base_url = "";
  
  if ($archive_under) {
      my %archive_files = ();
      foreach my $part ($entity->parts) {
          # Date field must be in YYYY/MM/DD format
          # It also avoids possible cross-site scripting
          if (Get_header($part->head, "date") !~ /^\d{4}\/\d{2}\/\d{2}$/) {
              die "Invalid date: " . Get_header($part->head, "date") . "\n" .
              "News item header:\n" . $part->head->as_string . "\n";
          }
          # Split year and month, and store the destination archive location
          # ($archive_under/YYYY/MM.html)
          my ($year, $month, undef) = split /\//, Get_header($part->head, "date");
          my $archive_file = $archive_under . "/" . $year . "/" . $month . ".html";
          $archive_files{$archive_file} = 1;
      }
      foreach my $archive_file (keys %archive_files) {
          # Create the destination archive location
          my (undef, $dir, undef) = File::Spec->splitpath($archive_file);
          eval { mkpath($dir) unless -d $dir } or die "Could not create " . $dir . ": $@\n";
          # If a top template is given, prepend it to the output
          if ($top) {
              open(NEWS, ">$archive_file") or die "Could not open $archive_file" . ": $!\n";
              open(TOP, $top) or die "Could not open " . $top . ": $!\n";
              print NEWS <TOP>;
              close TOP;
              close NEWS;
          }
      }
      foreach my $part ($entity->parts) {
          my ($year, $month, undef) = split /\//, Get_header($part->head, "date");
          my $section = lc(Get_header($entity->head, "section"));
          my $archive_file = $archive_under."/".$year."/".$month.".html";
          my $archive_url;
          if ($section =~ /^general$/) {
              $archive_url = "http://linuxfromscratch.org/news/".$year."/".$month.".html";
              $base_url = "http://linuxfromscratch.org/news/";
          }
          else {
              $archive_url = "http://linuxfromscratch.org/news/".$section."/".$year."/".$month.".html";
              $base_url = "http://linuxfromscratch.org/".$section."/";
              $archive_url =~ s/\.\.\///g;
              $base_url =~ s/\.\.\///g;
          }
          open(NEWS, ">>$archive_file") or die "Could not open $archive_file" . ": $!\n";
          # Output the actual XHTML code
          print NEWS "\t<h3 id=\"" . Get_header($part->head, "id") . "\">" .
          "<a href=\"$archive_url#" . Get_header($part->head, "id") . "\">".
          Get_header($part->head, "title") . "</a></h3>\n" .
          "\t\t<h4>" . Get_header($part->head, "author") . " - " .
          Get_header($part->head, "date") . "</h4>\n";
  
          # Use the HTML::Parser and URI modules to resolve relative links
          my $html_p = HTML::Parser->new(api_version => 3,
              start_h => [\&handle_StartTag, "tagname, attr"],
              end_h => [\&handle_EndTag, "tagname"],
              text_h => [\&handle_Text, "dtext" ],
          );
          $html_p->parse($part->bodyhandle->as_string);
          print NEWS $buffer . "\n\n";
          $buffer = "";
          close NEWS;
      }
      foreach my $archive_file (keys %archive_files) {
          # If a bottom template is given, append it to the output
          if ($bottom) {
              open(NEWS, ">>$archive_file") or die "Could not open $archive_file" . ": $!\n";
              open(BOTTOM, $bottom) or die "Could not open " . $bottom . ": $!\n";
              print NEWS <BOTTOM>;
              close BOTTOM;
              close NEWS;
          }
      }
  }
  else {
      my $count = 0;
      if (Get_header($entity->head, "section") =~ /^general$/i) {
          print <<EOF;
  <p><a href="#header">Back to the top.</a></p>
  <h2 id="generalnews">General news</h2>
  EOF
      }
      foreach my $part ($entity->parts) {
          last if $count++ == 5;
          my ($year, $month, undef) = split /\//, Get_header($part->head, "date");
          my $section = lc(Get_header($entity->head, "section"));
          my $archive_url;
          if ($section =~ /^general$/) {
              $archive_url = "http://linuxfromscratch.org/news/".$year."/".$month.".html";
              $base_url = "http://linuxfromscratch.org/news/";
          }
          else {
              $archive_url = "http://linuxfromscratch.org/news/".$section."/".$year."/".$month.".html";
              $base_url = "http://linuxfromscratch.org/".$section."/";
              $archive_url =~ s/\.\.\///g;
              $base_url =~ s/\.\.\///g;
          }
          # Output the actual XHTML code
          print "\t<h3 id=\"" . Get_header($part->head, "id") . "\">" .
          "<a href=\"$archive_url#" . Get_header($part->head, "id") . "\">".
          Get_header($part->head, "title") . "</a></h3>\n" .
          "\t\t<h4>" . Get_header($part->head, "author") . " - " .
          Get_header($part->head, "date") . "</h4>\n";
  
          # Use the HTML::Parser and URI modules to resolve relative links
          my $html_p = HTML::Parser->new(api_version => 3,
              start_h => [\&handle_StartTag, "tagname, attr"],
              end_h => [\&handle_EndTag, "tagname"],
              text_h => [\&handle_Text, "dtext" ],
          );
          $html_p->parse($part->bodyhandle->as_string);
          print $buffer . "\n\n";
          $buffer = "";
      }
      if (Get_header($entity->head, "section") =~ /^general$/i) {
          print <<EOF;
  <p><a href="#header">Back to the top.</a></p>
  <h2 id="changelog">Latest CVS changes:</h2>
  EOF
      }
  }
  
  print STDERR "Errors: " . $parser->results->errors . "\n";
  print "- " . $_ foreach ($parser->results->errors);
  
  print STDERR "Warnings: " . $parser->results->warnings . "\n";
  print "- " . $_ foreach ($parser->results->warnings);
  
  ##########################
  # HTML parser subroutines
  
  sub handle_StartTag {
      my ($tag, $attrs) = @_;
  
      if ($tag eq "a" and $$attrs{"href"}) {
          my $uri = URI->new($$attrs{"href"})->abs($base_url);
          $$attrs{"href"} = $uri->as_string;
      }
      $buffer .= "<$tag";
      $buffer .= " $_=\"$$attrs{$_}\"" foreach (keys %$attrs);
      $buffer .= ">";
  }
  
  sub handle_EndTag {
      my ($tag) = @_;
  
      if ($buffer =~ /<$tag[^>]*>$/) {
          $buffer =~ s/>$/ \/>/;
      }
      else {
          $buffer .= "</$tag>";
      }
  }
  
  sub handle_Text {
      my ($text) = @_;
  
      $text =~ s/\&/\&/g;
      $text =~ s/</\</g;
      $text =~ s/>/\>/g;
      $buffer .= $text;
  }
  
  __END__
  
  =head1 NAME
  
  manage_news.pl - Script for news management: archiving, last news publishing
  
  =head1 SYNOPSIS
  
  manage_news.pl  [--help|--man|--version]  [-t top.html]  [-b bottom.html]  [-a output_dir] [-i html_file]
  
      Options:
          --infile|-i         Parse MIME news database from given file
          --top|-t            Prepend top.html to the output
          --bottom|-b         Append bottom.html to the output
          --archive-under|-a  Output news under output_dir/YYYY/MM.html
          --help              Show brief help message
          --man               Full documentation
          --version           Show program version
  
  =head1 DESCRIPTION
  
  B<manage_news.pl> is a script for news management, including archiving and
  last news publishing. It parses a MIME format news database and extracts news
  items from it.
  
  =head1 OPTIONS
  
  =over
  
  =item B<--infile mime_db>
  
  Specify a MIME news database to parse. By default, B<manage_news.pl> reads
  the MIME database from standard input.
  
  =item B<--top top.html, --bottom bottom.html>
  
  These options are useful for template insertion. C<--top> prepends the
  given file to the output, and C<--bottom> appends it.
  
  =item B<--archive-under output_dir>
  
  Output news under F<output_dir/[section/]YYYY/MM.html>, where YYYY and MM are numeric
  values for year and month, respectively. By default, B<manage_news.pl>
  outputs the five last news to standard output.
  
  =item B<--help>
  
  Print a brief help message and exits.
  
  =item B<--man>
  
  Print the manual page and exits.
  
  =item B<--version>
  
  Show program version and exits.
  
  =back
  
  =head1 TODO
  
  =over
  
  =item * Allow date selection by ranges (like "<2003/08/22" or
  "2003/08/20-2003/08/22")
  
  =back
  
  =head1 AUTHOR
  
  Copyright (C) 2003 Anderson Lizardo <andersonlizardo at yahoo.com.br>
  
  This program is free software; you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation; either version 2 of the License, or
  (at your option) any later version.
  
  This program is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  GNU General Public License for more details.
  
  =cut
  
  
  
  



More information about the website mailing list