Convert Text File To Html Txt2html

Aug 4th, 2003 under Programming Perl Gist
Converts text files to properly encoded HTML form.
#!/usr/bin/perl
#!/usr/local/bin/perl
# Copyright (C) 1998 Andrew Loree
# $Id: txt2html.pl,v 1.1.1.1 2003/10/05 21:51:29 andy Exp $
############################################################################
#
#  txt2html - Converts ascii text file to HTML format. It does so through
#             converting html entities i.e. &, <, >, " to their encoded
#             equivalents.
#
############################################################################
#
#  This program is free software; you can redistribute it and/or
#  modify it under the terms of the GNU General Public License
#  as published by the Free Software Foundation; either version 2
#  of the License, or (at your option) any later version.
#
#  This program is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#
#  You should have received a copy of the GNU General Public License
#  along with this program; if not, write to the Free Software
#  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
#
#############################################################################
# Version History:
#   1.0  - Initial Release
#############################################################################
$ver = "1.0";              # Version string shown in the usage message
use Getopt::Std;           # cmd-line processing
my(%replace_hash);         # For characters beyond ASCII val of 0x7f:
                           # byte value => replacement text typed by the user


# Get the cmd-line options. getopts() stores each switch in the package
# global $opt_<letter> (read by the subs below) and returns false when it
# meets a switch it does not know.
my $options_ok = getopts('hbpec');

# Refuse to touch any file unless the invocation is fully valid: the files
# are rewritten in place, so a mistyped switch must not be ignored.
#   - unknown switch, or no files given
#   - -b and -p together
#   - -b or -p without -h
if (!$options_ok
    || !@ARGV
    || ($opt_b && $opt_p)
    || (($opt_p || $opt_b) && !$opt_h)) {
  print <<END_of_usage;
usage: txt2html [-h [-b | -p]] [-e] [-c] files
       Version $ver
Formats plain ascii text files to be \"web ready\" by converting <,>,\", &
, and extended characters to their correct HTML format. The original file
is copied file.bak

 -h     Add HTML head and footer information, with the text as
        pre-formated data <PRE>.
 -b     If option -h is specified, text will not be considered
        pre-formated <PRE>, and all line break in the text
        will be rendered as <BR>.
 -p     If option -h is specified, text will not be considered
        pre-formated <PRE>, and try to guess paragraphs replacing
        them with a <P>.
 -e     Remove all extensions from the file, and add .html at the end
 -c     Convert all extended characters to their hex character
        reference.

Copyright (c) 1998, Andrew Loree
END_of_usage
  exit(-1);
}

# Explicit call with parentheses: the old "&main;" form silently forwards
# the caller's @_ to the sub.
main();


# Processes every path in @ARGV in place, driven by the command-line
# switches held in $opt_c, $opt_h, $opt_p, $opt_b and $opt_e:
#   1. the original is renamed to <file>.bak
#   2. <file> is recreated from the backup with the HTML-special
#      characters encoded (make_valid_chars)
#   3. with -h, <file> is wrapped in an HTML skeleton (make_html)
#   4. with -e, every extension of <file> is replaced by .html
# Arguments that are not plain files are reported and skipped. A failed
# rename is reported on STDERR and the rest of that file's steps are skipped.
sub main {
  foreach my $filename (@ARGV) {                   # Process the files
    unless (-f $filename) {
      print "Invalid filename $filename\n";
      next;
    }

    my $backup = "$filename.bak";
    unless (rename($filename, $backup)) {
      warn "Unable to rename $filename to $backup: $!\n";
      next;
    }
    make_valid_chars($backup, ">$filename", $opt_c);   # Convert characters

    if ($opt_h) {
      # Build the page next to its target: the final rename then never
      # crosses a filesystem boundary, and no predictable name in the
      # shared /tmp directory is used.
      my $temp_file = "$filename.$$.tmp";
      make_html($filename, ">$temp_file", $opt_p, $opt_b);   # Add html stuff
      unless (rename($temp_file, $filename)) {
        warn "Unable to rename $temp_file to $filename: $!\n";
        next;
      }
    }

    if ($opt_e) {                                  # Drop all ext, add .html
      # Only the last path component is edited, so dots in directory names
      # ("../notes.txt", "./a.b/file") are left alone. The look-behind
      # keeps the leading dot of a hidden file such as ".profile".
      my ($dir, $base) = $filename =~ m{\A(.*/)?([^/]*)\z}s;
      $dir = '' unless defined $dir;
      $base =~ s/(?<=.)\..*\z//s;
      my $new_filename = "$dir$base.html";

      if ($new_filename ne $filename) {
        rename($filename, $new_filename)
          or warn "Unable to rename $filename to $new_filename: $!\n";
      }
    }
  }
  return;
}

# Wraps a text file in a minimal HTML 4.0 Transitional document.
#
# $_[0]   - path of the input file
# $_[1]   - output file in the form the callers already use: the path
#           prefixed with its open mode (">file" or ">>file"); a value
#           without such a prefix is taken as a path and opened with ">"
# [$_[2]] - optionally, whether to try to guess paragraphs: the first blank
#           line of a run becomes <P>, the second is kept as is, and every
#           further one becomes <BR>
# [$_[3]] - optionally replace all line breaks with <BR>
# When neither option is set the text is copied unchanged inside <PRE>.
# Dies when a file cannot be opened or the output cannot be closed cleanly.
sub make_html {
  my ($in_path, $out_spec, $guess_paragraphs, $break_lines) = @_;

  # Split the mode off the output spec so that three-argument open can be
  # used; the path itself is then never interpreted as an open mode.
  my ($out_mode, $out_path) = ($out_spec =~ /\A(>>?)(.*)\z/s)
                            ? ($1, $2)
                            : ('>', $out_spec);

  open(my $in, '<', $in_path)
    or die "Unable to open the input file $in_path\n$!\n";
  open(my $out, $out_mode, $out_path)
    or die "Unable to open the output file named $out_spec\n$!\n";

  # Print html header
  print $out <<END_of_header;
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN">
<HTML>
<HEAD>
</HEAD>
<BODY>
END_of_header

  if (!$guess_paragraphs && !$break_lines) {    # Pre-formated text
    print $out "<PRE>\n";
    while (my $line = <$in>) {                  # Copy the text
      print $out $line;
    }
    print $out "</PRE>\n";
  }
  else {                                        # Non-pre-formated
    # Length of the current run of blank lines. Kept local to this call so
    # that one file's trailing blank lines cannot affect the next file.
    my $blank_run = 0;

    while (my $line = <$in>) {
      if ($guess_paragraphs) {                  # <P><BR> mode
        if ($line eq "\n") {
          $blank_run++;
          if ($blank_run == 1) {
            $line = "<P>\n";
          }
          elsif ($blank_run > 2) {
            $line = "<BR>\n";
          }
        }
        else {
          $blank_run = 0;
        }
      }
      if ($break_lines) {                       # <BR> mode
        $line =~ s/\n/<BR>\n/;
      }
      print $out $line;                         # Write the output
    }
  }

  print $out "</BODY>\n</HTML>";                # Write footer info

  close($in);
  # Buffered write errors (e.g. a full disk) only surface at close.
  close($out)
    or die "Unable to finish writing the output file named $out_spec\n$!\n";
  return;
}


# Copies a text file line by line, encoding the characters that are
# special in HTML.
#
# $_[0]   - path of the input file
# $_[1]   - output file in the form the callers already use: the path
#           prefixed with its open mode (">file" or ">>file"); a value
#           without such a prefix is taken as a path and opened with ">"
# [$_[2]] - Optionally, all extended characters are converted to their
#           hex character reference (to_hex), otherwise prompt for a
#           replacement string (to_ascii)
# Converts all & to &amp;
#              < to &lt;
#              > to &gt;
#              " to &quot;
# Trailing whitespace is stripped and every line written ends in "\n".
# Dies when a file cannot be opened or the output cannot be closed cleanly.
sub make_valid_chars {
  my ($in_path, $out_spec, $use_hex_refs) = @_;

  # Split the mode off the output spec so that three-argument open can be
  # used; the path itself is then never interpreted as an open mode.
  my ($out_mode, $out_path) = ($out_spec =~ /\A(>>?)(.*)\z/s)
                            ? ($1, $2)
                            : ('>', $out_spec);

  open(my $in, '<', $in_path)
    or die "Unable to open the input file $in_path\n$!\n";
  open(my $out, $out_mode, $out_path)
    or die "Unable to open the output file named $out_spec\n$!\n";

  while (my $line = <$in>) {                # Process the file
    $line =~ s/\s*\z/\n/;                   # Strip right whitespaces
    $line =~ s/&/&amp;/g;                   # Ampersands first, so the
                                            # entities added below are not
                                            # encoded a second time
    $line =~ s/</&lt;/g;                    # Less than
    $line =~ s/>/&gt;/g;                    # Greater than
    $line =~ s/"/&quot;/g;                  # Quotes

    # Both helpers edit $line in place through their @_ alias. The flag
    # selects hex references, matching the -c switch description; the
    # earlier code had the two branches swapped.
    if ($use_hex_refs) {
      to_hex($line);                        # Convert to hex reference
    }
    else {
      to_ascii($line);                      # Ask for a replacement
    }
    print $out $line;                       # Write the output
  }

  close($in);
  # Buffered write errors (e.g. a full disk) only surface at close.
  close($out)
    or die "Unable to finish writing the output file named $out_spec\n$!\n";
  return;
}


# Replaces every character of $_[0] in the range 0x80-0xff with its
# hexadecimal character reference, e.g. 0xe9 becomes &#xe9;
#
# $_[0] - the text to convert. It is edited in place through the @_ alias,
#         so the caller's variable changes. Nothing is returned.
# Text without such characters is left untouched.
sub to_hex {
  # One pass over the text; no working variables are left behind in the
  # package namespace.
  $_[0] =~ s/([\x80-\xff])/sprintf("&#x%x;", ord($1))/ge;
  return;
}

# Replaces every character of $_[0] in the range 0x80-0xff with text
# chosen by the user.
#
# $_[0] - the text to convert. It is edited in place through the @_ alias,
#         so the caller's variable changes. Nothing is returned.
#
# %replace_hash (character value => replacement) remembers each answer.
# A character is asked about only once, even when the answer is "" or "0".
# When STDIN is at end of file the answer is taken to be the empty string.
sub to_ascii {
  # Step 1: make sure an answer exists for each distinct extended
  # character on this line, asking in order of first appearance.
  my %seen;
  foreach my $code (grep { !$seen{$_}++ }
                    map  { ord($_) } $_[0] =~ /[\x80-\xff]/g) {
    next if exists $replace_hash{$code};    # Answered before

    my $char = chr($code);
    print "Invalid ASCII character found\n";
    print "ASCII($code) = $char\n";
    print "What would you like to replace it with? ";
    my $answer = <STDIN>;
    $answer = '' unless defined $answer;    # EOF on STDIN
    chomp($answer);
    $replace_hash{$code} = $answer;         # Save the replacement
  }

  # Step 2: substitute in a single pass, so a replacement that itself
  # contains an extended character is not replaced again.
  $_[0] =~ s/([\x80-\xff])/$replace_hash{ord($1)}/ge;
  return;
}
Andy Loree

Randomness

Home

Running

About

Convert Text File To Html Txt2html