Aug 4th, 2003 under Programming Perl Gist
Converts text files to properly encoded HTML form.
#!/usr/bin/perl
#!/usr/local/bin/perl
# Copyright (C) 1998 Andrew Loree
# $Id: txt2html.pl,v 1.1.1.1 2003/10/05 21:51:29 andy Exp $
############################################################################
#
# txt2html - Converts an ASCII text file to HTML format by replacing the
# characters that are special in HTML (&, <, >, ") with their encoded
# entity equivalents.
#
############################################################################
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
#
#############################################################################
# Version History:
# 1.0 - Initial Release
#############################################################################
$ver = "1.0";
use Getopt::Std; # cmd-line processing
my(%replace_hash); # Remembered replacements for characters above 0x7f (used by to_ascii)
# Get the cmd-line options. getopts() stores each switch in the package
# variable $opt_<letter> ($opt_h, $opt_b, $opt_p, $opt_e, $opt_c), which is
# where main() reads them from. It returns false when an unknown switch was
# given, so that case is treated as a usage error instead of being ignored.
my $options_ok = getopts('hbpec');
# Usage errors: unknown switch, no files, -b together with -p, or -b/-p
# without -h.
if (!$options_ok || !@ARGV || ($opt_b && $opt_p) || (($opt_p || $opt_b) && !$opt_h)){
    print <<END_of_usage;
usage: txt2html [-h [-b | -p]] [-e] [-c] files
Version $ver
Formats plain ascii text files to be \"web ready\" by converting <,>,\", &
, and extended characters to their correct HTML format. The original file
is copied file.bak
-h Add HTML head and footer information, with the text as
pre-formated data <PRE>.
-b If option -h is specified, text will not be considered
pre-formated <PRE>, and all line break in the text
will be rendered as <BR>.
-p If option -h is specified, text will not be considered
pre-formated <PRE>, and try to guess paragraphs replacing
them with a <P>.
-e Remove all extensions from the file, and add .html at the end
-c Convert all extended characters to their hex character
reference.
Copyright (c) 1998, Andrew Loree
END_of_usage
    exit(-1);
}
main();
# main()
# Processes every file named in @ARGV:
#   1. renames FILE to FILE.bak and writes the entity-encoded text back to
#      FILE (make_valid_chars);
#   2. with -h ($opt_h), wraps FILE in an HTML skeleton (make_html);
#   3. with -e ($opt_e), renames FILE so that all extensions are dropped
#      and ".html" is appended.
# Arguments that are not plain files are reported and skipped. A failed
# rename is reported on STDERR and the remaining steps that depend on it
# are skipped.
sub main{
    foreach my $arg (@ARGV){ # Process the files
        my $filename = $arg; # Work on a copy so @ARGV itself is never modified
        if (!-f $filename){
            print "Invalid filename $filename\n";
            next;
        }
        if (!rename($filename, "$filename.bak")){
            warn "Unable to rename $filename to $filename.bak: $!\n";
            next;
        }
        make_valid_chars("$filename.bak", ">$filename", $opt_c); # Convert characters
        if ($opt_h){
            # The temporary file lives next to the target: rename() cannot
            # move a file across filesystems, and a fixed name under /tmp
            # is predictable.
            my $temp_file = "$filename.tmp.$$";
            make_html($filename, ">$temp_file", $opt_p, $opt_b); # Add html stuff
            rename($temp_file, $filename)
                or warn "Unable to rename $temp_file to $filename: $!\n";
        }
        if ($opt_e){ # Drop all ext add html
            # Only the last path component is examined, so dots in directory
            # names (e.g. "./notes.txt") are left alone.
            my ($dir, $base) = $filename =~ m{\A(.*/)?([^/]*)\z}s;
            $dir = '' unless defined $dir;
            # Strip from the first dot that is not the leading character,
            # so a hidden file such as ".profile" keeps its name.
            $base =~ s/(?<=.)\..*\z//s;
            my $new_filename = "$dir$base.html";
            if ($new_filename ne $filename){
                rename($filename, $new_filename)
                    or warn "Unable to rename $filename to $new_filename: $!\n";
            }
        }
    }
}
# make_html(INPUT, OUTPUT [, GUESS_PARAGRAPHS [, BREAK_LINES]])
# $_[0] - input file name
# $_[1] - output file, given as a two-argument open() specification that
#         includes the mode, e.g. ">out.html"
# [$_[2]] - optionally, whether to try to guess paragraphs: the first blank
#           line of a run becomes <P>, the second is left as is, and the
#           third and later become <BR>
# [$_[3]] - optionally, replace all line breaks with <BR>
# Copies the input to the output, adding the HTML header and footer. When
# neither option is set the text is wrapped in <PRE>...</PRE> unchanged.
# Dies if either file cannot be opened or the output cannot be closed.
sub make_html{
    my ($in_path, $out_spec, $guess_paragraphs, $break_lines) = @_;
    # Three-argument open: the input name is used literally, never as a
    # mode or pipe specification.
    open(my $in, '<', $in_path) ||
        die "Unable to open the input file $in_path\n$!\n";
    open(my $out, $out_spec) ||
        die "Unable to open the output file named $out_spec\n$!\n";
    # Print html header
    print {$out} <<END_of_header;
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN">
<HTML>
<HEAD>
</HEAD>
<BODY>
END_of_header
    if (!$guess_paragraphs && !$break_lines){ # Pre-formated text
        print {$out} "<PRE>\n";
        while (my $line = <$in>){ # Copy the text
            print {$out} $line;
        }
        print {$out} "</PRE>\n";
    }
    else{ # Non-pre-formated
        # Length of the current run of blank lines. It is local to this
        # call so that one file cannot affect the next.
        my $blank_run = 0;
        while (my $line = <$in>){
            if ($guess_paragraphs){ # <P><BR> mode
                if ($line eq "\n"){
                    $blank_run++;
                    $line = "<P>\n" if $blank_run == 1;
                    $line = "<BR>\n" if $blank_run > 2;
                }
                else{
                    $blank_run = 0;
                }
            }
            if ($break_lines){ # <BR> mode
                $line =~ s/\n/<BR>\n/;
            }
            print {$out} $line; # Write the output
        }
    }
    print {$out} "</BODY>\n</HTML>"; # Write footer info
    close($in);
    # Buffered write errors only surface when the handle is closed.
    close($out) ||
        die "Unable to finish writing the output file named $out_spec\n$!\n";
}
# make_valid_chars(INPUT, OUTPUT [, USE_HEX])
# $_[0] - input file name
# $_[1] - output file, given as a two-argument open() specification that
#         includes the mode, e.g. ">out.txt"
# [$_[2]] - Optionally, all extended characters (0x80-0xff) are converted to
#           their hex character reference (to_hex); otherwise the user is
#           prompted for a replacement string (to_ascii)
# Copies the input to the output line by line, replacing trailing
# whitespace with a single newline and converting
#   &  to  &amp;
#   <  to  &lt;
#   >  to  &gt;
#   "  to  &quot;
# Dies if either file cannot be opened or the output cannot be closed.
sub make_valid_chars {
    my ($in_path, $out_spec, $use_hex) = @_;
    # Three-argument open: the input name is used literally, never as a
    # mode or pipe specification.
    open(my $in, '<', $in_path) ||
        die "Unable to open the input file $in_path\n$!\n";
    open(my $out, $out_spec) ||
        die "Unable to open the output file named $out_spec\n$!\n";
    while (my $line = <$in>){ # Process the file
        $line =~ s/\s*$/\n/; # Strip right whitespaces
        # Ampersands go first; otherwise the "&" of the entities produced
        # below would be encoded a second time.
        $line =~ s/&/&amp;/g; # Replace the ampersands
        $line =~ s/</&lt;/g; # Less than
        $line =~ s/>/&gt;/g; # Greater than
        $line =~ s/"/&quot;/g; # Quotes
        if ($line =~ /[\x80-\xff]/){ # Only lines with characters beyond 0x7f need work
            if ($use_hex){
                to_hex($line); # Convert to hex reference
            }
            else{
                to_ascii($line); # Ask the user for a replacement
            }
        }
        print {$out} $line; # Write the output
    }
    close($in);
    # Buffered write errors only surface when the handle is closed.
    close($out) ||
        die "Unable to finish writing the output file named $out_spec\n$!\n";
}
# to_hex(LINE)
# Checks $_[0] for characters in the range 0x80-0xff. Each one is replaced,
# in place, by its hexadecimal character reference in lower-case hex,
# e.g. the character 0xe9 becomes &#xe9;
# $_[0] is modified through the @_ alias, so the caller must pass a
# variable. Characters up to 0x7f are left untouched.
sub to_hex{
    # One pass over the line; the replacement text is computed per match.
    $_[0] =~ s{([\x80-\xff])}{sprintf('&#x%x;', ord($1))}ge;
    return;
}
# to_ascii(LINE)
# Checks $_[0] for characters in the range 0x80-0xff. For each distinct
# character found, %replace_hash (keyed by the character's numeric code) is
# checked for a previous answer; if there is none, the user is prompted on
# STDOUT and the answer is read from STDIN and saved in %replace_hash.
# Every occurrence of the character in $_[0] is then replaced, in place,
# with that answer. $_[0] is modified through the @_ alias, so the caller
# must pass a variable.
# An empty answer (or end of input on STDIN) means "remove the character";
# it is remembered like any other answer.
sub to_ascii{
    return unless $_[0] =~ /[\x80-\xff]/; # Check for any chars above 0x7f
    # Distinct character codes, in order of first appearance.
    my %seen;
    my @codes = grep { !$seen{$_}++ } map { ord($_) } $_[0] =~ /([\x80-\xff])/g;
    foreach my $code (@codes){
        my $char = chr($code);
        # exists() rather than truth: "" and "0" are valid saved answers.
        if (!exists $replace_hash{$code}){ # Get one, and save it
            print "Invalid ASCII character found\n";
            print "ASCII($code) = $char\n";
            print "What would you like to replace it with? ";
            my $answer = <STDIN>;
            $answer = '' unless defined $answer; # End of input
            chomp($answer);
            $replace_hash{$code} = $answer; # Save the replacement
        }
        my $replacement = $replace_hash{$code};
        $_[0] =~ s/\Q$char\E/$replacement/g;
    }
    return;
}