Aug 4th, 2003 under Programming Perl Gist
Converts text files to properly encoded HTML form.
#!/usr/bin/perl
#!/usr/local/bin/perl
# Copyright (C) 1998 Andrew Loree
# $Id: txt2html.pl,v 1.1.1.1 2003/10/05 21:51:29 andy Exp $
############################################################################
#
# txt2html - Converts ascii text file to HTML format.  It does so through
#            converting html entities i.e. &, <, >, " to their encoded
#            equivalents.
#
############################################################################
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
#
#############################################################################
# Version History:
# 1.0 - Initial Release
#############################################################################

use strict;
use warnings;

use Getopt::Std;    # cmd-line processing

my $ver = "1.0";

# Replacement strings already chosen for characters beyond ASCII 0x7f,
# keyed by character code.  Shared with to_ascii.
my (%replace_hash);

# Flags filled in by getopts().
our ($opt_h, $opt_b, $opt_p, $opt_e, $opt_c);

# Get the cmd-line options.  Usage is shown for an unknown switch, when no
# files are given, when -b and -p are combined, or when -b/-p lack -h.
my $options_ok = getopts('hbpec');

if (   !$options_ok
    || !@ARGV
    || ($opt_b && $opt_p)
    || (($opt_p || $opt_b) && !$opt_h))
{
    print <<"END_of_usage";
usage: txt2html [-h [-b | -p]] [-e] [-c] files
Version $ver
Formats plain ascii text files to be \"web ready\" by converting <,>,\", & , and
extended characters to their correct HTML format. The original file is
copied file.bak
-h Add HTML head and footer information, with the text as pre-formated
   data <PRE>.
-b If option -h is specified, text will not be considered pre-formated
   <PRE>, and all line break in the text will be rendered as <BR>.
-p If option -h is specified, text will not be considered pre-formated
   <PRE>, and try to guess paragraphs replacing them with a <P>.
-e Remove all extensions from the file, and add .html at the end
-c Convert all extended characters to their hex character reference.
Copyright (c) 1998, Andrew Loree
END_of_usage
    exit(-1);
}

main();

# Processes every file named on the command line:
#   1. moves the file to "<file>.bak" and writes the entity-encoded text
#      back under the original name;
#   2. with -h, wraps that text in an HTML skeleton;
#   3. with -e, renames the result to "<name without extensions>.html".
# Arguments that are not plain files are reported and skipped.
sub main {
    FILE:
    foreach my $filename (@ARGV) {
        if (!-f $filename) {
            print "Invalid filename $filename\n";
            next FILE;
        }

        # Keep the original as a backup and convert from it.
        my $backup = "$filename.bak";
        if (!rename($filename, $backup)) {
            warn "Unable to rename $filename to $backup\n$!\n";
            next FILE;
        }
        make_valid_chars($backup, ">$filename", $opt_c);    # Convert characters

        if ($opt_h) {
            # The temporary file lives beside the target so that the final
            # rename never has to cross a filesystem boundary.
            my $temp_file = "$filename.$$.tmp";
            make_html($filename, ">$temp_file", $opt_p, $opt_b);    # Add html stuff
            rename($temp_file, $filename)
                or die "Unable to rename $temp_file to $filename\n$!\n";
        }

        if ($opt_e) {
            # Drop every extension of the last path component (directory
            # names and a leading dot are left alone), then add .html.
            (my $new_filename = $filename) =~ s{(?<=[^/])\.[^/]*\z}{};
            $new_filename .= '.html';
            rename($filename, $new_filename)
                or warn "Unable to rename $filename to $new_filename\n$!\n";
        }
    }
    return;
}

# Opens an output file described by a two-argument-open style string.
# $_[0] - output spec: ">path" or ">>path"; a bare path is opened for
#         writing.
# Returns the opened filehandle; dies if the file cannot be opened.
sub _open_output {
    my ($spec) = @_;

    my ($mode, $path) = $spec =~ m{\A\s*(>>?)\s*(.*)\z}s
        ? ($1, $2)
        : ('>', $spec);

    open(my $handle, $mode, $path)
        or die "Unable to open the output file named $spec\n$!\n";
    return $handle;
}

# Add html header, footer information, and optionally linebreaking
# $_[0]   - input file
# $_[1]   - output file, given as ">path"
# [$_[2]] - optionally, whether to try to guess paragraphs
# [$_[3]] - optionally, replace all line breaks with <BR>
# With neither option set the text is copied unchanged inside <PRE>.
# Dies if either file cannot be opened or the output cannot be closed.
sub make_html {
    my ($in_path, $out_spec, $guess_paragraphs, $break_lines) = @_;

    open(my $in, '<', $in_path)
        or die "Unable to open the input file $in_path\n$!\n";
    my $out = _open_output($out_spec);

    # Print html header
    print {$out} <<"END_of_header";
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN">
<HTML>
<HEAD>
</HEAD>
<BODY>
END_of_header

    if (!$guess_paragraphs && !$break_lines) {    # Pre-formated text
        print {$out} "<PRE>\n";
        while (my $line = <$in>) {                # Copy the text
            print {$out} $line;
        }
        print {$out} "</PRE>\n";
    }
    else {                                        # Non-pre-formated
        # Length of the current run of empty lines.  Kept local so a run
        # at the end of one file cannot leak into the next file.
        my $blank_run = 0;

        while (my $line = <$in>) {
            if ($guess_paragraphs) {              # <P><BR> mode
                if ($line eq "\n") {
                    $blank_run++;
                    # First empty line opens a paragraph, the second is
                    # passed through untouched, later ones become <BR>.
                    if ($blank_run == 1) {
                        $line = "<P>\n";
                    }
                    elsif ($blank_run > 2) {
                        $line = "<BR>\n";
                    }
                }
                else {
                    $blank_run = 0;
                }
            }
            if ($break_lines) {                   # <BR> mode
                $line =~ s/\n/<BR>\n/;
            }
            print {$out} $line;                   # Write the output
        }
    }

    print {$out} "</BODY>\n</HTML>";              # Write footer info

    close($in);
    close($out)
        or die "Unable to close the output file named $out_spec\n$!\n";
    return;
}

# $_[0]   - input file
# $_[1]   - output file, given as ">path"
# [$_[2]] - Optionally, all extended characters are converted to their
#           hex character reference, otherwise prompt for replacement
#           string
# Strips trailing whitespace from every line and converts all
#   & to &amp;
#   < to &lt;
#   > to &gt;
#   " to &quot;
# Dies if either file cannot be opened or the output cannot be closed.
sub make_valid_chars {
    my ($in_path, $out_spec, $use_hex_refs) = @_;

    open(my $in, '<', $in_path)
        or die "Unable to open the input file $in_path\n$!\n";
    my $out = _open_output($out_spec);

    while (my $line = <$in>) {                    # Process the file
        $line =~ s/\s*$/\n/;                      # Strip right whitespaces
        $line =~ s/&/&amp;/g;                     # Ampersands first, so the
                                                  # entities added below are
                                                  # not escaped again
        $line =~ s/</&lt;/g;                      # Less than
        $line =~ s/>/&gt;/g;                      # Greater than
        $line =~ s/"/&quot;/g;                    # Quotes

        # Both helpers rewrite their argument in place.
        if ($use_hex_refs) {
            to_hex($line);                        # Convert to hex reference
        }
        else {
            to_ascii($line);                      # Ask for a replacement
        }
        print {$out} $line;                       # Write the output
    }

    close($in);
    close($out)
        or die "Unable to close the output file named $out_spec\n$!\n";
    return;
}

# Checks $_[0] for characters beyond 0x7f
# If there are any, they are converted to their hex
# character reference e.g. &#xa9;
sub to_hex {
    # $_[0] - text to convert; it is modified in place through @_.
    # Every byte in the range 0x80-0xff is replaced by its hexadecimal
    # character reference (lower-case digits, e.g. 0xa9 becomes &#xa9;).
    # Text without such bytes is left untouched.
    $_[0] =~ s/([\x80-\xff])/sprintf('&#x%x;', ord($1))/ge;
    return;
}

# Checks $_[0] for characters beyond 0x7f
# If there are any, %replace_hash is first checked for
# previous input, else it prompts for an appropriate action
#
# $_[0] - text to convert; it is modified in place through @_.
# Answers are remembered in %replace_hash under the character code, so
# each distinct character is asked about only once.  An empty answer is
# a valid choice (it deletes the character) and is remembered as well.
sub to_ascii {
    # A single pass, so that high-bit bytes inside an answer are never
    # mistaken for input characters and replaced a second time.
    $_[0] =~ s{([\x80-\xff])}{
        my $char = $1;
        my $code = ord($char);

        if (!exists $replace_hash{$code}) {    # Get one, and save it
            print "Invalid ASCII character found\n";
            print "ASCII($code) = $char\n";
            print "What would you like to replace it with? ";

            my $answer = <STDIN>;
            $answer = '' if !defined $answer;  # End of input: empty answer
            chomp($answer);
            $replace_hash{$code} = $answer;    # Save the replacement
        }
        $replace_hash{$code};
    }ge;
    return;
}