#!/usr/pkg/bin/perl
#
# unhtml - strip HTML tags from a file or from standard input.
#
# Usage: unhtml [infile [outfile]]
#
# Reads the whole input into memory, removes HTML markup with a few
# regexes, and writes the result. See the POD at the end of this file.

use strict;
use warnings;

my $infile  = shift;
my $outfile = shift;

# Test for defined/non-empty rather than truth, so that a file literally
# named "0" is not mistaken for a missing argument.
$infile  = "-"      unless defined $infile  && length $infile;
$outfile = "STDOUT" unless defined $outfile && length $outfile;

my $text_ref = _slurp_file($infile);
_strip_html($text_ref);
_burp_file( $outfile, $text_ref );

# _strip_html( \$text )
#
# Removes HTML markup, in place, from the string that $text_ref points to.
# Works in 3 stages, adapted from http://www.perlmonks.org/?node_id=46815
# Returns nothing.
sub _strip_html {
    my $text_ref = shift;

    # Stage 1: remove comments. /s lets a comment span several lines.
    $$text_ref =~ s/<!--.*?-->//gs;

    # Stage 2: inside a tag, collapse everything up to and including the
    # first quoted attribute value down to a bare "<". Quoted values may
    # legally contain ">", so they must go before the tags themselves are
    # matched. Both quote styles are handled in the same loop so that a tag
    # mixing '...' and "..." attributes is reduced completely; the loop
    # repeats until no quoted value is left in any tag.
    1 while $$text_ref =~ s/<(?!--)[^'">]*(?:"[^"]*"|'[^']*')/</g;

    # Stage 3: what is left of each tag contains no quotes; remove it.
    $$text_ref =~ s/<(?!--)[^'">]*>//g;

    return;
}

# _slurp_file( $infile )
#
# Reads all of $infile and returns a reference to the text. The name "-"
# means standard input. Dies if the file cannot be opened.
sub _slurp_file {
    my $infile = shift;

    my $fh;
    if ( $infile eq "-" ) {
        $fh = \*STDIN;
    }
    else {
        # Three-argument open: the path is never interpreted as a mode
        # or as a command to pipe from.
        open( $fh, '<', $infile )
            or die "Unable to open $infile in _slurp_file: $!\n";
    }

    # Undefine the input record separator locally to read everything at once.
    my $text = do { local $/; <$fh> };

    # Guard against undef so the later substitutions do not warn.
    $text = '' unless defined $text;

    return \$text;
}

# _burp_file( $outfile, \$text )
#
# Writes the referenced text to $outfile. The name "STDOUT" means standard
# output. Dies if the file cannot be opened, written, or closed.
sub _burp_file {
    my $outfile  = shift;
    my $text_ref = shift;

    if ( $outfile eq "STDOUT" ) {
        print $$text_ref;
        return;
    }

    open( my $fh, '>', $outfile )
        or die "Unable to open $outfile in _burp_file: $!\n";
    print {$fh} $$text_ref
        or die "Unable to write $outfile in _burp_file: $!\n";

    # Buffered write errors only surface at close, so check it.
    close($fh)
        or die "Unable to close $outfile in _burp_file: $!\n";

    return;
}

=head1 NAME

unhtml - strip HTML tags from text

=head1 SYNOPSIS

unhtml is a perl script that strips HTML tags from text.

=head1 VERSION

This documentation describes version 1.3 of unhtml

=head1 DESCRIPTION

Uses a few regexes to do the real work of stripping HTML tags; this is
not the best solution, but works in most cases, and is free of any
module dependencies.

You can specify command line file arguments - standard input/output is
assumed if no args are given. If only one arg is given, it is assumed
to be the input pathname.

=head1 USAGE

Examples (the following have equivalent results):

=over 4

=item unhtml < foo.html > foo.txt

=item unhtml foo.html > foo.txt

=item unhtml foo.html foo.txt

=back

=head1 REQUIRED ARGUMENTS

None. Acts as a STDIN/STDOUT pipe with no arguments.

=head1 OPTIONS

None.

=head1 LICENSE

Copyright (c) 2010, 2011 slugmax@sdf.org

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3 of the License, or (at
your option) any later version.

This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.

=cut