Mineplex/.FILES USED TO GET TO WHERE WE ARE PRESENTLY/xampp/perl/bin/xml_spellcheck
Daniel Waggner 76a7ae65df PUUUUUSH
2023-05-17 14:44:01 -07:00

246 lines
6.3 KiB
Perl

#!perl -w
use strict;
use XML::Twig;
use Getopt::Long;
use Pod::Usage;
use File::Temp qw{tempfile};
# Defaults used when the matching command-line option is not given.
my $DEFAULT_SC  = 'aspell -c';   # spell checker command line
my $DEFAULT_PP  = 'indented';    # pretty_print style for a bare --pretty_print
my $DEFAULT_EXT = '.bak';        # suffix appended to the file name for the backup
my $VERSION     = "0.02";

my ( $spellchecker, $ext, $attributes, $exclude_elements,
     $include_elements, $pretty_print, $version, $help, $man);

# Multi-word options are accepted with either '_' or '-' as the separator:
# the options were registered with underscores while the error message
# below (and --backup-extension) spell them with dashes.
GetOptions( 'spellchecker=s'                      => \$spellchecker,
            'backup-extension|backup_extension=s' => \$ext,
            'attributes'                          => \$attributes,
            'exclude_elements|exclude-elements=s' => \$exclude_elements,
            'include_elements|include-elements=s' => \$include_elements,
            'pretty_print|pretty-print:s'         => \$pretty_print,
            'version'                             => \$version,
            'help'                                => \$help,
            'man'                                 => \$man,
          ) or pod2usage(-verbose => 1, -exitval => -1);

pod2usage( -verbose => 1, -exitval => 0) if $help;
pod2usage( -verbose => 2, -exitval => 0) if $man;
if( $version) { print "$0 version $VERSION\n"; exit; }

# option processing
$spellchecker ||= $DEFAULT_SC;
$ext          ||= $DEFAULT_EXT;

# --attributes is parsed but nothing in this script acts on it: tell the
# user instead of silently ignoring the request.
warn "--attributes is not implemented yet, attribute values are not spell checked\n"
  if $attributes;

if( $exclude_elements && $include_elements)
  { die "cannot use both --exclude-elements and --include-elements\n"; }

# 'pretty_print:s' takes an optional value; when the option is given
# without one the variable is defined but false, and the default style
# is used.
if( defined $pretty_print and !$pretty_print)
  { $pretty_print= $DEFAULT_PP; }

# Options handed to XML::Twig->new for every file processed.
my %twig_options;

# The element lists are whitespace-separated.  split ' ' is used rather
# than split /\s+/ because it discards leading whitespace: the regexp form
# would return an empty first field and register a handler under an empty
# element name.
if( $exclude_elements)
  { my @exclude_elts = split ' ', $exclude_elements;
    my %start_tag_handlers= map { $_ => \&exclude_elt } @exclude_elts;
    $twig_options{start_tag_handlers}= \%start_tag_handlers;
  }

if( $include_elements)
  { my @include_elts = split ' ', $include_elements;
    my %start_tag_handlers= map { $_ => \&include_elt } @include_elts;
    $twig_options{start_tag_handlers}= \%start_tag_handlers;
  }

$twig_options{pretty_print}= $pretty_print if( $pretty_print);
# Main loop: each file named on the command line is processed in turn.
#   1. the file is parsed and every selected text node is written to a
#      temporary file as one "<id>:<text>" line (see process_text),
#   2. the spell checker is run on that temporary file,
#   3. the (possibly corrected) lines are read back and stored in the
#      matching text nodes,
#   4. the original file is renamed with the backup extension and the
#      updated document is written under the original name.
# Any failure dies; up to step 3 the original file has not been touched.
foreach my $file (@ARGV)
  {
    my $id= 0;        # id of the last text node written to the temp file
    my $id2elt= {};   # id => '#SC' wrapper element created by process_text

    # The temp file is created in the current directory.  UNLINK asks
    # File::Temp to remove it at program exit, so it does not stay behind
    # when one of the die()s below fires.
    my( $tmp_fh, $tmp_file) = tempfile( "xml_spellcheck_XXXX",
                                        SUFFIX => '.txt',
                                        UNLINK => 1,
                                      );

    my $t= XML::Twig->new( keep_encoding => 1, %twig_options);
    $t->parsefile( $file);

    foreach my $elt ($t->descendants( '#TEXT'))
      {
        # '#include' / '#exclude' are set by the start tag handlers on the
        # elements listed on the command line; inherit_att also looks at
        # the ancestors of the text node.
        if(    (!$include_elements and !$exclude_elements)
            or ($include_elements and $elt->inherit_att( '#include'))
            or ($exclude_elements and !$elt->inherit_att( '#exclude'))
          )
          { $id++;
            process_text( $t, $elt, $id, $id2elt, $tmp_fh);
          }
      }

    # buffered write errors only show up when the handle is closed
    close $tmp_fh or die "cannot write temp file $tmp_file: $!";

    # single string form on purpose: $spellchecker may include options
    system( "$spellchecker $tmp_file") ==0
      or die "$spellchecker $tmp_file failed: $?";

    open( my $checked_fh, '<', $tmp_file)
      or die "cannot open temp file $tmp_file: $!";

    while( my $line= <$checked_fh>)
      { chomp $line;
        next if $line eq '';   # a blank line carries no record

        my( $line_id, $text)= split /:/, $line, 2;

        # Every line must start with an id that was written out and has not
        # been seen yet.  Entries are deleted once used so that a
        # duplicated line is reported too.
        my $wrap= defined $text ? delete $id2elt->{$line_id} : undef;
        $wrap
          or die "unexpected line $. in spell checked file $tmp_file: $line\n";

        $text=~ s{<\\n>}{\n}g;   # restore the newlines encoded by process_text

        my $text_elt= $wrap->first_child or die "internal error 100\n";
        if( $text_elt->gi eq '#PCDATA')
          { $text_elt->set_pcdata( $text); }
        elsif( $text_elt->gi eq '#CDATA')
          { $text_elt->set_cdata( $text); }
        else
          { die "internal error 101\n"; }

        $wrap->erase;   # drop the temporary wrapper, keep the text node
      }
    close $checked_fh;
    unlink $tmp_file;   # do not let temp files pile up while looping over files

    rename( $file, "$file$ext") or die "cannot save backup file $file$ext: $!";

    # 3-arg open: the file name comes from the command line and must not be
    # interpreted as part of the open mode
    open( my $out_fh, '>', $file)
      or die "cannot save spell checked file $file: $!";
    $t->print( $out_fh);
    close $out_fh or die "cannot save spell checked file $file: $!";
  }
# Handler installed (through start_tag_handlers) for the elements named in
# --include_elements.  Sets the private '#include' attribute on the element
# found in $_ and returns the result of set_att.
sub include_elt
  { my $elt= $_;
    return $elt->set_att( '#include', 1);
  }
# Handler installed (through start_tag_handlers) for the elements named in
# --exclude_elements.  Sets the private '#exclude' attribute on the element
# found in $_ and returns the result of set_att.
sub exclude_elt
  { my $elt= $_;
    return $elt->set_att( '#exclude', 1);
  }
# process_text( $t, $elt, $id, $id2elt, $tmp_fh)
#
# Prepares one text node for spell checking:
#   - wraps $elt in a '#SC' element and records that wrapper in the
#     $id2elt hash ref under $id,
#   - prints the node's text to $tmp_fh as a single "<id>:<text>" line,
#     every newline in the text being replaced by the 4 characters <\n>.
# The first argument ($t) is accepted but not used.
# Returns the result of the print.
sub process_text
  { my( undef, $text_node, $key, $wrapper_for, $out)= @_;

    my $wrapper= $text_node->wrap_in( '#SC');
    $wrapper_for->{$key}= $wrapper;

    # one record per line: newlines inside the text have to be encoded
    ( my $flat= $text_node->text) =~ s{\n}{<\\n>}g;

    return print {$out} $key . ':' . $flat . "\n";
  }
__END__
=head1 NAME
xml_spellcheck - spellcheck XML files
=head1 SYNOPSIS
xml_spellcheck [options] <files>
=head1 DESCRIPTION
xml_spellcheck lets you spell check the content of an XML file.
It extracts the text (the content of elements and optionally of
attributes), calls a spell checker on it and then recreates the
XML document.
=head1 OPTIONS
Note that all options can be abbreviated to the first letter
=over 4
=item --conf <configuration_file>
Gets the options from a configuration file. NOT IMPLEMENTED YET.
=item --spellchecker <spellchecker>
The command to use for spell checking, including any options.
By default C<aspell -c> is used.
=item --backup-extension <extension>
By default the original file is saved with a C<.bak> extension. This option
changes the extension
=item --attributes
Spell check attribute content. By default attribute values are NOT
spell checked. NOT YET IMPLEMENTED
=item --exclude_elements <list_of_excluded_elements>
A whitespace-separated list of elements that should not be spell checked
=item --include_elements <list_of_included_elements>
A whitespace-separated list of elements that should be spell checked (by
default all elements are spell checked).
C<--exclude_elements> and C<--include_elements> are mutually exclusive
=item --pretty_print <optional_pretty_print_style>
A pretty print style for the document, as defined in XML::Twig. If
the option is provided without a value then the C<indented> style is
used
=item --version
Display the tool version and exit
=item --help
Display help message and exit
=item --man
Display longer help message and exit
=back
=head1 EXAMPLES
=head1 BUGS
=head1 TODO
=over 4
=item --conf option
=item --attributes option
=back
=head1 PRE-REQUISITE
XML::Twig, Getopt::Long, Pod::Usage, File::Temp
XML::Twig requires XML::Parser.
=head1 SEE ALSO
XML::Twig
=head1 COPYRIGHT AND DISCLAIMER
This program is Copyright 2003 by Michel Rodriguez
This program is free software; you can redistribute it and/or modify
it under the terms of the Perl Artistic License or the GNU General
Public License as published by the Free Software Foundation either
version 2 of the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MER-
CHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
Public License for more details.
If you do not have a copy of the GNU General Public License write to
the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139,
USA.
=head1 AUTHOR
Michel Rodriguez <mirod@xmltwig.com>
xml_spellcheck is available at http://www.xmltwig.com/xmltwig/