4487 lines
139 KiB
Perl
4487 lines
139 KiB
Perl
package HTML::Element;
|
|
|
|
# ABSTRACT: Class for objects that represent HTML elements
|
|
|
|
use strict;
|
|
use warnings;
|
|
|
|
our $VERSION = '5.07'; # VERSION from OurPkgVersion
|
|
|
|
use Carp ();
|
|
use HTML::Entities ();
|
|
use HTML::Tagset ();
|
|
use integer; # vroom vroom!
|
|
|
|
# This controls encoding entities on output.
|
|
# When set entities won't be re-encoded.
|
|
# Defaulting off because parser defaults to unencoding entities
|
|
our $encoded_content = 0;
|
|
|
|
use vars qw($html_uc $Debug $ID_COUNTER $VERSION %list_type_to_sub);
|
|
|
|
# Set up support for weak references, if possible:
|
|
my $using_weaken;
|
|
|
|
#=head1 CLASS METHODS
|
|
|
|
|
|
sub Use_Weak_Refs {
|
|
my $self_or_class = shift;
|
|
|
|
if (@_) { # set
|
|
$using_weaken = !! shift; # Normalize boolean value
|
|
Carp::croak("The installed Scalar::Util lacks support for weak references")
|
|
if $using_weaken and not defined &Scalar::Util::weaken;
|
|
|
|
no warnings 'redefine';
|
|
*_weaken = $using_weaken ? \&Scalar::Util::weaken : sub ($) {};
|
|
} # end if setting value
|
|
|
|
return $using_weaken;
|
|
} # end Use_Weak_Refs
|
|
|
|
BEGIN {
|
|
# Attempt to import weaken from Scalar::Util, but don't complain
|
|
# if we can't. Also, rename it to _weaken.
|
|
require Scalar::Util;
|
|
|
|
__PACKAGE__->Use_Weak_Refs(defined &Scalar::Util::weaken);
|
|
}
|
|
|
|
sub import {
|
|
my $class = shift;
|
|
|
|
for (@_) {
|
|
if (/^-(no_?)?weak$/) {
|
|
$class->Use_Weak_Refs(not $1);
|
|
} else {
|
|
Carp::croak("$_ is not exported by the $class module");
|
|
}
|
|
}
|
|
} # end import
|
|
|
|
|
|
$Debug = 0 unless defined $Debug;
|
|
|
|
#=head1 SUBROUTINES
|
|
|
|
|
|
sub Version {
|
|
Carp::carp("Deprecated subroutine HTML::Element::Version called");
|
|
$VERSION;
|
|
}
|
|
|
|
my $nillio = [];
|
|
|
|
*HTML::Element::emptyElement = \%HTML::Tagset::emptyElement; # legacy
|
|
*HTML::Element::optionalEndTag = \%HTML::Tagset::optionalEndTag; # legacy
|
|
*HTML::Element::linkElements = \%HTML::Tagset::linkElements; # legacy
|
|
*HTML::Element::boolean_attr = \%HTML::Tagset::boolean_attr; # legacy
|
|
*HTML::Element::canTighten = \%HTML::Tagset::canTighten; # legacy
|
|
|
|
# Constants for signalling back to the traverser:
|
|
my $travsignal_package = __PACKAGE__ . '::_travsignal';
|
|
my ( $ABORT, $PRUNE, $PRUNE_SOFTLY, $OK, $PRUNE_UP )
|
|
= map { my $x = $_; bless \$x, $travsignal_package; }
|
|
qw(
|
|
ABORT PRUNE PRUNE_SOFTLY OK PRUNE_UP
|
|
);
|
|
|
|
|
|
## Comments from Father Chrysostomos RT #58880
|
|
## The sole purpose for empty parentheses after a sub name is to make it
|
|
## parse as a 0-ary (nihilary?) function. I.e., ABORT+1 should parse as
|
|
## ABORT()+1, not ABORT(+1). The parentheses also tell perl that it can
|
|
### be inlined.
|
|
##Deparse is really useful for demonstrating this:
|
|
##$ perl -MO=Deparse,-p -e 'sub ABORT {7} print ABORT+8'
|
|
# Vs
|
|
# perl -MO=Deparse,-p -e 'sub ABORT() {7} print ABORT+8'
|
|
#
|
|
# With the parentheses, it not only makes it parse as a term.
|
|
# It even resolves the constant at compile-time, making the code run faster.
|
|
|
|
## no critic
|
|
sub ABORT () {$ABORT}
|
|
sub PRUNE () {$PRUNE}
|
|
sub PRUNE_SOFTLY () {$PRUNE_SOFTLY}
|
|
sub OK () {$OK}
|
|
sub PRUNE_UP () {$PRUNE_UP}
|
|
## use critic
|
|
|
|
$html_uc = 0;
|
|
|
|
# set to 1 if you want tag and attribute names from starttag and endtag
|
|
# to be uc'd
|
|
|
|
# regexs for XML names
|
|
# http://www.w3.org/TR/2006/REC-xml11-20060816/NT-NameStartChar
|
|
my $START_CHAR
|
|
= qr/(?:\:|[A-Z]|_|[a-z]|[\x{C0}-\x{D6}]|[\x{D8}-\x{F6}]|[\x{F8}-\x{2FF}]|[\x{370}-\x{37D}]|[\x{37F}-\x{1FFF}]|[\x{200C}-\x{200D}]|[\x{2070}-\x{218F}]|[\x{2C00}-\x{2FEF}]|[\x{3001}-\x{D7FF}]|[\x{F900}-\x{FDCF}]|[\x{FDF0}-\x{FFFD}]|[\x{10000}-\x{EFFFF}])/;
|
|
|
|
# http://www.w3.org/TR/2006/REC-xml11-20060816/#NT-NameChar
|
|
my $NAME_CHAR
|
|
= qr/(?:$START_CHAR|-|\.|[0-9]|\x{B7}|[\x{0300}-\x{036F}]|[\x{203F}-\x{2040}])/;
|
|
|
|
# Elements that does not have corresponding end tags (i.e. are empty)
|
|
|
|
#==========================================================================
|
|
|
|
#=head1 BASIC METHODS
|
|
|
|
|
|
#
|
|
# An HTML::Element is represented by blessed hash reference, much like
|
|
# Tree::DAG_Node objects. Key-names not starting with '_' are reserved
|
|
# for the SGML attributes of the element.
|
|
# The following special keys are used:
|
|
#
|
|
# '_tag': The tag name (i.e., the generic identifier)
|
|
# '_parent': A reference to the HTML::Element above (when forming a tree)
|
|
# '_pos': The current position (a reference to a HTML::Element) is
|
|
# where inserts will be placed (look at the insert_element
|
|
# method) If not set, the implicit value is the object itself.
|
|
# '_content': A ref to an array of nodes under this.
|
|
# It might not be set.
|
|
#
|
|
# Example: <img src="gisle.jpg" alt="Gisle's photo"> is represented like this:
|
|
#
|
|
# bless {
|
|
# _tag => 'img',
|
|
# src => 'gisle.jpg',
|
|
# alt => "Gisle's photo",
|
|
# }, 'HTML::Element';
|
|
#
|
|
|
|
sub new {
|
|
my $class = shift;
|
|
$class = ref($class) || $class;
|
|
|
|
my $tag = shift;
|
|
Carp::croak("No tagname") unless defined $tag and length $tag;
|
|
Carp::croak "\"$tag\" isn't a good tag name!"
|
|
if $tag =~ m/[<>\/\x00-\x20]/; # minimal sanity, certainly!
|
|
my $self = bless { _tag => scalar( $class->_fold_case($tag) ) }, $class;
|
|
my ( $attr, $val );
|
|
while ( ( $attr, $val ) = splice( @_, 0, 2 ) ) {
|
|
## RT #42209 why does this default to the attribute name and not remain unset or the empty string?
|
|
$val = $attr unless defined $val;
|
|
$self->{ $class->_fold_case($attr) } = $val;
|
|
}
|
|
if ( $tag eq 'html' ) {
|
|
$self->{'_pos'} = undef;
|
|
}
|
|
_weaken($self->{'_parent'}) if $self->{'_parent'};
|
|
return $self;
|
|
}
|
|
|
|
|
|
sub attr {
|
|
my $self = shift;
|
|
my $attr = scalar( $self->_fold_case(shift) );
|
|
if (@_) { # set
|
|
if ( defined $_[0] ) {
|
|
my $old = $self->{$attr};
|
|
$self->{$attr} = $_[0];
|
|
return $old;
|
|
}
|
|
else { # delete, actually
|
|
return delete $self->{$attr};
|
|
}
|
|
}
|
|
else { # get
|
|
return $self->{$attr};
|
|
}
|
|
}
|
|
|
|
|
|
sub tag {
|
|
my $self = shift;
|
|
if (@_) { # set
|
|
$self->{'_tag'} = $self->_fold_case( $_[0] );
|
|
}
|
|
else { # get
|
|
$self->{'_tag'};
|
|
}
|
|
}
|
|
|
|
|
|
sub parent {
|
|
my $self = shift;
|
|
if (@_) { # set
|
|
Carp::croak "an element can't be made its own parent"
|
|
if defined $_[0] and ref $_[0] and $self eq $_[0]; # sanity
|
|
_weaken($self->{'_parent'} = $_[0]);
|
|
}
|
|
else {
|
|
$self->{'_parent'}; # get
|
|
}
|
|
}
|
|
|
|
|
|
sub content_list {
|
|
return wantarray
|
|
? @{ shift->{'_content'} || return () }
|
|
: scalar @{ shift->{'_content'} || return 0 };
|
|
}
|
|
|
|
|
|
# a read-only method! can't say $h->content( [] )!
|
|
sub content {
|
|
return shift->{'_content'};
|
|
}
|
|
|
|
|
|
sub content_array_ref {
|
|
return shift->{'_content'} ||= [];
|
|
}
|
|
|
|
|
|
sub content_refs_list {
|
|
return \( @{ shift->{'_content'} || return () } );
|
|
}
|
|
|
|
|
|
sub implicit {
|
|
return shift->attr( '_implicit', @_ );
|
|
}
|
|
|
|
|
|
sub pos {
|
|
my $self = shift;
|
|
my $pos = $self->{'_pos'};
|
|
if (@_) { # set
|
|
my $parm = shift;
|
|
if ( defined $parm and $parm ne $self ) {
|
|
$self->{'_pos'} = $parm; # means that element
|
|
}
|
|
else {
|
|
$self->{'_pos'} = undef; # means $self
|
|
}
|
|
}
|
|
return $pos if defined($pos);
|
|
return $self;
|
|
}
|
|
|
|
|
|
sub all_attr {
|
|
return %{ $_[0] };
|
|
|
|
# Yes, trivial. But no other way for the user to do the same
|
|
# without breaking encapsulation.
|
|
# And if our object representation changes, this method's behavior
|
|
# should stay the same.
|
|
}
|
|
|
|
sub all_attr_names {
|
|
return keys %{ $_[0] };
|
|
}
|
|
|
|
|
|
sub all_external_attr {
|
|
my $self = $_[0];
|
|
return map( ( length($_) && substr( $_, 0, 1 ) eq '_' )
|
|
? ()
|
|
: ( $_, $self->{$_} ),
|
|
keys %$self );
|
|
}
|
|
|
|
sub all_external_attr_names {
|
|
return grep !( length($_) && substr( $_, 0, 1 ) eq '_' ), keys %{ $_[0] };
|
|
}
|
|
|
|
|
|
sub id {
|
|
if ( @_ == 1 ) {
|
|
return $_[0]{'id'};
|
|
}
|
|
elsif ( @_ == 2 ) {
|
|
if ( defined $_[1] ) {
|
|
return $_[0]{'id'} = $_[1];
|
|
}
|
|
else {
|
|
return delete $_[0]{'id'};
|
|
}
|
|
}
|
|
else {
|
|
Carp::croak '$node->id can\'t take ' . scalar(@_) . ' parameters!';
|
|
}
|
|
}
|
|
|
|
|
|
sub _gensym {
|
|
unless ( defined $ID_COUNTER ) {
|
|
|
|
# start it out...
|
|
$ID_COUNTER = sprintf( '%04x', rand(0x1000) );
|
|
$ID_COUNTER =~ tr<0-9a-f><J-NP-Z>; # yes, skip letter "oh"
|
|
$ID_COUNTER .= '00000';
|
|
}
|
|
++$ID_COUNTER;
|
|
}
|
|
|
|
sub idf {
|
|
my $nparms = scalar @_;
|
|
|
|
if ( $nparms == 1 ) {
|
|
my $x;
|
|
if ( defined( $x = $_[0]{'id'} ) and length $x ) {
|
|
return $x;
|
|
}
|
|
else {
|
|
return $_[0]{'id'} = _gensym();
|
|
}
|
|
}
|
|
if ( $nparms == 2 ) {
|
|
if ( defined $_[1] ) {
|
|
return $_[0]{'id'} = $_[1];
|
|
}
|
|
else {
|
|
return delete $_[0]{'id'};
|
|
}
|
|
}
|
|
Carp::croak '$node->idf can\'t take ' . scalar(@_) . ' parameters!';
|
|
}
|
|
|
|
|
|
sub push_content {
|
|
my $self = shift;
|
|
return $self unless @_;
|
|
|
|
my $content = ( $self->{'_content'} ||= [] );
|
|
for (@_) {
|
|
if ( ref($_) eq 'ARRAY' ) {
|
|
|
|
# magically call new_from_lol
|
|
push @$content, $self->new_from_lol($_);
|
|
_weaken($content->[-1]->{'_parent'} = $self);
|
|
}
|
|
elsif ( ref($_) ) { # insert an element
|
|
$_->detach if $_->{'_parent'};
|
|
_weaken($_->{'_parent'} = $self);
|
|
push( @$content, $_ );
|
|
}
|
|
else { # insert text segment
|
|
if ( @$content && !ref $content->[-1] ) {
|
|
|
|
# last content element is also text segment -- append
|
|
$content->[-1] .= $_;
|
|
}
|
|
else {
|
|
push( @$content, $_ );
|
|
}
|
|
}
|
|
}
|
|
return $self;
|
|
}
|
|
|
|
|
|
sub unshift_content {
|
|
my $self = shift;
|
|
return $self unless @_;
|
|
|
|
my $content = ( $self->{'_content'} ||= [] );
|
|
for ( reverse @_ ) { # so they get added in the order specified
|
|
if ( ref($_) eq 'ARRAY' ) {
|
|
|
|
# magically call new_from_lol
|
|
unshift @$content, $self->new_from_lol($_);
|
|
_weaken($content->[0]->{'_parent'} = $self);
|
|
}
|
|
elsif ( ref $_ ) { # insert an element
|
|
$_->detach if $_->{'_parent'};
|
|
_weaken($_->{'_parent'} = $self);
|
|
unshift( @$content, $_ );
|
|
}
|
|
else { # insert text segment
|
|
if ( @$content && !ref $content->[0] ) {
|
|
|
|
# last content element is also text segment -- prepend
|
|
$content->[0] = $_ . $content->[0];
|
|
}
|
|
else {
|
|
unshift( @$content, $_ );
|
|
}
|
|
}
|
|
}
|
|
return $self;
|
|
}
|
|
|
|
# Cf. splice ARRAY,OFFSET,LENGTH,LIST
|
|
|
|
|
|
sub splice_content {
|
|
my ( $self, $offset, $length, @to_add ) = @_;
|
|
Carp::croak "splice_content requires at least one argument"
|
|
if @_ < 2; # at least $h->splice_content($offset);
|
|
|
|
my $content = ( $self->{'_content'} ||= [] );
|
|
|
|
# prep the list
|
|
|
|
my @out;
|
|
if ( @_ > 2 ) { # self, offset, length, ...
|
|
foreach my $n (@to_add) {
|
|
if ( ref($n) eq 'ARRAY' ) {
|
|
$n = $self->new_from_lol($n);
|
|
_weaken($n->{'_parent'} = $self);
|
|
}
|
|
elsif ( ref($n) ) {
|
|
$n->detach;
|
|
_weaken($n->{'_parent'} = $self);
|
|
}
|
|
}
|
|
@out = splice @$content, $offset, $length, @to_add;
|
|
}
|
|
else { # self, offset
|
|
@out = splice @$content, $offset;
|
|
}
|
|
foreach my $n (@out) {
|
|
$n->{'_parent'} = undef if ref $n;
|
|
}
|
|
return @out;
|
|
}
|
|
|
|
|
|
sub detach {
|
|
my $self = $_[0];
|
|
return undef unless ( my $parent = $self->{'_parent'} );
|
|
$self->{'_parent'} = undef;
|
|
my $cohort = $parent->{'_content'} || return $parent;
|
|
@$cohort = grep { not( ref($_) and $_ eq $self ) } @$cohort;
|
|
|
|
# filter $self out, if parent has any evident content
|
|
|
|
return $parent;
|
|
}
|
|
|
|
|
|
sub detach_content {
|
|
my $c = $_[0]->{'_content'} || return (); # in case of no content
|
|
for (@$c) {
|
|
$_->{'_parent'} = undef if ref $_;
|
|
}
|
|
return splice @$c;
|
|
}
|
|
|
|
|
|
sub replace_with {
|
|
my ( $self, @replacers ) = @_;
|
|
Carp::croak "the target node has no parent"
|
|
unless my ($parent) = $self->{'_parent'};
|
|
|
|
my $parent_content = $parent->{'_content'};
|
|
Carp::croak "the target node's parent has no content!?"
|
|
unless $parent_content and @$parent_content;
|
|
|
|
my $replacers_contains_self;
|
|
for (@replacers) {
|
|
if ( !ref $_ ) {
|
|
|
|
# noop
|
|
}
|
|
elsif ( $_ eq $self ) {
|
|
|
|
# noop, but check that it's there just once.
|
|
Carp::croak "Replacement list contains several copies of target!"
|
|
if $replacers_contains_self++;
|
|
}
|
|
elsif ( $_ eq $parent ) {
|
|
Carp::croak "Can't replace an item with its parent!";
|
|
}
|
|
elsif ( ref($_) eq 'ARRAY' ) {
|
|
$_ = $self->new_from_lol($_);
|
|
_weaken($_->{'_parent'} = $parent);
|
|
}
|
|
else {
|
|
$_->detach;
|
|
_weaken($_->{'_parent'} = $parent);
|
|
|
|
# each of these are necessary
|
|
}
|
|
} # for @replacers
|
|
@$parent_content = map { ( ref($_) and $_ eq $self ) ? @replacers : $_ }
|
|
@$parent_content;
|
|
|
|
$self->{'_parent'} = undef unless $replacers_contains_self;
|
|
|
|
# if replacers does contain self, then the parent attribute is fine as-is
|
|
|
|
return $self;
|
|
}
|
|
|
|
|
|
sub preinsert {
|
|
my $self = shift;
|
|
return $self unless @_;
|
|
return $self->replace_with( @_, $self );
|
|
}
|
|
|
|
|
|
sub postinsert {
|
|
my $self = shift;
|
|
return $self unless @_;
|
|
return $self->replace_with( $self, @_ );
|
|
}
|
|
|
|
|
|
sub replace_with_content {
|
|
my $self = $_[0];
|
|
Carp::croak "the target node has no parent"
|
|
unless my ($parent) = $self->{'_parent'};
|
|
|
|
my $parent_content = $parent->{'_content'};
|
|
Carp::croak "the target node's parent has no content!?"
|
|
unless $parent_content and @$parent_content;
|
|
|
|
my $content_r = $self->{'_content'} || [];
|
|
@$parent_content = map { ( ref($_) and $_ eq $self ) ? @$content_r : $_ }
|
|
@$parent_content;
|
|
|
|
$self->{'_parent'} = undef; # detach $self from its parent
|
|
|
|
# Update parentage link, removing from $self's content list
|
|
for ( splice @$content_r ) { _weaken($_->{'_parent'} = $parent) if ref $_ }
|
|
|
|
return $self; # note: doesn't destroy it.
|
|
}
|
|
|
|
|
|
sub delete_content {
|
|
for (
|
|
splice @{
|
|
delete( $_[0]->{'_content'} )
|
|
|
|
# Deleting it here (while holding its value, for the moment)
|
|
# will keep calls to detach() from trying to uselessly filter
|
|
# the list (as they won't be able to see it once it's been
|
|
# deleted)
|
|
|| return ( $_[0] ) # in case of no content
|
|
},
|
|
0
|
|
|
|
# the splice is so we can null the array too, just in case
|
|
# something somewhere holds a ref to it
|
|
)
|
|
{
|
|
$_->delete if ref $_;
|
|
}
|
|
$_[0];
|
|
}
|
|
|
|
|
|
# two handy aliases
|
|
sub destroy { shift->delete(@_) }
|
|
sub destroy_content { shift->delete_content(@_) }
|
|
|
|
sub delete {
|
|
my $self = $_[0];
|
|
$self->delete_content # recurse down
|
|
if $self->{'_content'} && @{ $self->{'_content'} };
|
|
|
|
$self->detach if $self->{'_parent'} and $self->{'_parent'}{'_content'};
|
|
|
|
# not the typical case
|
|
|
|
%$self = (); # null out the whole object on the way out
|
|
return;
|
|
}
|
|
|
|
|
|
sub clone {
|
|
|
|
#print "Cloning $_[0]\n";
|
|
my $it = shift;
|
|
Carp::croak "clone() can be called only as an object method"
|
|
unless ref $it;
|
|
Carp::croak "clone() takes no arguments" if @_;
|
|
|
|
my $new = bless {%$it}, ref($it); # COPY!!! HOOBOY!
|
|
delete @$new{ '_content', '_parent', '_pos', '_head', '_body' };
|
|
|
|
# clone any contents
|
|
if ( $it->{'_content'} and @{ $it->{'_content'} } ) {
|
|
$new->{'_content'}
|
|
= [ ref($it)->clone_list( @{ $it->{'_content'} } ) ];
|
|
for ( @{ $new->{'_content'} } ) {
|
|
_weaken($_->{'_parent'} = $new) if ref $_;
|
|
}
|
|
}
|
|
|
|
return $new;
|
|
}
|
|
|
|
|
|
sub clone_list {
|
|
Carp::croak "clone_list can be called only as a class method"
|
|
if ref shift @_;
|
|
|
|
# all that does is get me here
|
|
return map {
|
|
ref($_)
|
|
? $_->clone # copy by method
|
|
: $_ # copy by evaluation
|
|
} @_;
|
|
}
|
|
|
|
|
|
sub normalize_content {
|
|
my $start = $_[0];
|
|
my $c;
|
|
return
|
|
unless $c = $start->{'_content'} and ref $c and @$c; # nothing to do
|
|
# TODO: if we start having text elements, deal with catenating those too?
|
|
my @stretches = (undef); # start with a barrier
|
|
|
|
# I suppose this could be rewritten to treat stretches as it goes, instead
|
|
# of at the end. But feh.
|
|
|
|
# Scan:
|
|
for ( my $i = 0; $i < @$c; ++$i ) {
|
|
if ( defined $c->[$i] and ref $c->[$i] ) { # not a text segment
|
|
if ( $stretches[0] ) {
|
|
|
|
# put in a barrier
|
|
if ( $stretches[0][1] == 1 ) {
|
|
|
|
#print "Nixing stretch at ", $i-1, "\n";
|
|
undef $stretches[0]; # nix the previous one-node "stretch"
|
|
}
|
|
else {
|
|
|
|
#print "End of stretch at ", $i-1, "\n";
|
|
unshift @stretches, undef;
|
|
}
|
|
}
|
|
|
|
# else no need for a barrier
|
|
}
|
|
else { # text segment
|
|
$c->[$i] = '' unless defined $c->[$i];
|
|
if ( $stretches[0] ) {
|
|
++$stretches[0][1]; # increase length
|
|
}
|
|
else {
|
|
|
|
#print "New stretch at $i\n";
|
|
unshift @stretches, [ $i, 1 ]; # start and length
|
|
}
|
|
}
|
|
}
|
|
|
|
# Now combine. Note that @stretches is in reverse order, so the indexes
|
|
# still make sense as we work our way thru (i.e., backwards thru $c).
|
|
foreach my $s (@stretches) {
|
|
if ( $s and $s->[1] > 1 ) {
|
|
|
|
#print "Stretch at ", $s->[0], " for ", $s->[1], "\n";
|
|
$c->[ $s->[0] ]
|
|
.= join( '', splice( @$c, $s->[0] + 1, $s->[1] - 1 ) )
|
|
|
|
# append the subsequent ones onto the first one.
|
|
}
|
|
}
|
|
return;
|
|
}
|
|
|
|
|
|
sub delete_ignorable_whitespace {
|
|
|
|
# This doesn't delete all sorts of whitespace that won't actually
|
|
# be used in rendering, tho -- that's up to the rendering application.
|
|
# For example:
|
|
# <input type='text' name='foo'>
|
|
# [some whitespace]
|
|
# <input type='text' name='bar'>
|
|
# The WS between the two elements /will/ get used by the renderer.
|
|
# But here:
|
|
# <input type='hidden' name='foo' value='1'>
|
|
# [some whitespace]
|
|
# <input type='text' name='bar' value='2'>
|
|
# the WS between them won't be rendered in any way, presumably.
|
|
|
|
#my $Debug = 4;
|
|
die "delete_ignorable_whitespace can be called only as an object method"
|
|
unless ref $_[0];
|
|
|
|
print "About to tighten up...\n" if $Debug > 2;
|
|
my (@to_do) = ( $_[0] ); # Start off.
|
|
my ( $i, $sibs, $ptag, $this ); # scratch for the loop...
|
|
while (@to_do) {
|
|
if ( ( $ptag = ( $this = shift @to_do )->{'_tag'} ) eq 'pre'
|
|
or $ptag eq 'textarea'
|
|
or $HTML::Tagset::isCDATA_Parent{$ptag} )
|
|
{
|
|
|
|
# block the traversal under those
|
|
print "Blocking traversal under $ptag\n" if $Debug;
|
|
next;
|
|
}
|
|
next unless ( $sibs = $this->{'_content'} and @$sibs );
|
|
for ( $i = $#$sibs; $i >= 0; --$i ) { # work backwards thru the list
|
|
if ( ref $sibs->[$i] ) {
|
|
unshift @to_do, $sibs->[$i];
|
|
|
|
# yes, this happens in pre order -- we're going backwards
|
|
# thru this sibling list. I doubt it actually matters, tho.
|
|
next;
|
|
}
|
|
next if $sibs->[$i] =~ m<[^\n\r\f\t ]>s; # it's /all/ whitespace
|
|
|
|
print "Under $ptag whose canTighten ",
|
|
"value is ", 0 + $HTML::Element::canTighten{$ptag}, ".\n"
|
|
if $Debug > 3;
|
|
|
|
# It's all whitespace...
|
|
|
|
if ( $i == 0 ) {
|
|
if ( @$sibs == 1 ) { # I'm an only child
|
|
next unless $HTML::Element::canTighten{$ptag}; # parent
|
|
}
|
|
else { # I'm leftmost of many
|
|
# if either my parent or sib are eligible, I'm good.
|
|
next
|
|
unless $HTML::Element::canTighten{$ptag} # parent
|
|
or (ref $sibs->[1]
|
|
and $HTML::Element::canTighten{ $sibs->[1]
|
|
{'_tag'} } # right sib
|
|
);
|
|
}
|
|
}
|
|
elsif ( $i == $#$sibs ) { # I'm rightmost of many
|
|
# if either my parent or sib are eligible, I'm good.
|
|
next
|
|
unless $HTML::Element::canTighten{$ptag} # parent
|
|
or (ref $sibs->[ $i - 1 ]
|
|
and $HTML::Element::canTighten{ $sibs->[ $i - 1 ]
|
|
{'_tag'} } # left sib
|
|
);
|
|
}
|
|
else { # I'm the piggy in the middle
|
|
# My parent doesn't matter -- it all depends on my sibs
|
|
next
|
|
unless ref $sibs->[ $i - 1 ]
|
|
or ref $sibs->[ $i + 1 ];
|
|
|
|
# if NEITHER sib is a node, quit
|
|
|
|
next if
|
|
|
|
# bailout condition: if BOTH are INeligible nodes
|
|
# (as opposed to being text, or being eligible nodes)
|
|
ref $sibs->[ $i - 1 ]
|
|
and ref $sibs->[ $i + 1 ]
|
|
and !$HTML::Element::canTighten{ $sibs->[ $i - 1 ]
|
|
{'_tag'} } # left sib
|
|
and !$HTML::Element::canTighten{ $sibs->[ $i + 1 ]
|
|
{'_tag'} } # right sib
|
|
;
|
|
}
|
|
|
|
# Unknown tags aren't in canTighten and so AREN'T subject to tightening
|
|
|
|
print " delendum: child $i of $ptag\n" if $Debug > 3;
|
|
splice @$sibs, $i, 1;
|
|
}
|
|
|
|
# end of the loop-over-children
|
|
}
|
|
|
|
# end of the while loop.
|
|
|
|
return;
|
|
}
|
|
|
|
|
|
sub insert_element {
|
|
my ( $self, $tag, $implicit ) = @_;
|
|
return $self->pos() unless $tag; # noop if nothing to insert
|
|
|
|
my $e;
|
|
if ( ref $tag ) {
|
|
$e = $tag;
|
|
$tag = $e->tag;
|
|
}
|
|
else { # just a tag name -- so make the element
|
|
$e = $self->element_class->new($tag);
|
|
++( $self->{'_element_count'} ) if exists $self->{'_element_count'};
|
|
|
|
# undocumented. see TreeBuilder.
|
|
}
|
|
|
|
$e->{'_implicit'} = 1 if $implicit;
|
|
|
|
my $pos = $self->{'_pos'};
|
|
$pos = $self unless defined $pos;
|
|
|
|
$pos->push_content($e);
|
|
|
|
$self->{'_pos'} = $pos = $e
|
|
unless $self->_empty_element_map->{$tag} || $e->{'_empty_element'};
|
|
|
|
$pos;
|
|
}
|
|
|
|
#==========================================================================
|
|
# Some things to override in XML::Element
|
|
|
|
sub _empty_element_map {
|
|
\%HTML::Element::emptyElement;
|
|
}
|
|
|
|
sub _fold_case_LC {
|
|
if (wantarray) {
|
|
shift;
|
|
map lc($_), @_;
|
|
}
|
|
else {
|
|
return lc( $_[1] );
|
|
}
|
|
}
|
|
|
|
sub _fold_case_NOT {
|
|
if (wantarray) {
|
|
shift;
|
|
@_;
|
|
}
|
|
else {
|
|
return $_[1];
|
|
}
|
|
}
|
|
|
|
*_fold_case = \&_fold_case_LC;
|
|
|
|
#==========================================================================
|
|
|
|
#=head1 DUMPING METHODS
|
|
|
|
|
|
sub dump {
|
|
my ( $self, $fh, $depth ) = @_;
|
|
$fh = *STDOUT{IO} unless defined $fh;
|
|
$depth = 0 unless defined $depth;
|
|
print $fh " " x $depth, $self->starttag, " \@", $self->address,
|
|
$self->{'_implicit'} ? " (IMPLICIT)\n" : "\n";
|
|
for ( @{ $self->{'_content'} } ) {
|
|
if ( ref $_ ) { # element
|
|
$_->dump( $fh, $depth + 1 ); # recurse
|
|
}
|
|
else { # text node
|
|
print $fh " " x ( $depth + 1 );
|
|
if ( length($_) > 65 or m<[\x00-\x1F]> ) {
|
|
|
|
# it needs prettyin' up somehow or other
|
|
my $x
|
|
= ( length($_) <= 65 )
|
|
? $_
|
|
: ( substr( $_, 0, 65 ) . '...' );
|
|
$x =~ s<([\x00-\x1F])>
|
|
<'\\x'.(unpack("H2",$1))>eg;
|
|
print $fh qq{"$x"\n};
|
|
}
|
|
else {
|
|
print $fh qq{"$_"\n};
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
sub as_HTML {
|
|
my ( $self, $entities, $indent, $omissible_map ) = @_;
|
|
|
|
#my $indent_on = defined($indent) && length($indent);
|
|
my @html = ();
|
|
|
|
$omissible_map ||= \%HTML::Element::optionalEndTag;
|
|
my $empty_element_map = $self->_empty_element_map;
|
|
|
|
my $last_tag_tightenable = 0;
|
|
my $this_tag_tightenable = 0;
|
|
my $nonindentable_ancestors = 0; # count of nonindentible tags over us.
|
|
|
|
my ( $tag, $node, $start, $depth ); # per-iteration scratch
|
|
|
|
if ( defined($indent) && length($indent) ) {
|
|
$self->traverse(
|
|
sub {
|
|
( $node, $start, $depth ) = @_;
|
|
if ( ref $node ) { # it's an element
|
|
|
|
# detect bogus classes. RT #35948, #61673
|
|
$node->can('starttag')
|
|
or Carp::confess( "Object of class "
|
|
. ref($node)
|
|
. " cannot be processed by HTML::Element" );
|
|
|
|
$tag = $node->{'_tag'};
|
|
|
|
if ($start) { # on the way in
|
|
if (( $this_tag_tightenable
|
|
= $HTML::Element::canTighten{$tag}
|
|
)
|
|
and !$nonindentable_ancestors
|
|
and $last_tag_tightenable
|
|
)
|
|
{
|
|
push
|
|
@html,
|
|
"\n",
|
|
$indent x $depth,
|
|
$node->starttag($entities),
|
|
;
|
|
}
|
|
else {
|
|
push( @html, $node->starttag($entities) );
|
|
}
|
|
$last_tag_tightenable = $this_tag_tightenable;
|
|
|
|
++$nonindentable_ancestors
|
|
if $tag eq 'pre' or $tag eq 'textarea'
|
|
or $HTML::Tagset::isCDATA_Parent{$tag};
|
|
|
|
}
|
|
elsif (
|
|
not( $empty_element_map->{$tag}
|
|
or $omissible_map->{$tag} )
|
|
)
|
|
{
|
|
|
|
# on the way out
|
|
if ( $tag eq 'pre' or $tag eq 'textarea'
|
|
or $HTML::Tagset::isCDATA_Parent{$tag} )
|
|
{
|
|
--$nonindentable_ancestors;
|
|
$last_tag_tightenable
|
|
= $HTML::Element::canTighten{$tag};
|
|
push @html, $node->endtag;
|
|
|
|
}
|
|
else { # general case
|
|
if (( $this_tag_tightenable
|
|
= $HTML::Element::canTighten{$tag}
|
|
)
|
|
and !$nonindentable_ancestors
|
|
and $last_tag_tightenable
|
|
)
|
|
{
|
|
push
|
|
@html,
|
|
"\n",
|
|
$indent x $depth,
|
|
$node->endtag,
|
|
;
|
|
}
|
|
else {
|
|
push @html, $node->endtag;
|
|
}
|
|
$last_tag_tightenable = $this_tag_tightenable;
|
|
|
|
#print "$tag tightenable: $this_tag_tightenable\n";
|
|
}
|
|
}
|
|
}
|
|
else { # it's a text segment
|
|
|
|
$last_tag_tightenable = 0; # I guess this is right
|
|
HTML::Entities::encode_entities( $node, $entities )
|
|
|
|
# That does magic things if $entities is undef.
|
|
unless (
|
|
( defined($entities) && !length($entities) )
|
|
|
|
# If there's no entity to encode, don't call it
|
|
|| $HTML::Tagset::isCDATA_Parent{ $_[3]{'_tag'} }
|
|
|
|
# To keep from amp-escaping children of script et al.
|
|
# That doesn't deal with descendants; but then, CDATA
|
|
# parents shouldn't /have/ descendants other than a
|
|
# text children (or comments?)
|
|
|| $encoded_content
|
|
);
|
|
if ($nonindentable_ancestors) {
|
|
push @html, $node; # say no go
|
|
}
|
|
else {
|
|
if ($last_tag_tightenable) {
|
|
$node =~ s<[\n\r\f\t ]+>< >s;
|
|
|
|
#$node =~ s< $><>s;
|
|
$node =~ s<^ ><>s;
|
|
push
|
|
@html,
|
|
"\n",
|
|
$indent x $depth,
|
|
$node,
|
|
|
|
#Text::Wrap::wrap($indent x $depth, $indent x $depth, "\n" . $node)
|
|
;
|
|
}
|
|
else {
|
|
push
|
|
@html,
|
|
$node,
|
|
|
|
#Text::Wrap::wrap('', $indent x $depth, $node)
|
|
;
|
|
}
|
|
}
|
|
}
|
|
1; # keep traversing
|
|
}
|
|
); # End of parms to traverse()
|
|
}
|
|
else { # no indenting -- much simpler code
|
|
$self->traverse(
|
|
sub {
|
|
( $node, $start ) = @_;
|
|
if ( ref $node ) {
|
|
|
|
# detect bogus classes. RT #35948
|
|
$node->isa( $self->element_class )
|
|
or Carp::confess( "Object of class "
|
|
. ref($node)
|
|
. " cannot be processed by HTML::Element" );
|
|
|
|
$tag = $node->{'_tag'};
|
|
if ($start) { # on the way in
|
|
push( @html, $node->starttag($entities) );
|
|
}
|
|
elsif (
|
|
not( $empty_element_map->{$tag}
|
|
or $omissible_map->{$tag} )
|
|
)
|
|
{
|
|
|
|
# on the way out
|
|
push( @html, $node->endtag );
|
|
}
|
|
}
|
|
else {
|
|
|
|
# simple text content
|
|
HTML::Entities::encode_entities( $node, $entities )
|
|
|
|
# That does magic things if $entities is undef.
|
|
unless (
|
|
( defined($entities) && !length($entities) )
|
|
|
|
# If there's no entity to encode, don't call it
|
|
|| $HTML::Tagset::isCDATA_Parent{ $_[3]{'_tag'} }
|
|
|
|
# To keep from amp-escaping children of script et al.
|
|
# That doesn't deal with descendants; but then, CDATA
|
|
# parents shouldn't /have/ descendants other than a
|
|
# text children (or comments?)
|
|
|| $encoded_content
|
|
);
|
|
push( @html, $node );
|
|
}
|
|
1; # keep traversing
|
|
}
|
|
); # End of parms to traverse()
|
|
}
|
|
|
|
if ( $self->{_store_declarations} && defined $self->{_decl} ) {
|
|
unshift @html, sprintf "<!%s>\n", $self->{_decl}->{text};
|
|
}
|
|
|
|
return join( '', @html );
|
|
}
|
|
|
|
|
|
sub as_text {
|
|
|
|
# Yet another iteratively implemented traverser
|
|
my ( $this, %options ) = @_;
|
|
my $skip_dels = $options{'skip_dels'} || 0;
|
|
my (@pile) = ($this);
|
|
my $tag;
|
|
my $text = '';
|
|
while (@pile) {
|
|
if ( !defined( $pile[0] ) ) { # undef!
|
|
# no-op
|
|
}
|
|
elsif ( !ref( $pile[0] ) ) { # text bit! save it!
|
|
$text .= shift @pile;
|
|
}
|
|
else { # it's a ref -- traverse under it
|
|
unshift @pile, @{ $this->{'_content'} || $nillio }
|
|
unless ( $tag = ( $this = shift @pile )->{'_tag'} ) eq 'style'
|
|
or $tag eq 'script'
|
|
or ( $skip_dels and $tag eq 'del' );
|
|
}
|
|
}
|
|
return $text;
|
|
}
|
|
|
|
# extra_chars added for RT #26436
|
|
sub as_trimmed_text {
|
|
my ( $this, %options ) = @_;
|
|
my $text = $this->as_text(%options);
|
|
my $extra_chars = defined $options{'extra_chars'}
|
|
? $options{'extra_chars'} : '';
|
|
|
|
$text =~ s/[\n\r\f\t$extra_chars ]+$//s;
|
|
$text =~ s/^[\n\r\f\t$extra_chars ]+//s;
|
|
$text =~ s/[\n\r\f\t$extra_chars ]+/ /g;
|
|
return $text;
|
|
}
|
|
|
|
sub as_text_trimmed { shift->as_trimmed_text(@_) } # alias, because I forget
|
|
|
|
|
|
# TODO: make it wrap, if not indent?
|
|
|
|
sub as_XML {
|
|
|
|
# based an as_HTML
|
|
my ($self) = @_;
|
|
|
|
#my $indent_on = defined($indent) && length($indent);
|
|
my @xml = ();
|
|
my $empty_element_map = $self->_empty_element_map;
|
|
|
|
my ( $tag, $node, $start ); # per-iteration scratch
|
|
$self->traverse(
|
|
sub {
|
|
( $node, $start ) = @_;
|
|
if ( ref $node ) { # it's an element
|
|
$tag = $node->{'_tag'};
|
|
if ($start) { # on the way in
|
|
|
|
foreach my $attr ( $node->all_attr_names() ) {
|
|
Carp::croak(
|
|
"$tag has an invalid attribute name '$attr'")
|
|
unless ( $attr eq '/' || $self->_valid_name($attr) );
|
|
}
|
|
|
|
if ( $empty_element_map->{$tag}
|
|
and !@{ $node->{'_content'} || $nillio } )
|
|
{
|
|
push( @xml, $node->starttag_XML( undef, 1 ) );
|
|
}
|
|
else {
|
|
push( @xml, $node->starttag_XML(undef) );
|
|
}
|
|
}
|
|
else { # on the way out
|
|
unless ( $empty_element_map->{$tag}
|
|
and !@{ $node->{'_content'} || $nillio } )
|
|
{
|
|
push( @xml, $node->endtag_XML() );
|
|
} # otherwise it will have been an <... /> tag.
|
|
}
|
|
}
|
|
else { # it's just text
|
|
_xml_escape($node);
|
|
push( @xml, $node );
|
|
}
|
|
1; # keep traversing
|
|
}
|
|
);
|
|
|
|
join( '', @xml, "\n" );
|
|
}
|
|
|
|
sub _xml_escape {
|
|
|
|
# DESTRUCTIVE (a.k.a. "in-place")
|
|
# Five required escapes: http://www.w3.org/TR/2006/REC-xml11-20060816/#syntax
|
|
# We allow & if it's part of a valid escape already: http://www.w3.org/TR/2006/REC-xml11-20060816/#sec-references
|
|
foreach my $x (@_) {
|
|
|
|
# In strings with no encoded entities all & should be encoded.
|
|
if ($encoded_content) {
|
|
$x
|
|
=~ s/&(?! # An ampersand that isn't followed by...
|
|
(\#\d+; | # A hash mark, digits and semicolon, or
|
|
\#x[\da-f]+; | # A hash mark, "x", hex digits and semicolon, or
|
|
$START_CHAR$NAME_CHAR+; ) # A valid unicode entity name and semicolon
|
|
)/&/gx; # Needs to be escaped to amp
|
|
}
|
|
else {
|
|
$x =~ s/&/&/g;
|
|
}
|
|
|
|
# simple character escapes
|
|
$x =~ s/</</g;
|
|
$x =~ s/>/>/g;
|
|
$x =~ s/"/"/g;
|
|
$x =~ s/'/'/g;
|
|
}
|
|
return;
|
|
}
|
|
|
|
|
|
# NOTES:
|
|
#
|
|
# It's been suggested that attribute names be made :-keywords:
|
|
# (:_tag "img" :border 0 :src "pie.png" :usemap "#main.map")
|
|
# However, it seems that Scheme has no such data type as :-keywords.
|
|
# So, for the moment at least, I tend toward simplicity, uniformity,
|
|
# and universality, where everything a string or a list.
|
|
|
|
sub as_Lisp_form {
|
|
my @out;
|
|
|
|
my $sub;
|
|
my $depth = 0;
|
|
my ( @list, $val );
|
|
$sub = sub { # Recursor
|
|
my $self = $_[0];
|
|
@list = ( '_tag', $self->{'_tag'} );
|
|
@list = () unless defined $list[-1]; # unlikely
|
|
|
|
for ( sort keys %$self ) { # predictable ordering
|
|
next
|
|
if $_ eq '_content'
|
|
or $_ eq '_tag'
|
|
or $_ eq '_parent'
|
|
or $_ eq '/';
|
|
|
|
# Leave the other private attributes, I guess.
|
|
push @list, $_, $val
|
|
if defined( $val = $self->{$_} ); # and !ref $val;
|
|
}
|
|
|
|
for (@list) {
|
|
|
|
# octal-escape it
|
|
s<([^\x20\x21\x23\x27-\x5B\x5D-\x7E])>
|
|
<sprintf('\\%03o',ord($1))>eg;
|
|
$_ = qq{"$_"};
|
|
}
|
|
push @out, ( ' ' x $depth ) . '(' . join ' ', splice @list;
|
|
if ( @{ $self->{'_content'} || $nillio } ) {
|
|
$out[-1] .= " \"_content\" (\n";
|
|
++$depth;
|
|
foreach my $c ( @{ $self->{'_content'} } ) {
|
|
if ( ref($c) ) {
|
|
|
|
# an element -- recurse
|
|
$sub->($c);
|
|
}
|
|
else {
|
|
|
|
# a text segment -- stick it in and octal-escape it
|
|
push @out, $c;
|
|
$out[-1] =~ s<([^\x20\x21\x23\x27-\x5B\x5D-\x7E])>
|
|
<sprintf('\\%03o',ord($1))>eg;
|
|
|
|
# And quote and indent it.
|
|
$out[-1] .= "\"\n";
|
|
$out[-1] = ( ' ' x $depth ) . '"' . $out[-1];
|
|
}
|
|
}
|
|
--$depth;
|
|
substr( $out[-1], -1 )
|
|
= "))\n"; # end of _content and of the element
|
|
}
|
|
else {
|
|
$out[-1] .= ")\n";
|
|
}
|
|
return;
|
|
};
|
|
|
|
$sub->( $_[0] );
|
|
undef $sub;
|
|
return join '', @out;
|
|
}
|
|
|
|
|
|
sub format {
|
|
my ( $self, $formatter ) = @_;
|
|
unless ( defined $formatter ) {
|
|
# RECOMMEND PREREQ: HTML::FormatText
|
|
require HTML::FormatText;
|
|
$formatter = HTML::FormatText->new();
|
|
}
|
|
$formatter->format($self);
|
|
}
|
|
|
|
|
|
sub starttag {
|
|
my ( $self, $entities ) = @_;
|
|
|
|
my $name = $self->{'_tag'};
|
|
|
|
return $self->{'text'} if $name eq '~literal';
|
|
return "<!" . $self->{'text'} . ">" if $name eq '~declaration';
|
|
return "<?" . $self->{'text'} . ">" if $name eq '~pi';
|
|
|
|
if ( $name eq '~comment' ) {
|
|
if ( ref( $self->{'text'} || '' ) eq 'ARRAY' ) {
|
|
|
|
# Does this ever get used? And is this right?
|
|
return
|
|
"<!"
|
|
. join( ' ', map( "--$_--", @{ $self->{'text'} } ) ) . ">";
|
|
}
|
|
else {
|
|
return "<!--" . $self->{'text'} . "-->";
|
|
}
|
|
}
|
|
|
|
my $tag = $html_uc ? "<\U$name" : "<\L$name";
|
|
my $val;
|
|
for ( sort keys %$self ) { # predictable ordering
|
|
next if !length $_ or m/^_/s or $_ eq '/';
|
|
$val = $self->{$_};
|
|
next if !defined $val; # or ref $val;
|
|
if ($_ eq $val && # if attribute is boolean, for this element
|
|
exists( $HTML::Element::boolean_attr{$name} )
|
|
&& (ref( $HTML::Element::boolean_attr{$name} )
|
|
? $HTML::Element::boolean_attr{$name}{$_}
|
|
: $HTML::Element::boolean_attr{$name} eq $_
|
|
)
|
|
)
|
|
{
|
|
$tag .= $html_uc ? " \U$_" : " \L$_";
|
|
}
|
|
else { # non-boolean attribute
|
|
|
|
if ( ref $val eq 'HTML::Element'
|
|
and $val->{_tag} eq '~literal' )
|
|
{
|
|
$val = $val->{text};
|
|
}
|
|
else {
|
|
HTML::Entities::encode_entities( $val, $entities )
|
|
unless (
|
|
defined($entities) && !length($entities)
|
|
|| $encoded_content
|
|
|
|
);
|
|
}
|
|
|
|
$val = qq{"$val"};
|
|
$tag .= $html_uc ? qq{ \U$_\E=$val} : qq{ \L$_\E=$val};
|
|
}
|
|
} # for keys
|
|
if ( scalar $self->content_list == 0
|
|
&& $self->_empty_element_map->{ $self->tag } )
|
|
{
|
|
return $tag . " />";
|
|
}
|
|
else {
|
|
return $tag . ">";
|
|
}
|
|
}
|
|
|
|
|
|
sub starttag_XML {
|
|
my ($self) = @_;
|
|
|
|
# and a third parameter to signal emptiness?
|
|
|
|
my $name = $self->{'_tag'};
|
|
|
|
return $self->{'text'} if $name eq '~literal';
|
|
return '<!' . $self->{'text'} . '>' if $name eq '~declaration';
|
|
return "<?" . $self->{'text'} . "?>" if $name eq '~pi';
|
|
|
|
if ( $name eq '~comment' ) {
|
|
if ( ref( $self->{'text'} || '' ) eq 'ARRAY' ) {
|
|
|
|
# Does this ever get used? And is this right?
|
|
$name = join( ' ', @{ $self->{'text'} } );
|
|
}
|
|
else {
|
|
$name = $self->{'text'};
|
|
}
|
|
$name =~ s/--/--/g; # can't have double --'s in XML comments
|
|
return "<!--$name-->";
|
|
}
|
|
|
|
my $tag = "<$name";
|
|
my $val;
|
|
for ( sort keys %$self ) { # predictable ordering
|
|
next if !length $_ or m/^_/s or $_ eq '/';
|
|
|
|
# Hm -- what to do if val is undef?
|
|
# I suppose that shouldn't ever happen.
|
|
next if !defined( $val = $self->{$_} ); # or ref $val;
|
|
_xml_escape($val);
|
|
$tag .= qq{ $_="$val"};
|
|
}
|
|
@_ == 3 ? "$tag />" : "$tag>";
|
|
}
|
|
|
|
|
|
sub endtag {
|
|
$html_uc ? "</\U$_[0]->{'_tag'}>" : "</\L$_[0]->{'_tag'}>";
|
|
}
|
|
|
|
sub endtag_XML {
|
|
"</$_[0]->{'_tag'}>";
|
|
}
|
|
|
|
#==========================================================================
|
|
# This, ladies and germs, is an iterative implementation of a
|
|
# recursive algorithm. DON'T TRY THIS AT HOME.
|
|
# Basically, the algorithm says:
|
|
#
|
|
# To traverse:
|
|
# 1: pre-order visit this node
|
|
# 2: traverse any children of this node
|
|
# 3: post-order visit this node, unless it's a text segment,
|
|
# or a prototypically empty node (like "br", etc.)
|
|
# Add to that the consideration of the callbacks' return values,
|
|
# so you can block visitation of the children, or siblings, or
|
|
# abort the whole excursion, etc.
|
|
#
|
|
# So, why all this hassle with making the code iterative?
|
|
# It makes for real speed, because it eliminates the whole
|
|
# hassle of Perl having to allocate scratch space for each
|
|
# instance of the recursive sub. Since the algorithm
|
|
# is basically simple (and not all recursive ones are!) and
|
|
# has few necessary lexicals (basically just the current node's
|
|
# content list, and the current position in it), it was relatively
|
|
# straightforward to store that information not as the frame
|
|
# of a sub, but as a stack, i.e., a simple Perl array (well, two
|
|
# of them, actually: one for content-listrefs, one for indexes of
|
|
# current position in each of those).
|
|
|
|
my $NIL = [];
|
|
|
|
sub traverse {
|
|
my ( $start, $callback, $ignore_text ) = @_;
|
|
|
|
Carp::croak "traverse can be called only as an object method"
|
|
unless ref $start;
|
|
|
|
Carp::croak('must provide a callback for traverse()!')
|
|
unless defined $callback and ref $callback;
|
|
|
|
# Elementary type-checking:
|
|
my ( $c_pre, $c_post );
|
|
if ( UNIVERSAL::isa( $callback, 'CODE' ) ) {
|
|
$c_pre = $c_post = $callback;
|
|
}
|
|
elsif ( UNIVERSAL::isa( $callback, 'ARRAY' ) ) {
|
|
( $c_pre, $c_post ) = @$callback;
|
|
Carp::croak(
|
|
"pre-order callback \"$c_pre\" is true but not a coderef!")
|
|
if $c_pre and not UNIVERSAL::isa( $c_pre, 'CODE' );
|
|
Carp::croak(
|
|
"pre-order callback \"$c_post\" is true but not a coderef!")
|
|
if $c_post and not UNIVERSAL::isa( $c_post, 'CODE' );
|
|
return $start unless $c_pre or $c_post;
|
|
|
|
# otherwise there'd be nothing to actually do!
|
|
}
|
|
else {
|
|
Carp::croak("$callback is not a known kind of reference")
|
|
unless ref($callback);
|
|
}
|
|
|
|
my $empty_element_map = $start->_empty_element_map;
|
|
|
|
my (@C) = [$start]; # a stack containing lists of children
|
|
my (@I) = (-1); # initial value must be -1 for each list
|
|
# a stack of indexes to current position in corresponding lists in @C
|
|
# In each of these, 0 is the active point
|
|
|
|
# scratch:
|
|
my ($rv, # return value of callback
|
|
$this, # current node
|
|
$content_r, # child list of $this
|
|
);
|
|
|
|
# THE BIG LOOP
|
|
while (@C) {
|
|
|
|
# Move to next item in this frame
|
|
if ( !defined( $I[0] ) or ++$I[0] >= @{ $C[0] } ) {
|
|
|
|
# We either went off the end of this list, or aborted the list
|
|
# So call the post-order callback:
|
|
if ( $c_post
|
|
and defined $I[0]
|
|
and @C > 1
|
|
|
|
# to keep the next line from autovivifying
|
|
and defined( $this = $C[1][ $I[1] ] ) # sanity, and
|
|
# suppress callbacks on exiting the fictional top frame
|
|
and ref($this) # sanity
|
|
and not(
|
|
$this->{'_empty_element'}
|
|
|| ( $empty_element_map->{ $this->{'_tag'} || '' }
|
|
&& !@{ $this->{'_content'} } ) # RT #49932
|
|
) # things that don't get post-order callbacks
|
|
)
|
|
{
|
|
shift @I;
|
|
shift @C;
|
|
|
|
#print "Post! at depth", scalar(@I), "\n";
|
|
$rv = $c_post->(
|
|
|
|
#map $_, # copy to avoid any messiness
|
|
$this, # 0: this
|
|
0, # 1: startflag (0 for post-order call)
|
|
@I - 1, # 2: depth
|
|
);
|
|
|
|
if ( defined($rv) and ref($rv) eq $travsignal_package ) {
|
|
$rv = $$rv; #deref
|
|
if ( $rv eq 'ABORT' ) {
|
|
last; # end of this excursion!
|
|
}
|
|
elsif ( $rv eq 'PRUNE' ) {
|
|
|
|
# NOOP on post!!
|
|
}
|
|
elsif ( $rv eq 'PRUNE_SOFTLY' ) {
|
|
|
|
# NOOP on post!!
|
|
}
|
|
elsif ( $rv eq 'OK' ) {
|
|
|
|
# noop
|
|
}
|
|
elsif ( $rv eq 'PRUNE_UP' ) {
|
|
$I[0] = undef;
|
|
}
|
|
else {
|
|
die "Unknown travsignal $rv\n";
|
|
|
|
# should never happen
|
|
}
|
|
}
|
|
}
|
|
else {
|
|
shift @I;
|
|
shift @C;
|
|
}
|
|
next;
|
|
}
|
|
|
|
$this = $C[0][ $I[0] ];
|
|
|
|
if ($c_pre) {
|
|
if ( defined $this and ref $this ) { # element
|
|
$rv = $c_pre->(
|
|
|
|
#map $_, # copy to avoid any messiness
|
|
$this, # 0: this
|
|
1, # 1: startflag (1 for pre-order call)
|
|
@I - 1, # 2: depth
|
|
);
|
|
}
|
|
else { # text segment
|
|
next if $ignore_text;
|
|
$rv = $c_pre->(
|
|
|
|
#map $_, # copy to avoid any messiness
|
|
$this, # 0: this
|
|
1, # 1: startflag (1 for pre-order call)
|
|
@I - 1, # 2: depth
|
|
$C[1][ $I[1] ], # 3: parent
|
|
# And there will always be a $C[1], since
|
|
# we can't start traversing at a text node
|
|
$I[0] # 4: index of self in parent's content list
|
|
);
|
|
}
|
|
if ( not $rv ) { # returned false. Same as PRUNE.
|
|
next; # prune
|
|
}
|
|
elsif ( ref($rv) eq $travsignal_package ) {
|
|
$rv = $$rv; # deref
|
|
if ( $rv eq 'ABORT' ) {
|
|
last; # end of this excursion!
|
|
}
|
|
elsif ( $rv eq 'PRUNE' ) {
|
|
next;
|
|
}
|
|
elsif ( $rv eq 'PRUNE_SOFTLY' ) {
|
|
if (ref($this)
|
|
and not( $this->{'_empty_element'}
|
|
|| $empty_element_map->{ $this->{'_tag'} || '' } )
|
|
)
|
|
{
|
|
|
|
# push a dummy empty content list just to trigger a post callback
|
|
unshift @I, -1;
|
|
unshift @C, $NIL;
|
|
}
|
|
next;
|
|
}
|
|
elsif ( $rv eq 'OK' ) {
|
|
|
|
# noop
|
|
}
|
|
elsif ( $rv eq 'PRUNE_UP' ) {
|
|
$I[0] = undef;
|
|
next;
|
|
|
|
# equivalent of last'ing out of the current child list.
|
|
|
|
# Used to have PRUNE_UP_SOFTLY and ABORT_SOFTLY here, but the code
|
|
# for these was seriously upsetting, served no particularly clear
|
|
# purpose, and could not, I think, be easily implemented with a
|
|
# recursive routine. All bad things!
|
|
}
|
|
else {
|
|
die "Unknown travsignal $rv\n";
|
|
|
|
# should never happen
|
|
}
|
|
}
|
|
|
|
# else fall thru to meaning same as \'OK'.
|
|
}
|
|
|
|
# end of pre-order calling
|
|
|
|
# Now queue up content list for the current element...
|
|
if (ref $this
|
|
and not( # ...except for those which...
|
|
not( $content_r = $this->{'_content'} and @$content_r )
|
|
|
|
# ...have empty content lists...
|
|
and $this->{'_empty_element'}
|
|
|| $empty_element_map->{ $this->{'_tag'} || '' }
|
|
|
|
# ...and that don't get post-order callbacks
|
|
)
|
|
)
|
|
{
|
|
unshift @I, -1;
|
|
unshift @C, $content_r || $NIL;
|
|
|
|
#print $this->{'_tag'}, " ($this) adds content_r ", $C[0], "\n";
|
|
}
|
|
}
|
|
return $start;
|
|
}
|
|
|
|
|
|
sub is_inside {
|
|
my $self = shift;
|
|
return 0 unless @_; # if no items specified, I guess this is right.
|
|
|
|
my $current = $self;
|
|
# the loop starts by looking at the given element
|
|
|
|
if (scalar @_ == 1) {
|
|
while ( defined $current and ref $current ) {
|
|
return 1 if $current eq $_[0] || $current->{'_tag'} eq $_[0];
|
|
$current = $current->{'_parent'};
|
|
}
|
|
return 0;
|
|
} else {
|
|
my %elements = map { $_ => 1 } @_;
|
|
while ( defined $current and ref $current ) {
|
|
return 1 if $elements{$current} || $elements{ $current->{'_tag'} };
|
|
$current = $current->{'_parent'};
|
|
}
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
|
|
sub is_empty {
|
|
my $self = shift;
|
|
!$self->{'_content'} || !@{ $self->{'_content'} };
|
|
}
|
|
|
|
|
|
sub pindex {
|
|
my $self = shift;
|
|
|
|
my $parent = $self->{'_parent'} || return undef;
|
|
my $pc = $parent->{'_content'} || return undef;
|
|
for ( my $i = 0; $i < @$pc; ++$i ) {
|
|
return $i if ref $pc->[$i] and $pc->[$i] eq $self;
|
|
}
|
|
return undef; # we shouldn't ever get here
|
|
}
|
|
|
|
#--------------------------------------------------------------------------
|
|
|
|
|
|
sub left {
|
|
Carp::croak "left() is supposed to be an object method"
|
|
unless ref $_[0];
|
|
my $pc = ( $_[0]->{'_parent'} || return )->{'_content'}
|
|
|| die "parent is childless?";
|
|
|
|
die "parent is childless" unless @$pc;
|
|
return if @$pc == 1; # I'm an only child
|
|
|
|
if (wantarray) {
|
|
my @out;
|
|
foreach my $j (@$pc) {
|
|
return @out if ref $j and $j eq $_[0];
|
|
push @out, $j;
|
|
}
|
|
}
|
|
else {
|
|
for ( my $i = 0; $i < @$pc; ++$i ) {
|
|
return $i ? $pc->[ $i - 1 ] : undef
|
|
if ref $pc->[$i] and $pc->[$i] eq $_[0];
|
|
}
|
|
}
|
|
|
|
die "I'm not in my parent's content list?";
|
|
return;
|
|
}
|
|
|
|
|
|
sub right {
|
|
Carp::croak "right() is supposed to be an object method"
|
|
unless ref $_[0];
|
|
my $pc = ( $_[0]->{'_parent'} || return )->{'_content'}
|
|
|| die "parent is childless?";
|
|
|
|
die "parent is childless" unless @$pc;
|
|
return if @$pc == 1; # I'm an only child
|
|
|
|
if (wantarray) {
|
|
my ( @out, $seen );
|
|
foreach my $j (@$pc) {
|
|
if ($seen) {
|
|
push @out, $j;
|
|
}
|
|
else {
|
|
$seen = 1 if ref $j and $j eq $_[0];
|
|
}
|
|
}
|
|
die "I'm not in my parent's content list?" unless $seen;
|
|
return @out;
|
|
}
|
|
else {
|
|
for ( my $i = 0; $i < @$pc; ++$i ) {
|
|
return +( $i == $#$pc ) ? undef : $pc->[ $i + 1 ]
|
|
if ref $pc->[$i] and $pc->[$i] eq $_[0];
|
|
}
|
|
die "I'm not in my parent's content list?";
|
|
return;
|
|
}
|
|
}
|
|
|
|
#--------------------------------------------------------------------------
|
|
|
|
|
|
sub address {
|
|
if ( @_ == 1 ) { # report-address form
|
|
return join(
|
|
'.',
|
|
reverse( # so it starts at the top
|
|
map( $_->pindex() || '0', # so that root's undef -> '0'
|
|
$_[0], # self and...
|
|
$_[0]->lineage )
|
|
)
|
|
);
|
|
}
|
|
else { # get-node-at-address
|
|
my @stack = split( /\./, $_[1] );
|
|
my $here;
|
|
|
|
if ( @stack and !length $stack[0] ) { # relative addressing
|
|
$here = $_[0];
|
|
shift @stack;
|
|
}
|
|
else { # absolute addressing
|
|
return undef unless 0 == shift @stack; # pop the initial 0-for-root
|
|
$here = $_[0]->root;
|
|
}
|
|
|
|
while (@stack) {
|
|
return undef
|
|
unless $here->{'_content'}
|
|
and @{ $here->{'_content'} } > $stack[0];
|
|
|
|
# make sure the index isn't too high
|
|
$here = $here->{'_content'}[ shift @stack ];
|
|
return undef if @stack and not ref $here;
|
|
|
|
# we hit a text node when we expected a non-terminal element node
|
|
}
|
|
|
|
return $here;
|
|
}
|
|
}
|
|
|
|
|
|
sub depth {
|
|
my $here = $_[0];
|
|
my $depth = 0;
|
|
while ( defined( $here = $here->{'_parent'} ) and ref($here) ) {
|
|
++$depth;
|
|
}
|
|
return $depth;
|
|
}
|
|
|
|
|
|
sub root {
|
|
my $here = my $root = shift;
|
|
while ( defined( $here = $here->{'_parent'} ) and ref($here) ) {
|
|
$root = $here;
|
|
}
|
|
return $root;
|
|
}
|
|
|
|
|
|
sub lineage {
|
|
my $here = shift;
|
|
my @lineage;
|
|
while ( defined( $here = $here->{'_parent'} ) and ref($here) ) {
|
|
push @lineage, $here;
|
|
}
|
|
return @lineage;
|
|
}
|
|
|
|
|
|
sub lineage_tag_names {
|
|
my $here = my $start = shift;
|
|
my @lineage_names;
|
|
while ( defined( $here = $here->{'_parent'} ) and ref($here) ) {
|
|
push @lineage_names, $here->{'_tag'};
|
|
}
|
|
return @lineage_names;
|
|
}
|
|
|
|
|
|
sub descendents { shift->descendants(@_) }
|
|
|
|
sub descendants {
|
|
my $start = shift;
|
|
if (wantarray) {
|
|
my @descendants;
|
|
$start->traverse(
|
|
[ # pre-order sub only
|
|
sub {
|
|
push( @descendants, $_[0] );
|
|
return 1;
|
|
},
|
|
undef # no post
|
|
],
|
|
1, # ignore text
|
|
);
|
|
shift @descendants; # so $self doesn't appear in the list
|
|
return @descendants;
|
|
}
|
|
else { # just returns a scalar
|
|
my $descendants = -1; # to offset $self being counted
|
|
$start->traverse(
|
|
[ # pre-order sub only
|
|
sub {
|
|
++$descendants;
|
|
return 1;
|
|
},
|
|
undef # no post
|
|
],
|
|
1, # ignore text
|
|
);
|
|
return $descendants;
|
|
}
|
|
}
|
|
|
|
|
|
sub find { shift->find_by_tag_name(@_) }
|
|
|
|
# yup, a handy alias
|
|
|
|
sub find_by_tag_name {
|
|
my (@pile) = shift(@_); # start out the to-do stack for the traverser
|
|
Carp::croak "find_by_tag_name can be called only as an object method"
|
|
unless ref $pile[0];
|
|
return () unless @_;
|
|
my (@tags) = $pile[0]->_fold_case(@_);
|
|
my ( @matching, $this, $this_tag );
|
|
while (@pile) {
|
|
$this_tag = ( $this = shift @pile )->{'_tag'};
|
|
foreach my $t (@tags) {
|
|
if ( $t eq $this_tag ) {
|
|
if (wantarray) {
|
|
push @matching, $this;
|
|
last;
|
|
}
|
|
else {
|
|
return $this;
|
|
}
|
|
}
|
|
}
|
|
unshift @pile, grep ref($_), @{ $this->{'_content'} || next };
|
|
}
|
|
return @matching if wantarray;
|
|
return;
|
|
}
|
|
|
|
|
|
sub find_by_attribute {
|
|
|
|
# We could limit this to non-internal attributes, but hey.
|
|
my ( $self, $attribute, $value ) = @_;
|
|
Carp::croak "Attribute must be a defined value!"
|
|
unless defined $attribute;
|
|
$attribute = $self->_fold_case($attribute);
|
|
|
|
my @matching;
|
|
my $wantarray = wantarray;
|
|
my $quit;
|
|
$self->traverse(
|
|
[ # pre-order only
|
|
sub {
|
|
if ( exists $_[0]{$attribute}
|
|
and $_[0]{$attribute} eq $value )
|
|
{
|
|
push @matching, $_[0];
|
|
return HTML::Element::ABORT
|
|
unless $wantarray; # only take the first
|
|
}
|
|
1; # keep traversing
|
|
},
|
|
undef # no post
|
|
],
|
|
1, # yes, ignore text nodes.
|
|
);
|
|
|
|
if ($wantarray) {
|
|
return @matching;
|
|
}
|
|
else {
|
|
return $matching[0];
|
|
}
|
|
}
|
|
|
|
#--------------------------------------------------------------------------
|
|
|
|
|
|
sub look_down {
|
|
ref( $_[0] ) or Carp::croak "look_down works only as an object method";
|
|
|
|
my @criteria;
|
|
for ( my $i = 1; $i < @_; ) {
|
|
Carp::croak "Can't use undef as an attribute name"
|
|
unless defined $_[$i];
|
|
if ( ref $_[$i] ) {
|
|
Carp::croak "A " . ref( $_[$i] ) . " value is not a criterion"
|
|
unless ref $_[$i] eq 'CODE';
|
|
push @criteria, $_[ $i++ ];
|
|
}
|
|
else {
|
|
Carp::croak "param list to look_down ends in a key!" if $i == $#_;
|
|
push @criteria, [
|
|
scalar( $_[0]->_fold_case( $_[$i] ) ),
|
|
defined( $_[ $i + 1 ] )
|
|
? ( ( ref $_[ $i + 1 ] ? $_[ $i + 1 ] : lc( $_[ $i + 1 ] ) ),
|
|
ref( $_[ $i + 1 ] )
|
|
)
|
|
|
|
# yes, leave that LC!
|
|
: undef
|
|
];
|
|
$i += 2;
|
|
}
|
|
}
|
|
Carp::croak "No criteria?" unless @criteria;
|
|
|
|
my (@pile) = ( $_[0] );
|
|
my ( @matching, $val, $this );
|
|
Node:
|
|
while ( defined( $this = shift @pile ) ) {
|
|
|
|
# Yet another traverser implemented with merely iterative code.
|
|
foreach my $c (@criteria) {
|
|
if ( ref($c) eq 'CODE' ) {
|
|
next Node unless $c->($this); # jump to the continue block
|
|
}
|
|
else { # it's an attr-value pair
|
|
next Node # jump to the continue block
|
|
if # two values are unequal if:
|
|
( defined( $val = $this->{ $c->[0] } ) )
|
|
? ( !defined $c->[ 1
|
|
] # actual is def, critval is undef => fail
|
|
# allow regex matching
|
|
# allow regex matching
|
|
or (
|
|
$c->[2] eq 'Regexp'
|
|
? $val !~ $c->[1]
|
|
: ( ref $val ne $c->[2]
|
|
|
|
# have unequal ref values => fail
|
|
or lc($val) ne lc( $c->[1] )
|
|
|
|
# have unequal lc string values => fail
|
|
)
|
|
)
|
|
)
|
|
: ( defined $c->[1]
|
|
) # actual is undef, critval is def => fail
|
|
}
|
|
}
|
|
|
|
# We make it this far only if all the criteria passed.
|
|
return $this unless wantarray;
|
|
push @matching, $this;
|
|
}
|
|
continue {
|
|
unshift @pile, grep ref($_), @{ $this->{'_content'} || $nillio };
|
|
}
|
|
return @matching if wantarray;
|
|
return;
|
|
}
|
|
|
|
|
|
sub look_up {
|
|
ref( $_[0] ) or Carp::croak "look_up works only as an object method";
|
|
|
|
my @criteria;
|
|
for ( my $i = 1; $i < @_; ) {
|
|
Carp::croak "Can't use undef as an attribute name"
|
|
unless defined $_[$i];
|
|
if ( ref $_[$i] ) {
|
|
Carp::croak "A " . ref( $_[$i] ) . " value is not a criterion"
|
|
unless ref $_[$i] eq 'CODE';
|
|
push @criteria, $_[ $i++ ];
|
|
}
|
|
else {
|
|
Carp::croak "param list to look_up ends in a key!" if $i == $#_;
|
|
push @criteria, [
|
|
scalar( $_[0]->_fold_case( $_[$i] ) ),
|
|
defined( $_[ $i + 1 ] )
|
|
? ( ( ref $_[ $i + 1 ] ? $_[ $i + 1 ] : lc( $_[ $i + 1 ] ) ),
|
|
ref( $_[ $i + 1 ] )
|
|
)
|
|
: undef # Yes, leave that LC!
|
|
];
|
|
$i += 2;
|
|
}
|
|
}
|
|
Carp::croak "No criteria?" unless @criteria;
|
|
|
|
my ( @matching, $val );
|
|
my $this = $_[0];
|
|
Node:
|
|
while (1) {
|
|
|
|
# You'll notice that the code here is almost the same as for look_down.
|
|
foreach my $c (@criteria) {
|
|
if ( ref($c) eq 'CODE' ) {
|
|
next Node unless $c->($this); # jump to the continue block
|
|
}
|
|
else { # it's an attr-value pair
|
|
next Node # jump to the continue block
|
|
if # two values are unequal if:
|
|
( defined( $val = $this->{ $c->[0] } ) )
|
|
? ( !defined $c->[ 1
|
|
] # actual is def, critval is undef => fail
|
|
or (
|
|
$c->[2] eq 'Regexp'
|
|
? $val !~ $c->[1]
|
|
: ( ref $val ne $c->[2]
|
|
|
|
# have unequal ref values => fail
|
|
or lc($val) ne $c->[1]
|
|
|
|
# have unequal lc string values => fail
|
|
)
|
|
)
|
|
)
|
|
: ( defined $c->[1]
|
|
) # actual is undef, critval is def => fail
|
|
}
|
|
}
|
|
|
|
# We make it this far only if all the criteria passed.
|
|
return $this unless wantarray;
|
|
push @matching, $this;
|
|
}
|
|
continue {
|
|
last unless defined( $this = $this->{'_parent'} ) and ref $this;
|
|
}
|
|
|
|
return @matching if wantarray;
|
|
return;
|
|
}
|
|
|
|
#--------------------------------------------------------------------------
|
|
|
|
|
|
sub attr_get_i {
|
|
if ( @_ > 2 ) {
|
|
my $self = shift;
|
|
Carp::croak "No attribute names can be undef!"
|
|
if grep !defined($_), @_;
|
|
my @attributes = $self->_fold_case(@_);
|
|
if (wantarray) {
|
|
my @out;
|
|
foreach my $x ( $self, $self->lineage ) {
|
|
push @out,
|
|
map { exists( $x->{$_} ) ? $x->{$_} : () } @attributes;
|
|
}
|
|
return @out;
|
|
}
|
|
else {
|
|
foreach my $x ( $self, $self->lineage ) {
|
|
foreach my $attribute (@attributes) {
|
|
return $x->{$attribute}
|
|
if exists $x->{$attribute}; # found
|
|
}
|
|
}
|
|
return; # never found
|
|
}
|
|
}
|
|
else {
|
|
|
|
# Single-attribute search. Simpler, most common, so optimize
|
|
# for the most common case
|
|
Carp::croak "Attribute name must be a defined value!"
|
|
unless defined $_[1];
|
|
my $self = $_[0];
|
|
my $attribute = $self->_fold_case( $_[1] );
|
|
if (wantarray) { # list context
|
|
return
|
|
map { exists( $_->{$attribute} ) ? $_->{$attribute} : () }
|
|
$self, $self->lineage;
|
|
}
|
|
else { # scalar context
|
|
foreach my $x ( $self, $self->lineage ) {
|
|
return $x->{$attribute} if exists $x->{$attribute}; # found
|
|
}
|
|
return; # never found
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
sub tagname_map {
|
|
my (@pile) = $_[0]; # start out the to-do stack for the traverser
|
|
Carp::croak "find_by_tag_name can be called only as an object method"
|
|
unless ref $pile[0];
|
|
my ( %map, $this_tag, $this );
|
|
while (@pile) {
|
|
$this_tag = ''
|
|
unless defined( $this_tag = ( $this = shift @pile )->{'_tag'} )
|
|
; # dance around the strange case of having an undef tagname.
|
|
push @{ $map{$this_tag} ||= [] }, $this; # add to map
|
|
unshift @pile, grep ref($_),
|
|
@{ $this->{'_content'} || next }; # traverse
|
|
}
|
|
return \%map;
|
|
}
|
|
|
|
|
|
sub extract_links {
|
|
my $start = shift;
|
|
|
|
my %wantType;
|
|
@wantType{ $start->_fold_case(@_) } = (1) x @_; # if there were any
|
|
my $wantType = scalar(@_);
|
|
|
|
my @links;
|
|
|
|
# TODO: add xml:link?
|
|
|
|
my ( $link_attrs, $tag, $self, $val ); # scratch for each iteration
|
|
$start->traverse(
|
|
[ sub { # pre-order call only
|
|
$self = $_[0];
|
|
|
|
$tag = $self->{'_tag'};
|
|
return 1
|
|
if $wantType && !$wantType{$tag}; # if we're selective
|
|
|
|
if (defined(
|
|
$link_attrs = $HTML::Element::linkElements{$tag}
|
|
)
|
|
)
|
|
{
|
|
|
|
# If this is a tag that has any link attributes,
|
|
# look over possibly present link attributes,
|
|
# saving the value, if found.
|
|
for ( ref($link_attrs) ? @$link_attrs : $link_attrs ) {
|
|
if ( defined( $val = $self->attr($_) ) ) {
|
|
push( @links, [ $val, $self, $_, $tag ] );
|
|
}
|
|
}
|
|
}
|
|
1; # return true, so we keep recursing
|
|
},
|
|
undef
|
|
],
|
|
1, # ignore text nodes
|
|
);
|
|
\@links;
|
|
}
|
|
|
|
|
|
sub simplify_pres {
|
|
my $pre = 0;
|
|
|
|
my $sub;
|
|
my $line;
|
|
$sub = sub {
|
|
++$pre if $_[0]->{'_tag'} eq 'pre';
|
|
foreach my $it ( @{ $_[0]->{'_content'} || return } ) {
|
|
if ( ref $it ) {
|
|
$sub->($it); # recurse!
|
|
}
|
|
elsif ($pre) {
|
|
|
|
#$it =~ s/(?:(?:\cm\cj*)|(?:\cj))/\n/g;
|
|
|
|
$it = join "\n", map {
|
|
;
|
|
$line = $_;
|
|
while (
|
|
$line
|
|
=~ s/^([^\t]*)(\t+)/$1.(" " x ((length($2)<<3)-(length($1)&7)))/e
|
|
|
|
# Sort of adapted from Text::Tabs -- yes, it's hardwired-in that
|
|
# tabs are at every EIGHTH column.
|
|
)
|
|
{
|
|
}
|
|
$line;
|
|
}
|
|
split /(?:(?:\cm\cj*)|(?:\cj))/, $it, -1;
|
|
}
|
|
}
|
|
--$pre if $_[0]->{'_tag'} eq 'pre';
|
|
return;
|
|
};
|
|
$sub->( $_[0] );
|
|
|
|
undef $sub;
|
|
return;
|
|
}
|
|
|
|
|
|
sub same_as {
|
|
die 'same_as() takes only one argument: $h->same_as($i)' unless @_ == 2;
|
|
my ( $h, $i ) = @_[ 0, 1 ];
|
|
die "same_as() can be called only as an object method" unless ref $h;
|
|
|
|
return 0 unless defined $i and ref $i;
|
|
|
|
# An element can't be same_as anything but another element!
|
|
# They needn't be of the same class, tho.
|
|
|
|
return 1 if $h eq $i;
|
|
|
|
# special (if rare) case: anything is the same as... itself!
|
|
|
|
# assumes that no content lists in/under $h or $i contain subsequent
|
|
# text segments, like: ['foo', ' bar']
|
|
|
|
# compare attributes now.
|
|
#print "Comparing tags of $h and $i...\n";
|
|
|
|
return 0 unless $h->{'_tag'} eq $i->{'_tag'};
|
|
|
|
# only significant attribute whose name starts with "_"
|
|
|
|
#print "Comparing attributes of $h and $i...\n";
|
|
# Compare attributes, but only the real ones.
|
|
{
|
|
|
|
# Bear in mind that the average element has very few attributes,
|
|
# and that element names are rather short.
|
|
# (Values are a different story.)
|
|
|
|
# XXX I would think that /^[^_]/ would be faster, at least easier to read.
|
|
my @keys_h
|
|
= sort grep { length $_ and substr( $_, 0, 1 ) ne '_' } keys %$h;
|
|
my @keys_i
|
|
= sort grep { length $_ and substr( $_, 0, 1 ) ne '_' } keys %$i;
|
|
|
|
return 0 unless @keys_h == @keys_i;
|
|
|
|
# different number of real attributes? they're different.
|
|
for ( my $x = 0; $x < @keys_h; ++$x ) {
|
|
return 0
|
|
unless $keys_h[$x] eq $keys_i[$x] and # same key name
|
|
$h->{ $keys_h[$x] } eq $i->{ $keys_h[$x] }; # same value
|
|
# Should this test for definedness on values?
|
|
# People shouldn't be putting undef in attribute values, I think.
|
|
}
|
|
}
|
|
|
|
#print "Comparing children of $h and $i...\n";
|
|
my $hcl = $h->{'_content'} || [];
|
|
my $icl = $i->{'_content'} || [];
|
|
|
|
return 0 unless @$hcl == @$icl;
|
|
|
|
# different numbers of children? they're different.
|
|
|
|
if (@$hcl) {
|
|
|
|
# compare each of the children:
|
|
for ( my $x = 0; $x < @$hcl; ++$x ) {
|
|
if ( ref $hcl->[$x] ) {
|
|
return 0 unless ref( $icl->[$x] );
|
|
|
|
# an element can't be the same as a text segment
|
|
# Both elements:
|
|
return 0 unless $hcl->[$x]->same_as( $icl->[$x] ); # RECURSE!
|
|
}
|
|
else {
|
|
return 0 if ref( $icl->[$x] );
|
|
|
|
# a text segment can't be the same as an element
|
|
# Both text segments:
|
|
return 0 unless $hcl->[$x] eq $icl->[$x];
|
|
}
|
|
}
|
|
}
|
|
|
|
return 1; # passed all the tests!
|
|
}
|
|
|
|
|
|
sub new_from_lol {
|
|
my $class = shift;
|
|
$class = ref($class) || $class;
|
|
|
|
# calling as an object method is just the same as ref($h)->new_from_lol(...)
|
|
my $lol = $_[1];
|
|
|
|
my @ancestor_lols;
|
|
|
|
# So we can make sure there's no cyclicities in this lol.
|
|
# That would be perverse, but one never knows.
|
|
my ( $sub, $k, $v, $node ); # last three are scratch values
|
|
$sub = sub {
|
|
|
|
#print "Building for $_[0]\n";
|
|
my $lol = $_[0];
|
|
return unless @$lol;
|
|
my ( @attributes, @children );
|
|
Carp::croak "Cyclicity detected in source LOL tree, around $lol?!?"
|
|
if grep( $_ eq $lol, @ancestor_lols );
|
|
push @ancestor_lols, $lol;
|
|
|
|
my $tag_name = 'null';
|
|
|
|
# Recursion in in here:
|
|
for ( my $i = 0; $i < @$lol; ++$i ) { # Iterate over children
|
|
if ( ref( $lol->[$i] ) eq 'ARRAY' )
|
|
{ # subtree: most common thing in loltree
|
|
push @children, $sub->( $lol->[$i] );
|
|
}
|
|
elsif ( !ref( $lol->[$i] ) ) {
|
|
if ( $i == 0 ) { # name
|
|
$tag_name = $lol->[$i];
|
|
Carp::croak "\"$tag_name\" isn't a good tag name!"
|
|
if $tag_name =~ m/[<>\/\x00-\x20]/
|
|
; # minimal sanity, certainly!
|
|
}
|
|
else { # text segment child
|
|
push @children, $lol->[$i];
|
|
}
|
|
}
|
|
elsif ( ref( $lol->[$i] ) eq 'HASH' ) { # attribute hashref
|
|
keys %{ $lol->[$i] }; # reset the each-counter, just in case
|
|
while ( ( $k, $v ) = each %{ $lol->[$i] } ) {
|
|
push @attributes, $class->_fold_case($k), $v
|
|
if defined $v
|
|
and $k ne '_name'
|
|
and $k ne '_content'
|
|
and $k ne '_parent';
|
|
|
|
# enforce /some/ sanity!
|
|
}
|
|
}
|
|
elsif ( UNIVERSAL::isa( $lol->[$i], __PACKAGE__ ) ) {
|
|
if ( $lol->[$i]->{'_parent'} ) { # if claimed
|
|
#print "About to clone ", $lol->[$i], "\n";
|
|
push @children, $lol->[$i]->clone();
|
|
}
|
|
else {
|
|
push @children, $lol->[$i]; # if unclaimed...
|
|
#print "Claiming ", $lol->[$i], "\n";
|
|
$lol->[$i]->{'_parent'} = 1; # claim it NOW
|
|
# This WILL be replaced by the correct value once we actually
|
|
# construct the parent, just after the end of this loop...
|
|
}
|
|
}
|
|
else {
|
|
Carp::croak "new_from_lol doesn't handle references of type "
|
|
. ref( $lol->[$i] );
|
|
}
|
|
}
|
|
|
|
pop @ancestor_lols;
|
|
$node = $class->new($tag_name);
|
|
|
|
#print "Children: @children\n";
|
|
|
|
if ( $class eq __PACKAGE__ ) { # Special-case it, for speed:
|
|
%$node = ( %$node, @attributes ) if @attributes;
|
|
|
|
#print join(' ', $node, ' ' , map("<$_>", %$node), "\n");
|
|
if (@children) {
|
|
$node->{'_content'} = \@children;
|
|
foreach my $c (@children) {
|
|
_weaken($c->{'_parent'} = $node)
|
|
if ref $c;
|
|
}
|
|
}
|
|
}
|
|
else { # Do it the clean way...
|
|
#print "Done neatly\n";
|
|
while (@attributes) { $node->attr( splice @attributes, 0, 2 ) }
|
|
$node->push_content(
|
|
map { _weaken($_->{'_parent'} = $node) if ref $_; $_ }
|
|
@children )
|
|
if @children;
|
|
}
|
|
|
|
return $node;
|
|
};
|
|
|
|
# End of sub definition.
|
|
|
|
if (wantarray) {
|
|
my (@nodes) = map { ; ( ref($_) eq 'ARRAY' ) ? $sub->($_) : $_ } @_;
|
|
# Let text bits pass thru, I guess. This makes this act more like
|
|
# unshift_content et al. Undocumented.
|
|
|
|
undef $sub;
|
|
# so it won't be in its own frame, so its refcount can hit 0
|
|
|
|
return @nodes;
|
|
}
|
|
else {
|
|
Carp::croak "new_from_lol in scalar context needs exactly one lol"
|
|
unless @_ == 1;
|
|
return $_[0] unless ref( $_[0] ) eq 'ARRAY';
|
|
# used to be a fatal error. still undocumented tho.
|
|
|
|
$node = $sub->( $_[0] );
|
|
undef $sub;
|
|
# so it won't be in its own frame, so its refcount can hit 0
|
|
|
|
return $node;
|
|
}
|
|
}
|
|
|
|
|
|
sub objectify_text {
|
|
my (@stack) = ( $_[0] );
|
|
|
|
my ($this);
|
|
while (@stack) {
|
|
foreach my $c ( @{ ( $this = shift @stack )->{'_content'} } ) {
|
|
if ( ref($c) ) {
|
|
unshift @stack, $c; # visit it later.
|
|
}
|
|
else {
|
|
$c = $this->element_class->new(
|
|
'~text',
|
|
'text' => $c,
|
|
'_parent' => $this
|
|
);
|
|
}
|
|
}
|
|
}
|
|
return;
|
|
}
|
|
|
|
sub deobjectify_text {
|
|
my (@stack) = ( $_[0] );
|
|
my ($old_node);
|
|
|
|
if ( $_[0]{'_tag'} eq '~text' ) { # special case
|
|
# Puts the $old_node variable to a different purpose
|
|
if ( $_[0]{'_parent'} ) {
|
|
$_[0]->replace_with( $old_node = delete $_[0]{'text'} )->delete;
|
|
}
|
|
else { # well, that's that, then!
|
|
$old_node = delete $_[0]{'text'};
|
|
}
|
|
|
|
if ( ref( $_[0] ) eq __PACKAGE__ ) { # common case
|
|
%{ $_[0] } = (); # poof!
|
|
}
|
|
else {
|
|
|
|
# play nice:
|
|
delete $_[0]{'_parent'};
|
|
$_[0]->delete;
|
|
}
|
|
return '' unless defined $old_node; # sanity!
|
|
return $old_node;
|
|
}
|
|
|
|
while (@stack) {
|
|
foreach my $c ( @{ ( shift @stack )->{'_content'} } ) {
|
|
if ( ref($c) ) {
|
|
if ( $c->{'_tag'} eq '~text' ) {
|
|
$c = ( $old_node = $c )->{'text'};
|
|
if ( ref($old_node) eq __PACKAGE__ ) { # common case
|
|
%$old_node = (); # poof!
|
|
}
|
|
else {
|
|
|
|
# play nice:
|
|
delete $old_node->{'_parent'};
|
|
$old_node->delete;
|
|
}
|
|
}
|
|
else {
|
|
unshift @stack, $c; # visit it later.
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
return undef;
|
|
}
|
|
|
|
|
|
{
|
|
|
|
# The next three subs are basically copied from Number::Latin,
|
|
# based on a one-liner by Abigail. Yes, I could simply require that
|
|
# module, and a Roman numeral module too, but really, HTML-Tree already
|
|
# has enough dependecies as it is; and anyhow, I don't need the functions
|
|
# that do latin2int or roman2int.
|
|
no integer;
|
|
|
|
sub _int2latin {
|
|
return unless defined $_[0];
|
|
return '0' if $_[0] < 1 and $_[0] > -1;
|
|
return '-' . _i2l( abs int $_[0] )
|
|
if $_[0] <= -1; # tolerate negatives
|
|
return _i2l( int $_[0] );
|
|
}
|
|
|
|
sub _int2LATIN {
|
|
|
|
# just the above plus uc
|
|
return unless defined $_[0];
|
|
return '0' if $_[0] < 1 and $_[0] > -1;
|
|
return '-' . uc( _i2l( abs int $_[0] ) )
|
|
if $_[0] <= -1; # tolerate negs
|
|
return uc( _i2l( int $_[0] ) );
|
|
}
|
|
|
|
my @alpha = ( 'a' .. 'z' );
|
|
|
|
sub _i2l { # the real work
|
|
my $int = $_[0] || return "";
|
|
_i2l( int( ( $int - 1 ) / 26 ) )
|
|
. $alpha[ $int % 26 - 1 ]; # yes, recursive
|
|
# Yes, 26 => is (26 % 26 - 1), which is -1 => Z!
|
|
}
|
|
}
|
|
|
|
{
|
|
|
|
# And now, some much less impressive Roman numerals code:
|
|
|
|
my (@i) = ( '', qw(I II III IV V VI VII VIII IX) );
|
|
my (@x) = ( '', qw(X XX XXX XL L LX LXX LXXX XC) );
|
|
my (@c) = ( '', qw(C CC CCC CD D DC DCC DCCC CM) );
|
|
my (@m) = ( '', qw(M MM MMM) );
|
|
|
|
sub _int2ROMAN {
|
|
my ( $i, $pref );
|
|
return '0'
|
|
if 0 == ( $i = int( $_[0] || 0 ) ); # zero is a special case
|
|
return $i + 0 if $i <= -4000 or $i >= 4000;
|
|
|
|
# Because over 3999 would require non-ASCII chars, like D-with-)-inside
|
|
if ( $i < 0 ) { # grumble grumble tolerate negatives grumble
|
|
$pref = '-';
|
|
$i = abs($i);
|
|
}
|
|
else {
|
|
$pref = ''; # normal case
|
|
}
|
|
|
|
my ( $x, $c, $m ) = ( 0, 0, 0 );
|
|
if ( $i >= 10 ) {
|
|
$x = $i / 10;
|
|
$i %= 10;
|
|
if ( $x >= 10 ) {
|
|
$c = $x / 10;
|
|
$x %= 10;
|
|
if ( $c >= 10 ) { $m = $c / 10; $c %= 10; }
|
|
}
|
|
}
|
|
|
|
#print "m$m c$c x$x i$i\n";
|
|
|
|
return join( '', $pref, $m[$m], $c[$c], $x[$x], $i[$i] );
|
|
}
|
|
|
|
sub _int2roman { lc( _int2ROMAN( $_[0] ) ) }
|
|
}
|
|
|
|
sub _int2int { $_[0] } # dummy
|
|
|
|
%list_type_to_sub = (
|
|
'I' => \&_int2ROMAN,
|
|
'i' => \&_int2roman,
|
|
'A' => \&_int2LATIN,
|
|
'a' => \&_int2latin,
|
|
'1' => \&_int2int,
|
|
);
|
|
|
|
sub number_lists {
|
|
my (@stack) = ( $_[0] );
|
|
my ( $this, $tag, $counter, $numberer ); # scratch
|
|
while (@stack) { # yup, pre-order-traverser idiom
|
|
if ( ( $tag = ( $this = shift @stack )->{'_tag'} ) eq 'ol' ) {
|
|
|
|
# Prep some things:
|
|
$counter
|
|
= ( ( $this->{'start'} || '' ) =~ m<^\s*(\d{1,7})\s*$>s )
|
|
? $1
|
|
: 1;
|
|
$numberer = $list_type_to_sub{ $this->{'type'} || '' }
|
|
|| $list_type_to_sub{'1'};
|
|
|
|
# Immeditately iterate over all children
|
|
foreach my $c ( @{ $this->{'_content'} || next } ) {
|
|
next unless ref $c;
|
|
unshift @stack, $c;
|
|
if ( $c->{'_tag'} eq 'li' ) {
|
|
$counter = $1
|
|
if (
|
|
( $c->{'value'} || '' ) =~ m<^\s*(\d{1,7})\s*$>s );
|
|
$c->{'_bullet'} = $numberer->($counter) . '.';
|
|
++$counter;
|
|
}
|
|
}
|
|
|
|
}
|
|
elsif ( $tag eq 'ul' or $tag eq 'dir' or $tag eq 'menu' ) {
|
|
|
|
# Immeditately iterate over all children
|
|
foreach my $c ( @{ $this->{'_content'} || next } ) {
|
|
next unless ref $c;
|
|
unshift @stack, $c;
|
|
$c->{'_bullet'} = '*' if $c->{'_tag'} eq 'li';
|
|
}
|
|
|
|
}
|
|
else {
|
|
foreach my $c ( @{ $this->{'_content'} || next } ) {
|
|
unshift @stack, $c if ref $c;
|
|
}
|
|
}
|
|
}
|
|
return;
|
|
}
|
|
|
|
|
|
sub has_insane_linkage {
|
|
my @pile = ( $_[0] );
|
|
my ( $c, $i, $p, $this ); # scratch
|
|
|
|
# Another iterative traverser; this time much simpler because
|
|
# only in pre-order:
|
|
my %parent_of = ( $_[0], 'TOP-OF-SCAN' );
|
|
while (@pile) {
|
|
$this = shift @pile;
|
|
$c = $this->{'_content'} || next;
|
|
return ( $this, "_content attribute is true but nonref." )
|
|
unless ref($c) eq 'ARRAY';
|
|
next unless @$c;
|
|
for ( $i = 0; $i < @$c; ++$i ) {
|
|
return ( $this, "Child $i is undef" )
|
|
unless defined $c->[$i];
|
|
if ( ref( $c->[$i] ) ) {
|
|
return ( $c->[$i], "appears in its own content list" )
|
|
if $c->[$i] eq $this;
|
|
return ( $c->[$i],
|
|
"appears twice in the tree: once under $this, once under $parent_of{$c->[$i]}"
|
|
) if exists $parent_of{ $c->[$i] };
|
|
$parent_of{ $c->[$i] } = '' . $this;
|
|
|
|
# might as well just use the stringification of it.
|
|
|
|
return ( $c->[$i],
|
|
"_parent attribute is wrong (not defined)" )
|
|
unless defined( $p = $c->[$i]{'_parent'} );
|
|
return ( $c->[$i], "_parent attribute is wrong (nonref)" )
|
|
unless ref($p);
|
|
return ( $c->[$i],
|
|
"_parent attribute is wrong (is $p; should be $this)" )
|
|
unless $p eq $this;
|
|
}
|
|
}
|
|
unshift @pile, grep ref($_), @$c;
|
|
|
|
# queue up more things on the pile stack
|
|
}
|
|
return; #okay
|
|
}
|
|
|
|
sub _asserts_fail { # to be run on trusted documents only
|
|
my (@pile) = ( $_[0] );
|
|
my ( @errors, $this, $id, $assert, $parent, $rv );
|
|
while (@pile) {
|
|
$this = shift @pile;
|
|
if ( defined( $assert = $this->{'assert'} ) ) {
|
|
$id = ( $this->{'id'} ||= $this->address )
|
|
; # don't use '0' as an ID, okay?
|
|
unless ( ref($assert) ) {
|
|
|
|
package main;
|
|
## no critic
|
|
$assert = $this->{'assert'} = (
|
|
$assert =~ m/\bsub\b/
|
|
? eval($assert)
|
|
: eval("sub { $assert\n}")
|
|
);
|
|
## use critic
|
|
if ($@) {
|
|
push @errors,
|
|
[ $this, "assertion at $id broke in eval: $@" ];
|
|
$assert = $this->{'assert'} = sub { };
|
|
}
|
|
}
|
|
$parent = $this->{'_parent'};
|
|
$rv = undef;
|
|
eval {
|
|
$rv = $assert->(
|
|
$this, $this->{'_tag'}, $this->{'_id'}, # 0,1,2
|
|
$parent
|
|
? ( $parent, $parent->{'_tag'}, $parent->{'id'} )
|
|
: () # 3,4,5
|
|
);
|
|
};
|
|
if ($@) {
|
|
push @errors, [ $this, "assertion at $id died: $@" ];
|
|
}
|
|
elsif ( !$rv ) {
|
|
push @errors, [ $this, "assertion at $id failed" ];
|
|
}
|
|
|
|
# else OK
|
|
}
|
|
push @pile, grep ref($_), @{ $this->{'_content'} || next };
|
|
}
|
|
return @errors;
|
|
}
|
|
|
|
## _valid_name
|
|
# validate XML style attribute names
|
|
# http://www.w3.org/TR/2006/REC-xml11-20060816/#NT-Name
|
|
|
|
sub _valid_name {
|
|
my $self = shift;
|
|
my $attr = shift
|
|
or Carp::croak("sub valid_name requires an attribute name");
|
|
|
|
return (0) unless ( $attr =~ /^$START_CHAR$NAME_CHAR+$/ );
|
|
|
|
return (1);
|
|
}
|
|
|
|
|
|
sub element_class {
|
|
$_[0]->{_element_class} || __PACKAGE__;
|
|
}
|
|
|
|
1;
|
|
|
|
|
|
1;
|
|
|
|
__END__
|
|
|
|
=pod
|
|
|
|
=head1 NAME
|
|
|
|
HTML::Element - Class for objects that represent HTML elements
|
|
|
|
=head1 VERSION
|
|
|
|
This document describes version 5.07 of
|
|
HTML::Element, released August 31, 2017
|
|
as part of L<HTML-Tree|HTML::Tree>.
|
|
|
|
=head1 SYNOPSIS
|
|
|
|
use HTML::Element;
|
|
$a = HTML::Element->new('a', href => 'http://www.perl.com/');
|
|
$a->push_content("The Perl Homepage");
|
|
|
|
$tag = $a->tag;
|
|
print "$tag starts out as:", $a->starttag, "\n";
|
|
print "$tag ends as:", $a->endtag, "\n";
|
|
print "$tag\'s href attribute is: ", $a->attr('href'), "\n";
|
|
|
|
$links_r = $a->extract_links();
|
|
print "Hey, I found ", scalar(@$links_r), " links.\n";
|
|
|
|
print "And that, as HTML, is: ", $a->as_HTML, "\n";
|
|
$a = $a->delete;
|
|
|
|
=head1 DESCRIPTION
|
|
|
|
(This class is part of the L<HTML::Tree|HTML::Tree> dist.)
|
|
|
|
Objects of the HTML::Element class can be used to represent elements
|
|
of HTML document trees. These objects have attributes, notably attributes that
|
|
designates each element's parent and content. The content is an array
|
|
of text segments and other HTML::Element objects. A tree with HTML::Element
|
|
objects as nodes can represent the syntax tree for a HTML document.
|
|
|
|
=head1 HOW WE REPRESENT TREES
|
|
|
|
Consider this HTML document:
|
|
|
|
<html lang='en-US'>
|
|
<head>
|
|
<title>Stuff</title>
|
|
<meta name='author' content='Jojo'>
|
|
</head>
|
|
<body>
|
|
<h1>I like potatoes!</h1>
|
|
</body>
|
|
</html>
|
|
|
|
Building a syntax tree out of it makes a tree-structure in memory
|
|
that could be diagrammed as:
|
|
|
|
html (lang='en-US')
|
|
/ \
|
|
/ \
|
|
/ \
|
|
head body
|
|
/\ \
|
|
/ \ \
|
|
/ \ \
|
|
title meta h1
|
|
| (name='author', |
|
|
"Stuff" content='Jojo') "I like potatoes"
|
|
|
|
This is the traditional way to diagram a tree, with the "root" at the
|
|
top, and it's this kind of diagram that people have in mind when they
|
|
say, for example, that "the meta element is under the head element
|
|
instead of under the body element". (The same is also said with
|
|
"inside" instead of "under" -- the use of "inside" makes more sense
|
|
when you're looking at the HTML source.)
|
|
|
|
Another way to represent the above tree is with indenting:
|
|
|
|
html (attributes: lang='en-US')
|
|
head
|
|
title
|
|
"Stuff"
|
|
meta (attributes: name='author' content='Jojo')
|
|
body
|
|
h1
|
|
"I like potatoes"
|
|
|
|
Incidentally, diagramming with indenting works much better for very
|
|
large trees, and is easier for a program to generate. The C<< $tree->dump >>
|
|
method uses indentation just that way.
|
|
|
|
However you diagram the tree, it's stored the same in memory -- it's a
|
|
network of objects, each of which has attributes like so:
|
|
|
|
element #1: _tag: 'html'
|
|
_parent: none
|
|
_content: [element #2, element #5]
|
|
lang: 'en-US'
|
|
|
|
element #2: _tag: 'head'
|
|
_parent: element #1
|
|
_content: [element #3, element #4]
|
|
|
|
element #3: _tag: 'title'
|
|
_parent: element #2
|
|
_content: [text segment "Stuff"]
|
|
|
|
element #4 _tag: 'meta'
|
|
_parent: element #2
|
|
_content: none
|
|
name: author
|
|
content: Jojo
|
|
|
|
element #5 _tag: 'body'
|
|
_parent: element #1
|
|
_content: [element #6]
|
|
|
|
element #6 _tag: 'h1'
|
|
_parent: element #5
|
|
_content: [text segment "I like potatoes"]
|
|
|
|
The "treeness" of the tree-structure that these elements comprise is
|
|
not an aspect of any particular object, but is emergent from the
|
|
relatedness attributes (_parent and _content) of these element-objects
|
|
and from how you use them to get from element to element.
|
|
|
|
While you could access the content of a tree by writing code that says
|
|
"access the 'src' attribute of the root's I<first> child's I<seventh>
|
|
child's I<third> child", you're more likely to have to scan the contents
|
|
of a tree, looking for whatever nodes, or kinds of nodes, you want to
|
|
do something with. The most straightforward way to look over a tree
|
|
is to "traverse" it; an HTML::Element method (C<< $h->traverse >>) is
|
|
provided for this purpose; and several other HTML::Element methods are
|
|
based on it.
|
|
|
|
(For everything you ever wanted to know about trees, and then some,
|
|
see Niklaus Wirth's I<Algorithms + Data Structures = Programs> or
|
|
Donald Knuth's I<The Art of Computer Programming, Volume 1>.)
|
|
|
|
=head2 Weak References
|
|
|
|
TL;DR summary: S<C<use HTML::TreeBuilder 5 -weak;>> and forget about
|
|
the C<delete> method (except for pruning a node from a tree).
|
|
|
|
Because HTML::Element stores a reference to the parent element, Perl's
|
|
reference-count garbage collection doesn't work properly with
|
|
HTML::Element trees. Starting with version 5.00, HTML::Element uses
|
|
weak references (if available) to prevent that problem. Weak
|
|
references were introduced in Perl 5.6.0, but you also need a version
|
|
of L<Scalar::Util> that provides the C<weaken> function.
|
|
|
|
Weak references are enabled by default. If you want to be certain
|
|
they're in use, you can say S<C<use HTML::Element 5 -weak;>>. You
|
|
must include the version number; previous versions of HTML::Element
|
|
ignored the import list entirely.
|
|
|
|
To disable weak references, you can say S<C<use HTML::Element -noweak;>>.
|
|
This is a global setting. B<This feature is deprecated> and is
|
|
provided only as a quick fix for broken code. If your code does not
|
|
work properly with weak references, you should fix it immediately, as
|
|
weak references may become mandatory in a future version. Generally,
|
|
all you need to do is keep a reference to the root of the tree until
|
|
you're done working with it.
|
|
|
|
Because HTML::TreeBuilder is a subclass of HTML::Element, you can also
|
|
import C<-weak> or C<-noweak> from HTML::TreeBuilder: e.g.
|
|
S<C<use HTML::TreeBuilder: 5 -weak;>>.
|
|
|
|
=head1 BASIC METHODS
|
|
|
|
=head2 new
|
|
|
|
$h = HTML::Element->new('tag', 'attrname' => 'value', ... );
|
|
|
|
This constructor method returns a new HTML::Element object. The tag
|
|
name is a required argument; it will be forced to lowercase.
|
|
Optionally, you can specify other initial attributes at object
|
|
creation time.
|
|
|
|
=head2 attr
|
|
|
|
$value = $h->attr('attr');
|
|
$old_value = $h->attr('attr', $new_value);
|
|
|
|
Returns (optionally sets) the value of the given attribute of C<$h>. The
|
|
attribute name (but not the value, if provided) is forced to
|
|
lowercase. If trying to read the value of an attribute not present
|
|
for this element, the return value is undef.
|
|
If setting a new value, the old value of that attribute is
|
|
returned.
|
|
|
|
If methods are provided for accessing an attribute (like C<< $h->tag >> for
|
|
"_tag", C<< $h->content_list >>, etc. below), use those instead of calling
|
|
attr C<< $h->attr >>, whether for reading or setting.
|
|
|
|
Note that setting an attribute to C<undef> (as opposed to "", the empty
|
|
string) actually deletes the attribute.
|
|
|
|
=head2 tag
|
|
|
|
$tagname = $h->tag();
|
|
$h->tag('tagname');
|
|
|
|
Returns (optionally sets) the tag name (also known as the generic
|
|
identifier) for the element C<$h>. In setting, the tag name is always
|
|
converted to lower case.
|
|
|
|
There are four kinds of "pseudo-elements" that show up as
|
|
HTML::Element objects:
|
|
|
|
=over
|
|
|
|
=item Comment pseudo-elements
|
|
|
|
These are element objects with a C<$h-E<gt>tag> value of "~comment",
|
|
and the content of the comment is stored in the "text" attribute
|
|
(C<$h-E<gt>attr("text")>). For example, parsing this code with
|
|
HTML::TreeBuilder...
|
|
|
|
<!-- I like Pie.
|
|
Pie is good
|
|
-->
|
|
|
|
produces an HTML::Element object with these attributes:
|
|
|
|
"_tag",
|
|
"~comment",
|
|
"text",
|
|
" I like Pie.\n Pie is good\n "
|
|
|
|
=item Declaration pseudo-elements
|
|
|
|
Declarations (rarely encountered) are represented as HTML::Element
|
|
objects with a tag name of "~declaration", and content in the "text"
|
|
attribute. For example, this:
|
|
|
|
<!DOCTYPE foo>
|
|
|
|
produces an element whose attributes include:
|
|
|
|
"_tag", "~declaration", "text", "DOCTYPE foo"
|
|
|
|
=item Processing instruction pseudo-elements
|
|
|
|
PIs (rarely encountered) are represented as HTML::Element objects with
|
|
a tag name of "~pi", and content in the "text" attribute. For
|
|
example, this:
|
|
|
|
<?stuff foo?>
|
|
|
|
produces an element whose attributes include:
|
|
|
|
"_tag", "~pi", "text", "stuff foo?"
|
|
|
|
(assuming a recent version of HTML::Parser)
|
|
|
|
=item ~literal pseudo-elements
|
|
|
|
These objects are not currently produced by HTML::TreeBuilder, but can
|
|
be used to represent a "super-literal" -- i.e., a literal you want to
|
|
be immune from escaping. (Yes, I just made that term up.)
|
|
|
|
That is, this is useful if you want to insert code into a tree that
|
|
you plan to dump out with C<as_HTML>, where you want, for some reason,
|
|
to suppress C<as_HTML>'s normal behavior of amp-quoting text segments.
|
|
|
|
For example, this:
|
|
|
|
my $literal = HTML::Element->new('~literal',
|
|
'text' => 'x < 4 & y > 7'
|
|
);
|
|
my $span = HTML::Element->new('span');
|
|
$span->push_content($literal);
|
|
print $span->as_HTML;
|
|
|
|
prints this:
|
|
|
|
<span>x < 4 & y > 7</span>
|
|
|
|
Whereas this:
|
|
|
|
my $span = HTML::Element->new('span');
|
|
$span->push_content('x < 4 & y > 7');
|
|
# normal text segment
|
|
print $span->as_HTML;
|
|
|
|
prints this:
|
|
|
|
<span>x < 4 & y > 7</span>
|
|
|
|
Unless you're inserting lots of pre-cooked code into existing trees,
|
|
and dumping them out again, it's not likely that you'll find
|
|
C<~literal> pseudo-elements useful.
|
|
|
|
=back
|
|
|
|
=head2 parent
|
|
|
|
$parent = $h->parent();
|
|
$h->parent($new_parent);
|
|
|
|
Returns (optionally sets) the parent (aka "container") for this element.
|
|
The parent should either be undef, or should be another element.
|
|
|
|
You B<should not> use this to directly set the parent of an element.
|
|
Instead use any of the other methods under "Structure-Modifying
|
|
Methods", below.
|
|
|
|
Note that C<< not($h->parent) >> is a simple test for whether C<$h> is the
|
|
root of its subtree.
|
|
|
|
=head2 content_list
|
|
|
|
@content = $h->content_list();
|
|
$num_children = $h->content_list();
|
|
|
|
Returns a list of the child nodes of this element -- i.e., what
|
|
nodes (elements or text segments) are inside/under this element. (Note
|
|
that this may be an empty list.)
|
|
|
|
In a scalar context, this returns the count of the items,
|
|
as you may expect.
|
|
|
|
=head2 content
|
|
|
|
$content_array_ref = $h->content(); # may return undef
|
|
|
|
This somewhat deprecated method returns the content of this element;
|
|
but unlike content_list, this returns either undef (which you should
|
|
understand to mean no content), or a I<reference to the array> of
|
|
content items, each of which is either a text segment (a string, i.e.,
|
|
a defined non-reference scalar value), or an HTML::Element object.
|
|
Note that even if an arrayref is returned, it may be a reference to an
|
|
empty array.
|
|
|
|
While older code should feel free to continue to use C<< $h->content >>,
|
|
new code should use C<< $h->content_list >> in almost all conceivable
|
|
cases. It is my experience that in most cases this leads to simpler
|
|
code anyway, since it means one can say:
|
|
|
|
@children = $h->content_list;
|
|
|
|
instead of the inelegant:
|
|
|
|
@children = @{$h->content || []};
|
|
|
|
If you do use C<< $h->content >> (or C<< $h->content_array_ref >>), you should not
|
|
use the reference returned by it (assuming it returned a reference,
|
|
and not undef) to directly set or change the content of an element or
|
|
text segment! Instead use L<content_refs_list> or any of the other
|
|
methods under "Structure-Modifying Methods", below.
|
|
|
|
=head2 content_array_ref
|
|
|
|
$content_array_ref = $h->content_array_ref(); # never undef
|
|
|
|
This is like C<content> (with all its caveats and deprecations) except
|
|
that it is guaranteed to return an array reference. That is, if the
|
|
given node has no C<_content> attribute, the C<content> method would
|
|
return that undef, but C<content_array_ref> would set the given node's
|
|
C<_content> value to C<[]> (a reference to a new, empty array), and
|
|
return that.
|
|
|
|
=head2 content_refs_list
|
|
|
|
@content_refs = $h->content_refs_list;
|
|
|
|
This returns a list of scalar references to each element of C<$h>'s
|
|
content list. This is useful in case you want to in-place edit any
|
|
large text segments without having to get a copy of the current value
|
|
of that segment value, modify that copy, then use the
|
|
C<splice_content> to replace the old with the new. Instead, here you
|
|
can in-place edit:
|
|
|
|
foreach my $item_r ($h->content_refs_list) {
|
|
next if ref $$item_r;
|
|
$$item_r =~ s/honour/honor/g;
|
|
}
|
|
|
|
You I<could> currently achieve the same affect with:
|
|
|
|
foreach my $item (@{ $h->content_array_ref }) {
|
|
# deprecated!
|
|
next if ref $item;
|
|
$item =~ s/honour/honor/g;
|
|
}
|
|
|
|
...except that using the return value of C<< $h->content >> or
|
|
C<< $h->content_array_ref >> to do that is deprecated, and just might stop
|
|
working in the future.
|
|
|
|
=head2 implicit
|
|
|
|
$is_implicit = $h->implicit();
|
|
$h->implicit($make_implicit);
|
|
|
|
Returns (optionally sets) the "_implicit" attribute. This attribute is
|
|
a flag that's used for indicating that the element was not originally
|
|
present in the source, but was added to the parse tree (by
|
|
HTML::TreeBuilder, for example) in order to conform to the rules of
|
|
HTML structure.
|
|
|
|
=head2 pos
|
|
|
|
$pos = $h->pos();
|
|
$h->pos($element);
|
|
|
|
Returns (and optionally sets) the "_pos" (for "current I<pos>ition")
|
|
pointer of C<$h>. This attribute is a pointer used during some
|
|
parsing operations, whose value is whatever HTML::Element element
|
|
at or under C<$h> is currently "open", where C<< $h->insert_element(NEW) >>
|
|
will actually insert a new element.
|
|
|
|
(This has nothing to do with the Perl function called C<pos>, for
|
|
controlling where regular expression matching starts.)
|
|
|
|
If you set C<< $h->pos($element) >>, be sure that C<$element> is
|
|
either C<$h>, or an element under C<$h>.
|
|
|
|
If you've been modifying the tree under C<$h> and are no longer
|
|
sure C<< $h->pos >> is valid, you can enforce validity with:
|
|
|
|
$h->pos(undef) unless $h->pos->is_inside($h);
|
|
|
|
=head2 all_attr
|
|
|
|
%attr = $h->all_attr();
|
|
|
|
Returns all this element's attributes and values, as key-value pairs.
|
|
This will include any "internal" attributes (i.e., ones not present
|
|
in the original element, and which will not be represented if/when you
|
|
call C<< $h->as_HTML >>). Internal attributes are distinguished by the fact
|
|
that the first character of their key (not value! key!) is an
|
|
underscore ("_").
|
|
|
|
Example output of C<< $h->all_attr() >> :
|
|
C<'_parent', >I<[object_value]>C< , '_tag', 'em', 'lang', 'en-US',
|
|
'_content', >I<[array-ref value]>.
|
|
|
|
=head2 all_attr_names
|
|
|
|
@names = $h->all_attr_names();
|
|
$num_attrs = $h->all_attr_names();
|
|
|
|
Like C<all_attr>, but only returns the names of the attributes.
|
|
In scalar context, returns the number of attributes.
|
|
|
|
Example output of C<< $h->all_attr_names() >> :
|
|
C<'_parent', '_tag', 'lang', '_content', >.
|
|
|
|
=head2 all_external_attr
|
|
|
|
%attr = $h->all_external_attr();
|
|
|
|
Like C<all_attr>, except that internal attributes are not present.
|
|
|
|
=head2 all_external_attr_names
|
|
|
|
@names = $h->all_external_attr_names();
|
|
$num_attrs = $h->all_external_attr_names();
|
|
|
|
Like C<all_attr_names>, except that internal attributes' names
|
|
are not present (or counted).
|
|
|
|
=head2 id
|
|
|
|
$id = $h->id();
|
|
$h->id($string);
|
|
|
|
Returns (optionally sets to C<$string>) the "id" attribute.
|
|
C<< $h->id(undef) >> deletes the "id" attribute.
|
|
|
|
C<< $h->id(...) >> is basically equivalent to C<< $h->attr('id', ...) >>,
|
|
except that when setting the attribute, this method returns the new value,
|
|
not the old value.
|
|
|
|
=head2 idf
|
|
|
|
$id = $h->idf();
|
|
$h->idf($string);
|
|
|
|
Just like the C<id> method, except that if you call C<< $h->idf() >> and
|
|
no "id" attribute is defined for this element, then it's set to a
|
|
likely-to-be-unique value, and returned. (The "f" is for "force".)
|
|
|
|
=head1 STRUCTURE-MODIFYING METHODS
|
|
|
|
These methods are provided for modifying the content of trees
|
|
by adding or changing nodes as parents or children of other nodes.
|
|
|
|
=head2 push_content
|
|
|
|
$h->push_content($element_or_text, ...);
|
|
|
|
Adds the specified items to the I<end> of the content list of the
|
|
element C<$h>. The items of content to be added should each be either a
|
|
text segment (a string), an HTML::Element object, or an arrayref.
|
|
Arrayrefs are fed thru C<< $h->new_from_lol(that_arrayref) >> to
|
|
convert them into elements, before being added to the content
|
|
list of C<$h>. This means you can say things concise things like:
|
|
|
|
$body->push_content(
|
|
['br'],
|
|
['ul',
|
|
map ['li', $_], qw(Peaches Apples Pears Mangos)
|
|
]
|
|
);
|
|
|
|
See the L</new_from_lol> method's documentation, far below, for more
|
|
explanation.
|
|
|
|
Returns C<$h> (the element itself).
|
|
|
|
The push_content method will try to consolidate adjacent text segments
|
|
while adding to the content list. That's to say, if C<$h>'s C<content_list> is
|
|
|
|
('foo bar ', $some_node, 'baz!')
|
|
|
|
and you call
|
|
|
|
$h->push_content('quack?');
|
|
|
|
then the resulting content list will be this:
|
|
|
|
('foo bar ', $some_node, 'baz!quack?')
|
|
|
|
and not this:
|
|
|
|
('foo bar ', $some_node, 'baz!', 'quack?')
|
|
|
|
If that latter is what you want, you'll have to override the
|
|
feature of consolidating text by using splice_content,
|
|
as in:
|
|
|
|
$h->splice_content(scalar($h->content_list),0,'quack?');
|
|
|
|
Similarly, if you wanted to add 'Skronk' to the beginning of
|
|
the content list, calling this:
|
|
|
|
$h->unshift_content('Skronk');
|
|
|
|
then the resulting content list will be this:
|
|
|
|
('Skronkfoo bar ', $some_node, 'baz!')
|
|
|
|
and not this:
|
|
|
|
('Skronk', 'foo bar ', $some_node, 'baz!')
|
|
|
|
What you'd to do get the latter is:
|
|
|
|
$h->splice_content(0,0,'Skronk');
|
|
|
|
=head2 unshift_content
|
|
|
|
$h->unshift_content($element_or_text, ...)
|
|
|
|
Just like C<push_content>, but adds to the I<beginning> of the C<$h>
|
|
element's content list.
|
|
|
|
The items of content to be added should each be
|
|
either a text segment (a string), an HTML::Element object, or
|
|
an arrayref (which is fed thru C<new_from_lol>).
|
|
|
|
The unshift_content method will try to consolidate adjacent text segments
|
|
while adding to the content list. See above for a discussion of this.
|
|
|
|
Returns C<$h> (the element itself).
|
|
|
|
=head2 splice_content
|
|
|
|
@removed = $h->splice_content($offset, $length,
|
|
$element_or_text, ...);
|
|
|
|
Detaches the elements from C<$h>'s list of content-nodes, starting at
|
|
C<$offset> and continuing for C<$length> items, replacing them with the
|
|
elements of the following list, if any. Returns the elements (if any)
|
|
removed from the content-list. If C<$offset> is negative, then it starts
|
|
that far from the end of the array, just like Perl's normal C<splice>
|
|
function. If C<$length> and the following list is omitted, removes
|
|
everything from C<$offset> onward.
|
|
|
|
The items of content to be added (if any) should each be either a text
|
|
segment (a string), an arrayref (which is fed thru L</new_from_lol>),
|
|
or an HTML::Element object that's not already
|
|
a child of C<$h>.
|
|
|
|
=head2 detach
|
|
|
|
$old_parent = $h->detach();
|
|
|
|
This unlinks C<$h> from its parent, by setting its 'parent' attribute to
|
|
undef, and by removing it from the content list of its parent (if it
|
|
had one). The return value is the parent that was detached from (or
|
|
undef, if C<$h> had no parent to start with). Note that neither C<$h> nor
|
|
its parent are explicitly destroyed.
|
|
|
|
=head2 detach_content
|
|
|
|
@old_content = $h->detach_content();
|
|
|
|
This unlinks all of C<$h>'s children from C<$h>, and returns them.
|
|
Note that these are not explicitly destroyed; for that, you
|
|
can just use C<< $h->delete_content >>.
|
|
|
|
=head2 replace_with
|
|
|
|
$h->replace_with( $element_or_text, ... )
|
|
|
|
This replaces C<$h> in its parent's content list with the nodes
|
|
specified. The element C<$h> (which by then may have no parent)
|
|
is returned. This causes a fatal error if C<$h> has no parent.
|
|
The list of nodes to insert may contain C<$h>, but at most once.
|
|
Aside from that possible exception, the nodes to insert should not
|
|
already be children of C<$h>'s parent.
|
|
|
|
Also, note that this method does not destroy C<$h> if weak references are
|
|
turned off -- use C<< $h->replace_with(...)->delete >> if you need that.
|
|
|
|
=head2 preinsert
|
|
|
|
$h->preinsert($element_or_text...);
|
|
|
|
Inserts the given nodes right BEFORE C<$h> in C<$h>'s parent's
|
|
content list. This causes a fatal error if C<$h> has no parent.
|
|
None of the given nodes should be C<$h> or other children of C<$h>.
|
|
Returns C<$h>.
|
|
|
|
=head2 postinsert
|
|
|
|
$h->postinsert($element_or_text...)
|
|
|
|
Inserts the given nodes right AFTER C<$h> in C<$h>'s parent's content
|
|
list. This causes a fatal error if C<$h> has no parent. None of
|
|
the given nodes should be C<$h> or other children of C<$h>. Returns
|
|
C<$h>.
|
|
|
|
=head2 replace_with_content
|
|
|
|
$h->replace_with_content();
|
|
|
|
This replaces C<$h> in its parent's content list with its own content.
|
|
The element C<$h> (which by then has no parent or content of its own) is
|
|
returned. This causes a fatal error if C<$h> has no parent. Also, note
|
|
that this does not destroy C<$h> if weak references are turned off -- use
|
|
C<< $h->replace_with_content->delete >> if you need that.
|
|
|
|
=head2 delete_content
|
|
|
|
$h->delete_content();
|
|
$h->destroy_content(); # alias
|
|
|
|
Clears the content of C<$h>, calling C<< $h->delete >> for each content
|
|
element. Compare with C<< $h->detach_content >>.
|
|
|
|
Returns C<$h>.
|
|
|
|
C<destroy_content> is an alias for this method.
|
|
|
|
=head2 delete
|
|
|
|
$h->delete();
|
|
$h->destroy(); # alias
|
|
|
|
Detaches this element from its parent (if it has one) and explicitly
|
|
destroys the element and all its descendants. The return value is
|
|
the empty list (or C<undef> in scalar context).
|
|
|
|
Before version 5.00 of HTML::Element, you had to call C<delete> when
|
|
you were finished with the tree, or your program would leak memory.
|
|
This is no longer necessary if weak references are enabled, see
|
|
L</"Weak References">.
|
|
|
|
=head2 destroy
|
|
|
|
An alias for L</delete>.
|
|
|
|
=head2 destroy_content
|
|
|
|
An alias for L</delete_content>.
|
|
|
|
=head2 clone
|
|
|
|
$copy = $h->clone();
|
|
|
|
Returns a copy of the element (whose children are clones (recursively)
|
|
of the original's children, if any).
|
|
|
|
The returned element is parentless. Any '_pos' attributes present in the
|
|
source element/tree will be absent in the copy. For that and other reasons,
|
|
the clone of an HTML::TreeBuilder object that's in mid-parse (i.e, the head
|
|
of a tree that HTML::TreeBuilder is elaborating) cannot (currently) be used
|
|
to continue the parse.
|
|
|
|
You are free to clone HTML::TreeBuilder trees, just as long as:
|
|
1) they're done being parsed, or 2) you don't expect to resume parsing
|
|
into the clone. (You can continue parsing into the original; it is
|
|
never affected.)
|
|
|
|
=head2 clone_list
|
|
|
|
@copies = HTML::Element->clone_list(...nodes...);
|
|
|
|
Returns a list consisting of a copy of each node given.
|
|
Text segments are simply copied; elements are cloned by
|
|
calling C<< $it->clone >> on each of them.
|
|
|
|
Note that this must be called as a class method, not as an instance
|
|
method. C<clone_list> will croak if called as an instance method.
|
|
You can also call it like so:
|
|
|
|
ref($h)->clone_list(...nodes...)
|
|
|
|
=head2 normalize_content
|
|
|
|
$h->normalize_content
|
|
|
|
Normalizes the content of C<$h> -- i.e., concatenates any adjacent
|
|
text nodes. (Any undefined text segments are turned into empty-strings.)
|
|
Note that this does not recurse into C<$h>'s descendants.
|
|
|
|
=head2 delete_ignorable_whitespace
|
|
|
|
$h->delete_ignorable_whitespace()
|
|
|
|
This traverses under C<$h> and deletes any text segments that are ignorable
|
|
whitespace. You should not use this if C<$h> is under a C<< <pre> >> element.
|
|
|
|
=head2 insert_element
|
|
|
|
$h->insert_element($element, $implicit);
|
|
|
|
Inserts (via push_content) a new element under the element at
|
|
C<< $h->pos() >>. Then updates C<< $h->pos() >> to point to the inserted
|
|
element, unless $element is a prototypically empty element like
|
|
C<< <br> >>, C<< <hr> >>, C<< <img> >>, etc.
|
|
The new C<< $h->pos() >> is returned. This
|
|
method is useful only if your particular tree task involves setting
|
|
C<< $h->pos() >>.
|
|
|
|
=head1 DUMPING METHODS
|
|
|
|
=head2 dump
|
|
|
|
$h->dump()
|
|
$h->dump(*FH) ; # or *FH{IO} or $fh_obj
|
|
|
|
Prints the element and all its children to STDOUT (or to a specified
|
|
filehandle), in a format useful
|
|
only for debugging. The structure of the document is shown by
|
|
indentation (no end tags).
|
|
|
|
=head2 as_HTML
|
|
|
|
$s = $h->as_HTML();
|
|
$s = $h->as_HTML($entities);
|
|
$s = $h->as_HTML($entities, $indent_char);
|
|
$s = $h->as_HTML($entities, $indent_char, \%optional_end_tags);
|
|
|
|
Returns a string representing in HTML the element and its
|
|
descendants. The optional argument C<$entities> specifies a string of
|
|
the entities to encode. For compatibility with previous versions,
|
|
specify C<'E<lt>E<gt>&'> here. If omitted or undef, I<all> unsafe
|
|
characters are encoded as HTML entities. See L<HTML::Entities> for
|
|
details. If passed an empty string, no entities are encoded.
|
|
|
|
If $indent_char is specified and defined, the HTML to be output is
|
|
intented, using the string you specify (which you probably should
|
|
set to "\t", or some number of spaces, if you specify it).
|
|
|
|
If C<\%optional_end_tags> is specified and defined, it should be
|
|
a reference to a hash that holds a true value for every tag name
|
|
whose end tag is optional. Defaults to
|
|
C<\%HTML::Element::optionalEndTag>, which is an alias to
|
|
C<%HTML::Tagset::optionalEndTag>, which, at time of writing, contains
|
|
true values for C<p, li, dt, dd>. A useful value to pass is an empty
|
|
hashref, C<{}>, which means that no end-tags are optional for this dump.
|
|
Otherwise, possibly consider copying C<%HTML::Tagset::optionalEndTag> to a
|
|
hash of your own, adding or deleting values as you like, and passing
|
|
a reference to that hash.
|
|
|
|
=head2 as_text
|
|
|
|
$s = $h->as_text();
|
|
$s = $h->as_text(skip_dels => 1);
|
|
|
|
Returns a string consisting of only the text parts of the element's
|
|
descendants. Any whitespace inside the element is included unchanged,
|
|
but whitespace not in the tree is never added. But remember that
|
|
whitespace may be ignored or compacted by HTML::TreeBuilder during
|
|
parsing (depending on the value of the C<ignore_ignorable_whitespace>
|
|
and C<no_space_compacting> attributes). Also, since whitespace is
|
|
never added during parsing,
|
|
|
|
HTML::TreeBuilder->new_from_content("<p>a</p><p>b</p>")
|
|
->as_text;
|
|
|
|
returns C<"ab">, not C<"a b"> or C<"a\nb">.
|
|
|
|
Text under C<< <script> >> or C<< <style> >> elements is never
|
|
included in what's returned. If C<skip_dels> is true, then text
|
|
content under C<< <del> >> nodes is not included in what's returned.
|
|
|
|
=head2 as_trimmed_text
|
|
|
|
$s = $h->as_trimmed_text(...);
|
|
$s = $h->as_trimmed_text(extra_chars => '\xA0'); # remove
|
|
$s = $h->as_text_trimmed(...); # alias
|
|
|
|
This is just like C<as_text(...)> except that leading and trailing
|
|
whitespace is deleted, and any internal whitespace is collapsed.
|
|
|
|
This will not remove non-breaking spaces, Unicode spaces, or any other
|
|
non-ASCII whitespace unless you supply the extra characters as
|
|
a string argument (e.g. C<< $h->as_trimmed_text(extra_chars => '\xA0') >>).
|
|
C<extra_chars> may be any string that can appear inside a character
|
|
class, including ranges like C<a-z>, POSIX character classes like
|
|
C<[:alpha:]>, and character class escapes like C<\p{Zs}>.
|
|
|
|
=head2 as_XML
|
|
|
|
$s = $h->as_XML()
|
|
|
|
Returns a string representing in XML the element and its descendants.
|
|
|
|
The XML is not indented.
|
|
|
|
=head2 as_Lisp_form
|
|
|
|
$s = $h->as_Lisp_form();
|
|
|
|
Returns a string representing the element and its descendants as a
|
|
Lisp form. Unsafe characters are encoded as octal escapes.
|
|
|
|
The Lisp form is indented, and contains external ("href", etc.) as
|
|
well as internal attributes ("_tag", "_content", "_implicit", etc.),
|
|
except for "_parent", which is omitted.
|
|
|
|
Current example output for a given element:
|
|
|
|
("_tag" "img" "border" "0" "src" "pie.png" "usemap" "#main.map")
|
|
|
|
=head2 format
|
|
|
|
$s = $h->format; # use HTML::FormatText
|
|
$s = $h->format($formatter);
|
|
|
|
Formats text output. Defaults to HTML::FormatText.
|
|
|
|
Takes a second argument that is a reference to a formatter.
|
|
|
|
=head2 starttag
|
|
|
|
$start = $h->starttag();
|
|
$start = $h->starttag($entities);
|
|
|
|
Returns a string representing the complete start tag for the element.
|
|
I.e., leading "<", tag name, attributes, and trailing ">".
|
|
All values are surrounded with
|
|
double-quotes, and appropriate characters are encoded. If C<$entities>
|
|
is omitted or undef, I<all> unsafe characters are encoded as HTML
|
|
entities. See L<HTML::Entities> for details. If you specify some
|
|
value for C<$entities>, remember to include the double-quote character in
|
|
it. (Previous versions of this module would basically behave as if
|
|
C<'&"E<gt>'> were specified for C<$entities>.) If C<$entities> is
|
|
an empty string, no entity is escaped.
|
|
|
|
=head2 starttag_XML
|
|
|
|
$start = $h->starttag_XML();
|
|
|
|
Returns a string representing the complete start tag for the element.
|
|
|
|
=head2 endtag
|
|
|
|
$end = $h->endtag();
|
|
|
|
Returns a string representing the complete end tag for this element.
|
|
I.e., "</", tag name, and ">".
|
|
|
|
=head2 endtag_XML
|
|
|
|
$end = $h->endtag_XML();
|
|
|
|
Returns a string representing the complete end tag for this element.
|
|
I.e., "</", tag name, and ">".
|
|
|
|
=head1 SECONDARY STRUCTURAL METHODS
|
|
|
|
These methods all involve some structural aspect of the tree;
|
|
either they report some aspect of the tree's structure, or they involve
|
|
traversal down the tree, or walking up the tree.
|
|
|
|
=head2 is_inside
|
|
|
|
$inside = $h->is_inside('tag', $element, ...);
|
|
|
|
Returns true if the C<$h> element is, or is contained anywhere inside an
|
|
element that is any of the ones listed, or whose tag name is any of
|
|
the tag names listed. You can use any mix of elements and tag names.
|
|
|
|
=head2 is_empty
|
|
|
|
$empty = $h->is_empty();
|
|
|
|
Returns true if C<$h> has no content, i.e., has no elements or text
|
|
segments under it. In other words, this returns true if C<$h> is a leaf
|
|
node, AKA a terminal node. Do not confuse this sense of "empty" with
|
|
another sense that it can have in SGML/HTML/XML terminology, which
|
|
means that the element in question is of the type (like HTML's C<< <hr> >>,
|
|
C<< <br> >>, C<< <img> >>, etc.) that I<can't> have any content.
|
|
|
|
That is, a particular C<< <p> >> element may happen to have no content, so
|
|
$that_p_element->is_empty will be true -- even though the prototypical
|
|
C<< <p> >> element isn't "empty" (not in the way that the prototypical
|
|
C<< <hr> >> element is).
|
|
|
|
If you think this might make for potentially confusing code, consider
|
|
simply using the clearer exact equivalent: C<< not($h->content_list) >>.
|
|
|
|
=head2 pindex
|
|
|
|
$index = $h->pindex();
|
|
|
|
Return the index of the element in its parent's contents array, such
|
|
that C<$h> would equal
|
|
|
|
$h->parent->content->[$h->pindex]
|
|
# or
|
|
($h->parent->content_list)[$h->pindex]
|
|
|
|
assuming C<$h> isn't root. If the element C<$h> is root, then
|
|
C<< $h->pindex >> returns C<undef>.
|
|
|
|
=head2 left
|
|
|
|
$element = $h->left();
|
|
@elements = $h->left();
|
|
|
|
In scalar context: returns the node that's the immediate left sibling
|
|
of C<$h>. If C<$h> is the leftmost (or only) child of its parent (or has no
|
|
parent), then this returns undef.
|
|
|
|
In list context: returns all the nodes that're the left siblings of C<$h>
|
|
(starting with the leftmost). If C<$h> is the leftmost (or only) child
|
|
of its parent (or has no parent), then this returns an empty list.
|
|
|
|
(See also C<< $h->preinsert(LIST) >>.)
|
|
|
|
=head2 right
|
|
|
|
$element = $h->right();
|
|
@elements = $h->right();
|
|
|
|
In scalar context: returns the node that's the immediate right sibling
|
|
of C<$h>. If C<$h> is the rightmost (or only) child of its parent (or has
|
|
no parent), then this returns C<undef>.
|
|
|
|
In list context: returns all the nodes that're the right siblings of
|
|
C<$h>, starting with the leftmost. If C<$h> is the rightmost (or only) child
|
|
of its parent (or has no parent), then this returns an empty list.
|
|
|
|
(See also C<< $h->postinsert(LIST) >>.)
|
|
|
|
=head2 address
|
|
|
|
$address = $h->address();
|
|
$element_or_text = $h->address($address);
|
|
|
|
The first form (with no parameter) returns a string representing the
|
|
location of C<$h> in the tree it is a member of.
|
|
The address consists of numbers joined by a '.', starting with '0',
|
|
and followed by the pindexes of the nodes in the tree that are
|
|
ancestors of C<$h>, starting from the top.
|
|
|
|
So if the way to get to a node starting at the root is to go to child
|
|
2 of the root, then child 10 of that, and then child 0 of that, and
|
|
then you're there -- then that node's address is "0.2.10.0".
|
|
|
|
As a bit of a special case, the address of the root is simply "0".
|
|
|
|
I forsee this being used mainly for debugging, but you may
|
|
find your own uses for it.
|
|
|
|
$element_or_text = $h->address($address);
|
|
|
|
This form returns the node (whether element or text-segment) at
|
|
the given address in the tree that C<$h> is a part of. (That is,
|
|
the address is resolved starting from C<< $h->root >>.)
|
|
|
|
If there is no node at the given address, this returns C<undef>.
|
|
|
|
You can specify "relative addressing" (i.e., that indexing is supposed
|
|
to start from C<$h> and not from C<< $h->root >>) by having the address start
|
|
with a period -- e.g., C<< $h->address(".3.2") >> will look at child 3 of C<$h>,
|
|
and child 2 of that.
|
|
|
|
=head2 depth
|
|
|
|
$depth = $h->depth();
|
|
|
|
Returns a number expressing C<$h>'s depth within its tree, i.e., how many
|
|
steps away it is from the root. If C<$h> has no parent (i.e., is root),
|
|
its depth is 0.
|
|
|
|
=head2 root
|
|
|
|
$root = $h->root();
|
|
|
|
Returns the element that's the top of C<$h>'s tree. If C<$h> is
|
|
root, this just returns C<$h>. (If you want to test whether C<$h>
|
|
I<is> the root, instead of asking what its root is, just test
|
|
C<< not($h->parent) >>.)
|
|
|
|
=head2 lineage
|
|
|
|
@lineage = $h->lineage();
|
|
|
|
Returns the list of C<$h>'s ancestors, starting with its parent,
|
|
and then that parent's parent, and so on, up to the root. If C<$h>
|
|
is root, this returns an empty list.
|
|
|
|
If you simply want a count of the number of elements in C<$h>'s lineage,
|
|
use C<< $h->depth >>.
|
|
|
|
=head2 lineage_tag_names
|
|
|
|
@names = $h->lineage_tag_names();
|
|
|
|
Returns the list of the tag names of C<$h>'s ancestors, starting
|
|
with its parent, and that parent's parent, and so on, up to the
|
|
root. If C<$h> is root, this returns an empty list.
|
|
Example output: C<('em', 'td', 'tr', 'table', 'body', 'html')>
|
|
|
|
Equivalent to:
|
|
|
|
map { $_->tag } $h->lineage;
|
|
|
|
=head2 descendants
|
|
|
|
@descendants = $h->descendants();
|
|
|
|
In list context, returns the list of all C<$h>'s descendant elements,
|
|
listed in pre-order (i.e., an element appears before its
|
|
content-elements). Text segments DO NOT appear in the list.
|
|
In scalar context, returns a count of all such elements.
|
|
|
|
=head2 descendents
|
|
|
|
This is just an alias to the C<descendants> method, for people who
|
|
can't spell.
|
|
|
|
=head2 find_by_tag_name
|
|
|
|
@elements = $h->find_by_tag_name('tag', ...);
|
|
$first_match = $h->find_by_tag_name('tag', ...);
|
|
|
|
In list context, returns a list of elements at or under C<$h> that have
|
|
any of the specified tag names. In scalar context, returns the first
|
|
(in pre-order traversal of the tree) such element found, or undef if
|
|
none.
|
|
|
|
=head2 find
|
|
|
|
This is just an alias to C<find_by_tag_name>. (There was once
|
|
going to be a whole find_* family of methods, but then C<look_down>
|
|
filled that niche, so there turned out not to be much reason for the
|
|
verboseness of the name "find_by_tag_name".)
|
|
|
|
=head2 find_by_attribute
|
|
|
|
@elements = $h->find_by_attribute('attribute', 'value');
|
|
$first_match = $h->find_by_attribute('attribute', 'value');
|
|
|
|
In a list context, returns a list of elements at or under C<$h> that have
|
|
the specified attribute, and have the given value for that attribute.
|
|
In a scalar context, returns the first (in pre-order traversal of the
|
|
tree) such element found, or undef if none.
|
|
|
|
This method is B<deprecated> in favor of the more expressive
|
|
C<look_down> method, which new code should use instead.
|
|
|
|
=head2 look_down
|
|
|
|
@elements = $h->look_down( ...criteria... );
|
|
$first_match = $h->look_down( ...criteria... );
|
|
|
|
This starts at C<$h> and looks thru its element descendants (in
|
|
pre-order), looking for elements matching the criteria you specify.
|
|
In list context, returns all elements that match all the given
|
|
criteria; in scalar context, returns the first such element (or undef,
|
|
if nothing matched).
|
|
|
|
There are three kinds of criteria you can specify:
|
|
|
|
=over
|
|
|
|
=item (attr_name, attr_value)
|
|
|
|
This means you're looking for an element with that value for that
|
|
attribute. Example: C<"alt", "pix!">. Consider that you can search
|
|
on internal attribute values too: C<"_tag", "p">.
|
|
|
|
=item (attr_name, qr/.../)
|
|
|
|
This means you're looking for an element whose value for that
|
|
attribute matches the specified Regexp object.
|
|
|
|
=item a coderef
|
|
|
|
This means you're looking for elements where coderef->(each_element)
|
|
returns true. Example:
|
|
|
|
my @wide_pix_images = $h->look_down(
|
|
_tag => "img",
|
|
alt => "pix!",
|
|
sub { $_[0]->attr('width') > 350 }
|
|
);
|
|
|
|
=back
|
|
|
|
Note that C<(attr_name, attr_value)> and C<(attr_name, qr/.../)>
|
|
criteria are almost always faster than coderef
|
|
criteria, so should presumably be put before them in your list of
|
|
criteria. That is, in the example above, the sub ref is called only
|
|
for elements that have already passed the criteria of having a "_tag"
|
|
attribute with value "img", and an "alt" attribute with value "pix!".
|
|
If the coderef were first, it would be called on every element, and
|
|
I<then> what elements pass that criterion (i.e., elements for which
|
|
the coderef returned true) would be checked for their "_tag" and "alt"
|
|
attributes.
|
|
|
|
Note that comparison of string attribute-values against the string
|
|
value in C<(attr_name, attr_value)> is case-INsensitive! A criterion
|
|
of C<('align', 'right')> I<will> match an element whose "align" value
|
|
is "RIGHT", or "right" or "rIGhT", etc.
|
|
|
|
Note also that C<look_down> considers "" (empty-string) and undef to
|
|
be different things, in attribute values. So this:
|
|
|
|
$h->look_down("alt", "")
|
|
|
|
will find elements I<with> an "alt" attribute, but where the value for
|
|
the "alt" attribute is "". But this:
|
|
|
|
$h->look_down("alt", undef)
|
|
|
|
is the same as:
|
|
|
|
$h->look_down(sub { !defined($_[0]->attr('alt')) } )
|
|
|
|
That is, it finds elements that do not have an "alt" attribute at all
|
|
(or that do have an "alt" attribute, but with a value of undef --
|
|
which is not normally possible).
|
|
|
|
Note that when you give several criteria, this is taken to mean you're
|
|
looking for elements that match I<all> your criterion, not just I<any>
|
|
of them. In other words, there is an implicit "and", not an "or". So
|
|
if you wanted to express that you wanted to find elements with a
|
|
"name" attribute with the value "foo" I<or> with an "id" attribute
|
|
with the value "baz", you'd have to do it like:
|
|
|
|
@them = $h->look_down(
|
|
sub {
|
|
# the lcs are to fold case
|
|
lc($_[0]->attr('name')) eq 'foo'
|
|
or lc($_[0]->attr('id')) eq 'baz'
|
|
}
|
|
);
|
|
|
|
Coderef criteria are more expressive than C<(attr_name, attr_value)>
|
|
and C<(attr_name, qr/.../)>
|
|
criteria, and all C<(attr_name, attr_value)>
|
|
and C<(attr_name, qr/.../)>
|
|
criteria could be
|
|
expressed in terms of coderefs. However, C<(attr_name, attr_value)>
|
|
and C<(attr_name, qr/.../)>
|
|
criteria are a convenient shorthand. (In fact, C<look_down> itself is
|
|
basically "shorthand" too, since anything you can do with C<look_down>
|
|
you could do by traversing the tree, either with the C<traverse>
|
|
method or with a routine of your own. However, C<look_down> often
|
|
makes for very concise and clear code.)
|
|
|
|
=head2 look_up
|
|
|
|
@elements = $h->look_up( ...criteria... );
|
|
$first_match = $h->look_up( ...criteria... );
|
|
|
|
This is identical to C<< $h->look_down >>, except that whereas
|
|
C<< $h->look_down >>
|
|
basically scans over the list:
|
|
|
|
($h, $h->descendants)
|
|
|
|
C<< $h->look_up >> instead scans over the list
|
|
|
|
($h, $h->lineage)
|
|
|
|
So, for example, this returns all ancestors of C<$h> (possibly including
|
|
C<$h> itself) that are C<< <td> >> elements with an "align" attribute with a
|
|
value of "right" (or "RIGHT", etc.):
|
|
|
|
$h->look_up("_tag", "td", "align", "right");
|
|
|
|
=head2 traverse
|
|
|
|
$h->traverse(...options...)
|
|
|
|
Lengthy discussion of HTML::Element's unnecessary and confusing
|
|
C<traverse> method has been moved to a separate file:
|
|
L<HTML::Element::traverse>
|
|
|
|
=head2 attr_get_i
|
|
|
|
@values = $h->attr_get_i('attribute');
|
|
$first_value = $h->attr_get_i('attribute');
|
|
|
|
In list context, returns a list consisting of the values of the given
|
|
attribute for C<$h> and for all its ancestors starting from C<$h> and
|
|
working its way up. Nodes with no such attribute are skipped.
|
|
("attr_get_i" stands for "attribute get, with inheritance".)
|
|
In scalar context, returns the first such value, or undef if none.
|
|
|
|
Consider a document consisting of:
|
|
|
|
<html lang='i-klingon'>
|
|
<head><title>Pati Pata</title></head>
|
|
<body>
|
|
<h1 lang='la'>Stuff</h1>
|
|
<p lang='es-MX' align='center'>
|
|
Foo bar baz <cite>Quux</cite>.
|
|
</p>
|
|
<p>Hooboy.</p>
|
|
</body>
|
|
</html>
|
|
|
|
If C<$h> is the C<< <cite> >> element, C<< $h->attr_get_i("lang") >>
|
|
in list context will return the list C<('es-MX', 'i-klingon')>.
|
|
In scalar context, it will return the value C<'es-MX'>.
|
|
|
|
If you call with multiple attribute names...
|
|
|
|
@values = $h->attr_get_i('a1', 'a2', 'a3');
|
|
$first_value = $h->attr_get_i('a1', 'a2', 'a3');
|
|
|
|
...in list context, this will return a list consisting of
|
|
the values of these attributes which exist in C<$h> and its ancestors.
|
|
In scalar context, this returns the first value (i.e., the value of
|
|
the first existing attribute from the first element that has
|
|
any of the attributes listed). So, in the above example,
|
|
|
|
$h->attr_get_i('lang', 'align');
|
|
|
|
will return:
|
|
|
|
('es-MX', 'center', 'i-klingon') # in list context
|
|
or
|
|
'es-MX' # in scalar context.
|
|
|
|
But note that this:
|
|
|
|
$h->attr_get_i('align', 'lang');
|
|
|
|
will return:
|
|
|
|
('center', 'es-MX', 'i-klingon') # in list context
|
|
or
|
|
'center' # in scalar context.
|
|
|
|
=head2 tagname_map
|
|
|
|
$hash_ref = $h->tagname_map();
|
|
|
|
Scans across C<$h> and all its descendants, and makes a hash (a
|
|
reference to which is returned) where each entry consists of a key
|
|
that's a tag name, and a value that's a reference to a list to all
|
|
elements that have that tag name. I.e., this method returns:
|
|
|
|
{
|
|
# Across $h and all descendants...
|
|
'a' => [ ...list of all <a> elements... ],
|
|
'em' => [ ...list of all <em> elements... ],
|
|
'img' => [ ...list of all <img> elements... ],
|
|
}
|
|
|
|
(There are entries in the hash for only those tagnames that occur
|
|
at/under C<$h> -- so if there's no C<< <img> >> elements, there'll be no
|
|
"img" entry in the returned hashref.)
|
|
|
|
Example usage:
|
|
|
|
my $map_r = $h->tagname_map();
|
|
my @heading_tags = sort grep m/^h\d$/s, keys %$map_r;
|
|
if(@heading_tags) {
|
|
print "Heading levels used: @heading_tags\n";
|
|
} else {
|
|
print "No headings.\n"
|
|
}
|
|
|
|
=head2 extract_links
|
|
|
|
$links_array_ref = $h->extract_links();
|
|
$links_array_ref = $h->extract_links(@wantedTypes);
|
|
|
|
Returns links found by traversing the element and all of its children
|
|
and looking for attributes (like "href" in an C<< <a> >> element, or "src" in
|
|
an C<< <img> >> element) whose values represent links. The return value is a
|
|
I<reference> to an array. Each element of the array is reference to
|
|
an array with I<four> items: the link-value, the element that has the
|
|
attribute with that link-value, and the name of that attribute, and
|
|
the tagname of that element.
|
|
(Example: C<['http://www.suck.com/',> I<$elem_obj> C<, 'href', 'a']>.)
|
|
You may or may not end up using the
|
|
element itself -- for some purposes, you may use only the link value.
|
|
|
|
You might specify that you want to extract links from just some kinds
|
|
of elements (instead of the default, which is to extract links from
|
|
I<all> the kinds of elements known to have attributes whose values
|
|
represent links). For instance, if you want to extract links from
|
|
only C<< <a> >> and C<< <img> >> elements, you could code it like this:
|
|
|
|
for (@{ $e->extract_links('a', 'img') }) {
|
|
my($link, $element, $attr, $tag) = @$_;
|
|
print
|
|
"Hey, there's a $tag that links to ",
|
|
$link, ", in its $attr attribute, at ",
|
|
$element->address(), ".\n";
|
|
}
|
|
|
|
=head2 simplify_pres
|
|
|
|
$h->simplify_pres();
|
|
|
|
In text bits under PRE elements that are at/under C<$h>, this routine
|
|
nativizes all newlines, and expands all tabs.
|
|
|
|
That is, if you read a file with lines delimited by C<\cm\cj>'s, the
|
|
text under PRE areas will have C<\cm\cj>'s instead of C<\n>'s. Calling
|
|
C<< $h->simplify_pres >> on such a tree will turn C<\cm\cj>'s into
|
|
C<\n>'s.
|
|
|
|
Tabs are expanded to however many spaces it takes to get
|
|
to the next 8th column -- the usual way of expanding them.
|
|
|
|
=head2 same_as
|
|
|
|
$equal = $h->same_as($i)
|
|
|
|
Returns true if C<$h> and C<$i> are both elements representing the same tree
|
|
of elements, each with the same tag name, with the same explicit
|
|
attributes (i.e., not counting attributes whose names start with "_"),
|
|
and with the same content (textual, comments, etc.).
|
|
|
|
Sameness of descendant elements is tested, recursively, with
|
|
C<< $child1->same_as($child_2) >>, and sameness of text segments is tested
|
|
with C<$segment1 eq $segment2>.
|
|
|
|
=head2 new_from_lol
|
|
|
|
$h = HTML::Element->new_from_lol($array_ref);
|
|
@elements = HTML::Element->new_from_lol($array_ref, ...);
|
|
|
|
Resursively constructs a tree of nodes, based on the (non-cyclic)
|
|
data structure represented by each C<$array_ref>, where that is a reference
|
|
to an array of arrays (of arrays (of arrays (etc.))).
|
|
|
|
In each arrayref in that structure, different kinds of values are
|
|
treated as follows:
|
|
|
|
=over
|
|
|
|
=item * Arrayrefs
|
|
|
|
Arrayrefs are considered to
|
|
designate a sub-tree representing children for the node constructed
|
|
from the current arrayref.
|
|
|
|
=item * Hashrefs
|
|
|
|
Hashrefs are considered to contain
|
|
attribute-value pairs to add to the element to be constructed from
|
|
the current arrayref
|
|
|
|
=item * Text segments
|
|
|
|
Text segments at the start of any arrayref
|
|
will be considered to specify the name of the element to be
|
|
constructed from the current arrayref; all other text segments will
|
|
be considered to specify text segments as children for the current
|
|
arrayref.
|
|
|
|
=item * Elements
|
|
|
|
Existing element objects are either inserted into the treelet
|
|
constructed, or clones of them are. That is, when the lol-tree is
|
|
being traversed and elements constructed based what's in it, if
|
|
an existing element object is found, if it has no parent, then it is
|
|
added directly to the treelet constructed; but if it has a parent,
|
|
then C<$that_node-E<gt>clone> is added to the treelet at the
|
|
appropriate place.
|
|
|
|
=back
|
|
|
|
An example will hopefully make this more obvious:
|
|
|
|
my $h = HTML::Element->new_from_lol(
|
|
['html',
|
|
['head',
|
|
[ 'title', 'I like stuff!' ],
|
|
],
|
|
['body',
|
|
{'lang', 'en-JP', _implicit => 1},
|
|
'stuff',
|
|
['p', 'um, p < 4!', {'class' => 'par123'}],
|
|
['div', {foo => 'bar'}, '123'],
|
|
]
|
|
]
|
|
);
|
|
$h->dump;
|
|
|
|
Will print this:
|
|
|
|
<html> @0
|
|
<head> @0.0
|
|
<title> @0.0.0
|
|
"I like stuff!"
|
|
<body lang="en-JP"> @0.1 (IMPLICIT)
|
|
"stuff"
|
|
<p class="par123"> @0.1.1
|
|
"um, p < 4!"
|
|
<div foo="bar"> @0.1.2
|
|
"123"
|
|
|
|
And printing $h->as_HTML will give something like:
|
|
|
|
<html><head><title>I like stuff!</title></head>
|
|
<body lang="en-JP">stuff<p class="par123">um, p < 4!
|
|
<div foo="bar">123</div></body></html>
|
|
|
|
You can even do fancy things with C<map>:
|
|
|
|
$body->push_content(
|
|
# push_content implicitly calls new_from_lol on arrayrefs...
|
|
['br'],
|
|
['blockquote',
|
|
['h2', 'Pictures!'],
|
|
map ['p', $_],
|
|
$body2->look_down("_tag", "img"),
|
|
# images, to be copied from that other tree.
|
|
],
|
|
# and more stuff:
|
|
['ul',
|
|
map ['li', ['a', {'href'=>"$_.png"}, $_ ] ],
|
|
qw(Peaches Apples Pears Mangos)
|
|
],
|
|
);
|
|
|
|
In scalar context, you must supply exactly one arrayref. In list
|
|
context, you can pass a list of arrayrefs, and L<new_from_lol> will
|
|
return a list of elements, one for each arrayref.
|
|
|
|
@elements = HTML::Element->new_from_lol(
|
|
['hr'],
|
|
['p', 'And there, on the door, was a hook!'],
|
|
);
|
|
# constructs two elements.
|
|
|
|
=head2 objectify_text
|
|
|
|
$h->objectify_text();
|
|
|
|
This turns any text nodes under C<$h> from mere text segments (strings)
|
|
into real objects, pseudo-elements with a tag-name of "~text", and the
|
|
actual text content in an attribute called "text". (For a discussion
|
|
of pseudo-elements, see the L</"tag"> method, far above.) This method is
|
|
provided because, for some purposes, it is convenient or necessary to
|
|
be able, for a given text node, to ask what element is its parent; and
|
|
clearly this is not possible if a node is just a text string.
|
|
|
|
Note that these "~text" objects are not recognized as text nodes by
|
|
methods like L</as_text>. Presumably you will want to call
|
|
C<< $h->objectify_text >>, perform whatever task that you needed that for,
|
|
and then call C<< $h->deobjectify_text >> before calling anything like
|
|
C<< $h->as_text >>.
|
|
|
|
=head2 deobjectify_text
|
|
|
|
$h->deobjectify_text();
|
|
|
|
This undoes the effect of C<< $h->objectify_text >>. That is, it takes any
|
|
"~text" pseudo-elements in the tree at/under C<$h>, and deletes each one,
|
|
replacing each with the content of its "text" attribute.
|
|
|
|
Note that if C<$h> itself is a "~text" pseudo-element, it will be
|
|
destroyed -- a condition you may need to treat specially in your
|
|
calling code (since it means you can't very well do anything with C<$h>
|
|
after that). So that you can detect that condition, if C<$h> is itself a
|
|
"~text" pseudo-element, then this method returns the value of the
|
|
"text" attribute, which should be a defined value; in all other cases,
|
|
it returns undef.
|
|
|
|
(This method assumes that no "~text" pseudo-element has any children.)
|
|
|
|
=head2 number_lists
|
|
|
|
$h->number_lists();
|
|
|
|
For every UL, OL, DIR, and MENU element at/under C<$h>, this sets a
|
|
"_bullet" attribute for every child LI element. For LI children of an
|
|
OL, the "_bullet" attribute's value will be something like "4.", "d.",
|
|
"D.", "IV.", or "iv.", depending on the OL element's "type" attribute.
|
|
LI children of a UL, DIR, or MENU get their "_bullet" attribute set
|
|
to "*".
|
|
There should be no other LIs (i.e., except as children of OL, UL, DIR,
|
|
or MENU elements), and if there are, they are unaffected.
|
|
|
|
=head2 has_insane_linkage
|
|
|
|
$h->has_insane_linkage
|
|
|
|
This method is for testing whether this element or the elements
|
|
under it have linkage attributes (_parent and _content) whose values
|
|
are deeply aberrant: if there are undefs in a content list; if an
|
|
element appears in the content lists of more than one element;
|
|
if the _parent attribute of an element doesn't match its actual
|
|
parent; or if an element appears as its own descendant (i.e.,
|
|
if there is a cyclicity in the tree).
|
|
|
|
This returns empty list (or false, in scalar context) if the subtree's
|
|
linkage methods are sane; otherwise it returns two items (or true, in
|
|
scalar context): the element where the error occurred, and a string
|
|
describing the error.
|
|
|
|
This method is provided is mainly for debugging and troubleshooting --
|
|
it should be I<quite impossible> for any document constructed via
|
|
HTML::TreeBuilder to parse into a non-sane tree (since it's not
|
|
the content of the tree per se that's in question, but whether
|
|
the tree in memory was properly constructed); and it I<should> be
|
|
impossible for you to produce an insane tree just thru reasonable
|
|
use of normal documented structure-modifying methods. But if you're
|
|
constructing your own trees, and your program is going into infinite
|
|
loops as during calls to traverse() or any of the secondary
|
|
structural methods, as part of debugging, consider calling
|
|
C<has_insane_linkage> on the tree.
|
|
|
|
=head2 element_class
|
|
|
|
$classname = $h->element_class();
|
|
|
|
This method returns the class which will be used for new elements. It
|
|
defaults to HTML::Element, but can be overridden by subclassing or esoteric
|
|
means best left to those will will read the source and then not complain when
|
|
those esoteric means change. (Just subclass.)
|
|
|
|
=head1 CLASS METHODS
|
|
|
|
=head2 Use_Weak_Refs
|
|
|
|
$enabled = HTML::Element->Use_Weak_Refs;
|
|
HTML::Element->Use_Weak_Refs( $enabled );
|
|
|
|
This method allows you to check whether weak reference support is
|
|
enabled, and to enable or disable it. For details, see L</"Weak References">.
|
|
C<$enabled> is true if weak references are enabled.
|
|
|
|
You should not switch this in the middle of your program, and you
|
|
probably shouldn't use it at all. Existing trees are not affected by
|
|
this method (until you start modifying nodes in them).
|
|
|
|
Throws an exception if you attempt to enable weak references and your
|
|
Perl or Scalar::Util does not support them.
|
|
|
|
Disabling weak reference support is deprecated.
|
|
|
|
=head1 SUBROUTINES
|
|
|
|
=head2 Version
|
|
|
|
This subroutine is deprecated. Please use the standard VERSION method
|
|
(e.g. C<< HTML::Element->VERSION >>) instead.
|
|
|
|
=head2 ABORT OK PRUNE PRUNE_SOFTLY PRUNE_UP
|
|
|
|
Constants for signalling back to the traverser
|
|
|
|
=for Pod::Coverage as_text_trimmed
|
|
|
|
=head1 BUGS
|
|
|
|
* If you want to free the memory associated with a tree built of
|
|
HTML::Element nodes, and you have disabled weak references, then you
|
|
will have to delete it explicitly using the L</delete> method.
|
|
See L</"Weak References">.
|
|
|
|
* There's almost nothing to stop you from making a "tree" with
|
|
cyclicities (loops) in it, which could, for example, make the
|
|
traverse method go into an infinite loop. So don't make
|
|
cyclicities! (If all you're doing is parsing HTML files,
|
|
and looking at the resulting trees, this will never be a problem
|
|
for you.)
|
|
|
|
* There's no way to represent comments or processing directives
|
|
in a tree with HTML::Elements. Not yet, at least.
|
|
|
|
* There's (currently) nothing to stop you from using an undefined
|
|
value as a text segment. If you're running under C<perl -w>, however,
|
|
this may make HTML::Element's code produce a slew of warnings.
|
|
|
|
=head1 NOTES ON SUBCLASSING
|
|
|
|
You are welcome to derive subclasses from HTML::Element, but you
|
|
should be aware that the code in HTML::Element makes certain
|
|
assumptions about elements (and I'm using "element" to mean ONLY an
|
|
object of class HTML::Element, or of a subclass of HTML::Element):
|
|
|
|
* The value of an element's _parent attribute must either be undef or
|
|
otherwise false, or must be an element.
|
|
|
|
* The value of an element's _content attribute must either be undef or
|
|
otherwise false, or a reference to an (unblessed) array. The array
|
|
may be empty; but if it has items, they must ALL be either mere
|
|
strings (text segments), or elements.
|
|
|
|
* The value of an element's _tag attribute should, at least, be a
|
|
string of printable characters.
|
|
|
|
Moreover, bear these rules in mind:
|
|
|
|
* Do not break encapsulation on objects. That is, access their
|
|
contents only thru $obj->attr or more specific methods.
|
|
|
|
* You should think twice before completely overriding any of the
|
|
methods that HTML::Element provides. (Overriding with a method that
|
|
calls the superclass method is not so bad, though.)
|
|
|
|
=head1 SEE ALSO
|
|
|
|
L<HTML::Tree>; L<HTML::TreeBuilder>; L<HTML::AsSubs>; L<HTML::Tagset>;
|
|
and, for the morbidly curious, L<HTML::Element::traverse>.
|
|
|
|
=head1 ACKNOWLEDGEMENTS
|
|
|
|
Thanks to Mark-Jason Dominus for a POD suggestion.
|
|
|
|
=head1 AUTHOR
|
|
|
|
Current maintainers:
|
|
|
|
=over
|
|
|
|
=item * Christopher J. Madsen S<C<< <perl AT cjmweb.net> >>>
|
|
|
|
=item * Jeff Fearn S<C<< <jfearn AT cpan.org> >>>
|
|
|
|
=back
|
|
|
|
Original HTML-Tree author:
|
|
|
|
=over
|
|
|
|
=item * Gisle Aas
|
|
|
|
=back
|
|
|
|
Former maintainers:
|
|
|
|
=over
|
|
|
|
=item * Sean M. Burke
|
|
|
|
=item * Andy Lester
|
|
|
|
=item * Pete Krawczyk S<C<< <petek AT cpan.org> >>>
|
|
|
|
=back
|
|
|
|
You can follow or contribute to HTML-Tree's development at
|
|
L<< https://github.com/kentfredric/HTML-Tree >>.
|
|
|
|
=head1 COPYRIGHT AND LICENSE
|
|
|
|
Copyright 1995-1998 Gisle Aas, 1999-2004 Sean M. Burke,
|
|
2005 Andy Lester, 2006 Pete Krawczyk, 2010 Jeff Fearn,
|
|
2012 Christopher J. Madsen.
|
|
|
|
This library is free software; you can redistribute it and/or
|
|
modify it under the same terms as Perl itself.
|
|
|
|
The programs in this library are distributed in the hope that they
|
|
will be useful, but without any warranty; without even the implied
|
|
warranty of merchantability or fitness for a particular purpose.
|
|
|
|
=cut
|