Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 3 additions & 4 deletions Parser.pm
Original file line number Diff line number Diff line change
Expand Up @@ -810,8 +810,9 @@ Tokens causes a reference to an array of token strings to be passed.
The strings are exactly as they were found in the original text,
no decoding or case changes are applied.

For C<declaration> events, the array contains each word, comment, and
delimited string starting with the declaration type.
For C<declaration> events, the array contains each word, comment,
internal DTD subset (including its enclosing square brackets), and
delimited string, starting with the declaration type.

For C<comment> events, this contains each sub-comment. If
$p->strict_comments is disabled, there will be only one sub-comment.
Expand Down Expand Up @@ -885,8 +886,6 @@ Example:
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"
"http://www.w3.org/TR/html4/strict.dtd">

DTDs inside <!DOCTYPE ...> will confuse HTML::Parser.

=item C<default>

This event is triggered for events that do not have a specific
Expand Down
137 changes: 137 additions & 0 deletions hparser.c
Original file line number Diff line number Diff line change
Expand Up @@ -957,6 +957,88 @@ parse_comment(PSTATE* p_state, char *beg, char *end, U32 utf8, SV* self)
return 0;
}

/*
 * Locate the end of a "<!-- ... -->" comment that starts at `beg`.
 *
 * `beg` is expected to point at the opening '<' (the first character is
 * not itself inspected); `end` points one past the last readable byte.
 *
 * Returns a pointer to the '>' that closes the comment.  If the text at
 * `beg` does not open with "<!--", or if no terminator is found before
 * `end`, `beg` is returned unchanged so that the caller can tell that
 * nothing was skipped.  The function never returns NULL.
 *
 * Which terminator is accepted depends on the parser state:
 *   - strict_comment:           SGML style, "--" toggles in and out of
 *                               comment text and '>' only ends the
 *                               comment while outside of it;
 *   - no_dash_dash_comment_end: the first '>' ends the comment;
 *   - otherwise:                the first match of /--\s*>/ ends it.
 */
static char*
skip_to_end_comment(PSTATE* p_state, char *beg, char *end)
{
    char *s = beg;

    /* nothing to look at; also keeps the ++s below inside the buffer */
    if (beg >= end)
        return beg;

    /* require "!--" right after the leading character */
    ++s;
    if (s == end || *s != '!')
        return beg;
    ++s;
    if (s == end || *s != '-')
        return beg;
    ++s;
    if (s == end || *s != '-')
        return beg;

    /* s is deliberately left on the second '-' of the opening "<!--" */

    if (p_state->strict_comment) {
        int inside = 1;   /* true while we are between a pair of "--" */

        while (1) {
            /* advance to the next '-' or '>' */
            while (s < end && *s != '-' && *s != '>')
                s++;

            if (s == end)
                return beg;

            if (*s == '>') {
                if (!inside)
                    return s;
                /* a '>' inside comment text is just data */
                s++;
                continue;
            }

            /* *s is '-': check whether it starts a "--" pair */
            s++;
            if (s == end)
                return beg;

            if (*s == '-') {
                /* two dashes in a row seen: flip inside/outside */
                s++;
                inside = !inside;
            }
        }
    }

    if (p_state->no_dash_dash_comment_end) {
        /* a lone '>' signals end-of-comment */
        while (s < end && *s != '>')
            s++;
        return (s == end) ? beg : s;
    }

    /* non-strict comment: locate /--\s*>/ which signals end-of-comment */
    while (s < end) {
        char *dash;

        while (s < end && *s != '-')
            s++;
        if (s == end)
            break;

        dash = s;
        s++;
        /* every dereference below is guarded by s < end so that we never
         * depend on a sentinel byte being present at `end` */
        if (s < end && *s == '-') {
            s++;
            while (s < end && isHSPACE(*s))
                s++;
            if (s < end && *s == '>')
                return s;
        }

        /* No terminator starting at `dash`.  Resume right after it rather
         * than after the characters consumed above; otherwise an odd run
         * of dashes such as "--->" would hide the real terminator. */
        s = dash + 1;
    }

    /* ran out of input before the comment was closed */
    return beg;
}


#ifdef MARKED_SECTION

Expand Down Expand Up @@ -1184,6 +1266,61 @@ parse_decl(PSTATE* p_state, char *beg, char *end, U32 utf8, SV* self)
s++;
PUSH_TOKEN(str_beg, s);
}
else if (*s == '[') {
/* internal DTD section between square brackets */
char *intdtd_beg = s;
s++;
if (s == end)
goto PREMATURE;
while (s < end && *s != ']') { /* get the internal dtd - beware of nested comments and strings maybe containing a ] and > chars */
if (*s == '"' || *s == '\'' || (*s == '`' && p_state->backquote)) { /* skip over a quoted string */
char *str_beg = s;
s++;
while (s < end && *s != *str_beg)
s++;
if (s == end)
goto PREMATURE;
}
else if (*s == '-') { /* comments inside entity decl <!ENTITY bracket "]" -- a ] char --> */
s++;
if (s == end)
goto PREMATURE;
if (*s != '-')
goto FAIL;
s++;
while (1) {
while (s < end && *s != '-')
s++;
if (s == end)
goto PREMATURE;
s++;
if (s == end)
goto PREMATURE;
if (*s == '-')
break;
}
if (s == end)
goto PREMATURE;
}
else if (*s == '<') { /* and skip over the <!-- commment --> */
s = skip_to_end_comment(p_state, s, end);
}
else if (isALPHA(*s)) { /* skip over any word maybe containing hyphens */
s++;
while (s < end && isHNOT_SPACE_GT(*s))
s++;
if (s == end)
goto PREMATURE;
}
s++;
if (s == end)
goto PREMATURE;
}
s++;
if (s == end)
goto PREMATURE;
PUSH_TOKEN(intdtd_beg, s);
}
else if (*s == '-') {
/* comment */
char *com_beg = s;
Expand Down
28 changes: 27 additions & 1 deletion t/declaration.t
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
use Test::More tests => 2;
use Test::More tests => 3;

use HTML::Parser;
my $res = "";
Expand Down Expand Up @@ -60,3 +60,29 @@ is($res, <<EOT);
<[]>]
EOT

$res = "";
$p->parse(<<EOT)->eof;
<!DOCTYPE pbi SYSTEM "pbi.dtd" [
<!-- the [internal] dtd -->
<!ENTITY brackets "[]">
<!ENTITY my-ent "<!-- [foo] -->" -- [comment] -->
<!ENTITY dbl--bar " -- [bar] -- ">
<!-- end of internal dtd -->
] >
<pbi>[Content]</pbi>
EOT
is($res, <<EOT);
[<DOCTYPE>
<pbi>
<SYSTEM>
<"pbi.dtd">
<[
<!-- the [internal] dtd -->
<!ENTITY brackets "[]">
<!ENTITY my-ent "<!-- [foo] -->" -- [comment] -->
<!ENTITY dbl--bar " -- [bar] -- ">
<!-- end of internal dtd -->
]>]
<pbi>[Content]</pbi>
EOT