diff --git a/Parser.pm b/Parser.pm
index 9c2fc21..9e8d196 100644
--- a/Parser.pm
+++ b/Parser.pm
@@ -810,8 +810,9 @@ Tokens causes a reference to an array of token strings to be passed.
 The strings are exactly as they were found in the original text,
 no decoding or case changes are applied.
 
-For C events, the array contains each word, comment, and
-delimited string starting with the declaration type.
+For C events, the array contains each word, comment,
+internal DTD (including square brackets) and delimited string,
+starting with the declaration type.
 
 For C events, this contains each sub-comment. If
 $p->strict_comments is disabled, there will be only one sub-comment.
@@ -885,8 +886,6 @@ Example:
-DTDs inside will confuse HTML::Parser.
-
 =item C
 
 This event is triggered for events that do not have a specific
diff --git a/hparser.c b/hparser.c
index 3b2461e..c4666cf 100644
--- a/hparser.c
+++ b/hparser.c
@@ -957,6 +957,100 @@ parse_comment(PSTATE* p_state, char *beg, char *end, U32 utf8, SV* self)
     return 0;
 }
 
+/*
+ * Skip over a "<!-- ... -->" comment that starts at beg (the '<').
+ *
+ * Which terminator is accepted depends on p_state:
+ *  - strict_comment: every "--" toggles between comment text and the
+ *    space between comments; the first '>' outside comment text ends it.
+ *  - no_dash_dash_comment_end: the first '>' ends it.
+ *  - otherwise: "--", then any isHSPACE characters, then '>' ends it.
+ *
+ * Returns a pointer to the closing '>'.  Returns beg when beg is not
+ * followed by "!--" or when no terminator is found before end, so the
+ * caller always gets back a usable pointer (never NULL).  Nothing at
+ * or beyond end is ever dereferenced.
+ */
+static char*
+skip_to_end_comment(PSTATE* p_state, char *beg, char *end)
+{
+    char *s = beg;
+
+    ++s;
+    if (s == end || *s != '!')
+        return beg;
+    ++s;
+    if (s == end || *s != '-')
+        return beg;
+    ++s;
+    if (s == end || *s != '-')
+        return beg;
+
+    if (p_state->strict_comment) {
+        char *start_com = s;  /* also used to signal inside/outside */
+
+        while (1) {
+            /* try to locate "--" */
+          FIND_DASH_DASH:
+            while (s < end && *s != '-' && *s != '>')
+                s++;
+
+            if (s == end)
+                return beg;
+
+            if (*s == '>') {
+                if (! start_com)
+                    return s;
+                s++;
+                goto FIND_DASH_DASH;
+            }
+
+            s++;
+            if (s == end)
+                return beg;
+
+            if (*s == '-') {
+                /* two dashes in a row seen */
+                s++;
+                if (start_com)
+                    start_com = 0;
+                else
+                    start_com = s;
+            }
+        }
+    }
+    else if (p_state->no_dash_dash_comment_end) {
+        /* a lone '>' signals end-of-comment */
+        while (s < end && *s != '>')
+            s++;
+        if (s == end)
+            return beg;
+        return s;
+    }
+    else { /* non-strict comment */
+        /* try to locate /--\s*>/ which signals end-of-comment */
+      LOCATE_END:
+        while (s < end && *s != '-')
+            s++;
+        if (s < end) {
+            s++;
+            /* every look-ahead below is bounded by end */
+            if (s < end && *s == '-') {
+                s++;
+                while (s < end && isHSPACE(*s))
+                    s++;
+                if (s < end && *s == '>')
+                    return s;
+            }
+            if (s < end)
+                goto LOCATE_END;
+        }
+    }
+
+    /* ran out of input (s == end) without seeing a terminator */
+    return beg;
+}
+
 #ifdef MARKED_SECTION
 
 
@@ -1184,6 +1278,61 @@ parse_decl(PSTATE* p_state, char *beg, char *end, U32 utf8, SV* self)
             s++;
             PUSH_TOKEN(str_beg, s);
         }
+        else if (*s == '[') {
+            /* internal DTD section between square brackets */
+            char *intdtd_beg = s;
+            s++;
+            if (s == end)
+                goto PREMATURE;
+            while (s < end && *s != ']') { /* get the internal dtd - beware of nested comments and strings maybe containing a ] and > chars */
+                if (*s == '"' || *s == '\'' || (*s == '`' && p_state->backquote)) { /* skip over a quoted string */
+                    char *str_beg = s;
+                    s++;
+                    while (s < end && *s != *str_beg)
+                        s++;
+                    if (s == end)
+                        goto PREMATURE;
+                }
+                else if (*s == '-') { /* comments inside entity decl */
+                    s++;
+                    if (s == end)
+                        goto PREMATURE;
+                    if (*s != '-')
+                        goto FAIL;
+                    s++;
+                    while (1) {
+                        while (s < end && *s != '-')
+                            s++;
+                        if (s == end)
+                            goto PREMATURE;
+                        s++;
+                        if (s == end)
+                            goto PREMATURE;
+                        if (*s == '-')
+                            break;
+                    }
+                    if (s == end)
+                        goto PREMATURE;
+                }
+                else if (*s == '<') { /* and skip over the */
+                    s = skip_to_end_comment(p_state, s, end);
+                }
+                else if (isALPHA(*s)) { /* skip over any word maybe containing hyphens */
+                    s++;
+                    while (s < end && isHNOT_SPACE_GT(*s))
+                        s++;
+                    if (s == end)
+                        goto PREMATURE;
+                }
+                s++;
+                if (s == end)
+                    goto PREMATURE;
+            }
+            s++;
+            if (s == end)
+                goto PREMATURE;
+            PUSH_TOKEN(intdtd_beg, s);
+        }
         else if (*s == '-') {
             /* comment */
             char *com_beg = s;
diff --git a/t/declaration.t b/t/declaration.t
index 17de561..1818ae3 100644
--- a/t/declaration.t
+++ b/t/declaration.t
@@ -1,4 +1,4 @@
-use Test::More tests => 2;
+use Test::More tests => 3;
 
 use HTML::Parser;
 my $res = "";
@@ -60,3 +60,29 @@ is($res, <
 ]
 EOT
 
+$res = "";
+$p->parse(<eof;
+
+
+" -- [comment] -->
+
+
+] >
+[Content]
+EOT
+is($res, <
+
+
+<"pbi.dtd">
+<[
+
+
+" -- [comment] -->
+
+
+]>]
+[Content]
+EOT
+