From 26071fbc9bb48adcfd263e7284a567428fda2803 Mon Sep 17 00:00:00 2001 From: Paul Bijnens Date: Wed, 1 Jun 2016 16:09:36 +0200 Subject: [PATCH 1/4] Parse internal DTDs in doctype declaration --- Parser.pm | 7 +++---- hparser.c | 39 +++++++++++++++++++++++++++++++++++++++ t/declaration.t | 22 +++++++++++++++++++++- 3 files changed, 63 insertions(+), 5 deletions(-) diff --git a/Parser.pm b/Parser.pm index 9c2fc21..9e8d196 100644 --- a/Parser.pm +++ b/Parser.pm @@ -810,8 +810,9 @@ Tokens causes a reference to an array of token strings to be passed. The strings are exactly as they were found in the original text, no decoding or case changes are applied. -For C<declaration> events, the array contains each word, comment, and -delimited string starting with the declaration type. +For C<declaration> events, the array contains each word, comment, +internal dtd (including square brackets) and delimited string, +starting with the declaration type. For C<comment> events, this contains each sub-comment. If $p->strict_comments is disabled, there will be only one sub-comment. @@ -885,8 +886,6 @@ Example: -DTDs inside <!DOCTYPE ...> will confuse HTML::Parser.
- =item C This event is triggered for events that do not have a specific diff --git a/hparser.c b/hparser.c index 3b2461e..ca93f7c 100644 --- a/hparser.c +++ b/hparser.c @@ -1184,6 +1184,45 @@ parse_decl(PSTATE* p_state, char *beg, char *end, U32 utf8, SV* self) s++; PUSH_TOKEN(str_beg, s); } + else if (*s == '[') { + /* internal DTD section between square brackets */ + char *intdtd_beg = s; + while (s < end && *s != ']') { /* get the internal dtd - beware of nested comments and strings maybe containing a ] char */ + if (*s == '"' || *s == '\'' || (*s == '`' && p_state->backquote)) { /* skip over a quoted string */ + char *str_beg = s; + s++; + while (s < end && *s != *str_beg) + s++; + if (s == end) + goto PREMATURE; + } else if (*s == '-') { /* and skip over the commment */ + s++; + if (s == end) + goto PREMATURE; + if (*s != '-') + goto FAIL; + s++; + while (1) { + while (s < end && *s != '-') + s++; + if (s == end) + goto PREMATURE; + s++; + if (s == end) + goto PREMATURE; + if (*s == '-') + break; + } + } + s++; + if (s == end) + goto PREMATURE; + } + s++; + if (s == end) + goto PREMATURE; + PUSH_TOKEN(intdtd_beg, s); + } else if (*s == '-') { /* comment */ char *com_beg = s; diff --git a/t/declaration.t b/t/declaration.t index 17de561..f93807b 100644 --- a/t/declaration.t +++ b/t/declaration.t @@ -1,4 +1,4 @@ -use Test::More tests => 2; +use Test::More tests => 3; use HTML::Parser; my $res = ""; @@ -60,3 +60,23 @@ is($res, <] EOT +$res = ""; +$p->parse(<eof; + + +] > +[Content] +EOT +is($res, < + + +<"pbi.dtd"> +<[ + + +]>] +[Content] +EOT + From 2c39f542eddfd095e94aaabef6b15f2674c53c92 Mon Sep 17 00:00:00 2001 From: Paul Bijnens Date: Sat, 4 Jun 2016 20:32:30 +0200 Subject: [PATCH 2/4] correctly parse comments in the internal DTD --- hparser.c | 104 ++++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 85 insertions(+), 19 deletions(-) diff --git a/hparser.c b/hparser.c index ca93f7c..07af65d 100644 --- a/hparser.c +++ b/hparser.c @@ -957,6 
+957,88 @@ parse_comment(PSTATE* p_state, char *beg, char *end, U32 utf8, SV* self) return 0; } +static char* +skip_to_end_comment(PSTATE* p_state, char *beg, char *end) +{ + char *s = beg; + + ++s; + if (s == end || *s != '!') + return beg; + ++s; + if (s == end || *s != '-') + return beg; + ++s; + if (s == end || *s != '-') + return beg; + + if (p_state->strict_comment) { + char *start_com = s; /* also used to signal inside/outside */ + + while (1) { + /* try to locate "--" */ + FIND_DASH_DASH: + while (s < end && *s != '-' && *s != '>') + s++; + + if (s == end) + return beg; + + if (*s == '>') { + if (! start_com) + return s; + s++; + goto FIND_DASH_DASH; + } + + s++; + if (s == end) + return beg; + + if (*s == '-') { + /* two dashes in a row seen */ + s++; + if (start_com) + start_com = 0; + else + start_com = s; + } + } + } + else if (p_state->no_dash_dash_comment_end) { + /* a lone '>' signals end-of-comment */ + while (s < end && *s != '>') + s++; + if (s == end) + return beg; + return s; + } + else { /* non-strict comment */ + /* try to locate /--\s*>/ which signals end-of-comment */ + LOCATE_END: + while (s < end && *s != '-') + s++; + if (s < end) { + s++; + if (*s == '-') { + s++; + while (isHSPACE(*s)) + s++; + if (*s == '>') { + return s; + } + } + if (s < end) + goto LOCATE_END; + } + + if (s == end) + return beg; + } + + return 0; +} + #ifdef MARKED_SECTION @@ -1187,7 +1269,7 @@ parse_decl(PSTATE* p_state, char *beg, char *end, U32 utf8, SV* self) else if (*s == '[') { /* internal DTD section between square brackets */ char *intdtd_beg = s; - while (s < end && *s != ']') { /* get the internal dtd - beware of nested comments and strings maybe containing a ] char */ + while (s < end && *s != ']') { /* get the internal dtd - beware of nested comments and strings maybe containing a ] and > chars */ if (*s == '"' || *s == '\'' || (*s == '`' && p_state->backquote)) { /* skip over a quoted string */ char *str_beg = s; s++; @@ -1195,24 +1277,8 @@ 
parse_decl(PSTATE* p_state, char *beg, char *end, U32 utf8, SV* self) s++; if (s == end) goto PREMATURE; - } else if (*s == '-') { /* and skip over the commment */ - s++; - if (s == end) - goto PREMATURE; - if (*s != '-') - goto FAIL; - s++; - while (1) { - while (s < end && *s != '-') - s++; - if (s == end) - goto PREMATURE; - s++; - if (s == end) - goto PREMATURE; - if (*s == '-') - break; - } + } else if (*s == '<') { /* and skip over the commment */ + s = skip_to_end_comment(p_state, s, end); } s++; if (s == end) From dd630d4ee4582bbbf6fe1efcd13291ee6cc9d036 Mon Sep 17 00:00:00 2001 From: Paul Bijnens Date: Sat, 4 Jun 2016 20:39:34 +0200 Subject: [PATCH 3/4] test case internal dtd parsing --- t/declaration.t | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/t/declaration.t b/t/declaration.t index f93807b..1242fb6 100644 --- a/t/declaration.t +++ b/t/declaration.t @@ -65,6 +65,8 @@ $p->parse(<eof; +"> + ] > [Content] EOT @@ -76,6 +78,8 @@ is($res, < +"> + ]>] [Content] EOT From f3f1867f8ee0d0b07016d4e608fcc81eac23f9be Mon Sep 17 00:00:00 2001 From: Paul Bijnens Date: Sun, 5 Jun 2016 17:33:05 +0200 Subject: [PATCH 4/4] correctly handle comments inside entity decl and inside words --- hparser.c | 34 +++++++++++++++++++++++++++++++++- t/declaration.t | 6 ++++-- 2 files changed, 37 insertions(+), 3 deletions(-) diff --git a/hparser.c b/hparser.c index 07af65d..c4666cf 100644 --- a/hparser.c +++ b/hparser.c @@ -1269,6 +1269,9 @@ parse_decl(PSTATE* p_state, char *beg, char *end, U32 utf8, SV* self) else if (*s == '[') { /* internal DTD section between square brackets */ char *intdtd_beg = s; + s++; + if (s == end) + goto PREMATURE; while (s < end && *s != ']') { /* get the internal dtd - beware of nested comments and strings maybe containing a ] and > chars */ if (*s == '"' || *s == '\'' || (*s == '`' && p_state->backquote)) { /* skip over a quoted string */ char *str_beg = s; @@ -1277,9 +1280,38 @@ parse_decl(PSTATE* p_state, char *beg, char *end, U32 utf8, SV* 
self) s++; if (s == end) goto PREMATURE; - } else if (*s == '<') { /* and skip over the commment */ + } + else if (*s == '-') { /* comments inside entity decl */ + s++; + if (s == end) + goto PREMATURE; + if (*s != '-') + goto FAIL; + s++; + while (1) { + while (s < end && *s != '-') + s++; + if (s == end) + goto PREMATURE; + s++; + if (s == end) + goto PREMATURE; + if (*s == '-') + break; + } + if (s == end) + goto PREMATURE; + } + else if (*s == '<') { /* and skip over the */ s = skip_to_end_comment(p_state, s, end); } + else if (isALPHA(*s)) { /* skip over any word maybe containing hyphens */ + s++; + while (s < end && isHNOT_SPACE_GT(*s)) + s++; + if (s == end) + goto PREMATURE; + } s++; if (s == end) goto PREMATURE; diff --git a/t/declaration.t b/t/declaration.t index 1242fb6..1818ae3 100644 --- a/t/declaration.t +++ b/t/declaration.t @@ -65,7 +65,8 @@ $p->parse(<eof; -"> +" -- [comment] --> + ] > [Content] @@ -78,7 +79,8 @@ is($res, < -"> +" -- [comment] --> + ]>] [Content]