From db9dcc1ce5ceba544364b5b8ee4ab3798eb77836 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9=20=27Necoro=27=20Neumann?= Date: Tue, 28 Mar 2023 20:03:23 +0200 Subject: [PATCH 1/3] Support citation style links --- html2text.go | 53 ++++++++++++++++++++++++++++++++++++++++++++--- html2text_test.go | 52 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 102 insertions(+), 3 deletions(-) diff --git a/html2text.go b/html2text.go index 8fe9000..25bb00a 100644 --- a/html2text.go +++ b/html2text.go @@ -2,6 +2,7 @@ package html2text import ( "bytes" + "fmt" "io" "regexp" "strings" @@ -19,6 +20,7 @@ type Options struct { PrettyTablesOptions *PrettyTablesOptions // Configures pretty ASCII rendering for table elements. OmitLinks bool // Turns on omitting links TextOnly bool // Returns only plain text + CitationStyleLinks bool // Uses citation style links like [1] } // PrettyTablesOptions overrides tablewriter behaviors @@ -71,13 +73,18 @@ func FromHTMLNode(doc *html.Node, o ...Options) (string, error) { } ctx := textifyTraverseContext{ - buf: bytes.Buffer{}, - options: options, + buf: bytes.Buffer{}, + options: options, + citationMap: map[string]int{}, } if err := ctx.traverse(doc); err != nil { return "", err } + if ctx.options.CitationStyleLinks && ctx.citationCount > 0 { + ctx.emitCitations() + } + text := strings.TrimSpace(newlineRe.ReplaceAllString( strings.Replace(ctx.buf.String(), "\n ", "\n", -1), "\n\n"), ) @@ -125,6 +132,8 @@ type textifyTraverseContext struct { blockquoteLevel int lineLength int isPre bool + citationCount int + citationMap map[string]int } // tableTraverseContext holds table ASCII-form related context. @@ -268,7 +277,11 @@ func (ctx *textifyTraverseContext) handleElement(node *html.Node) error { attrVal = ctx.normalizeHrefLink(attrVal) // Don't print link href if it matches link element content or if the link is empty. if (attrVal != "" && linkText != attrVal) && !ctx.options.OmitLinks && !ctx.options.TextOnly { - hrefLink = "( " + attrVal + " )" + if ctx.options.CitationStyleLinks { + hrefLink = ctx.addCitation(attrVal) + } else { + hrefLink = "( " + attrVal + " )" + } } } @@ -516,6 +529,40 @@ func (ctx *textifyTraverseContext) normalizeHrefLink(link string) string { return link } +func formatCitation(idx int) string { + return fmt.Sprintf("[%d] ", idx) +} + +func (ctx *textifyTraverseContext) addCitation(url string) string { + idx, ok := ctx.citationMap[url] + + if !ok { + ctx.citationCount += 1 + idx = ctx.citationCount + ctx.citationMap[url] = idx + } + + return formatCitation(idx) +} + +func (ctx *textifyTraverseContext) emitCitations() { + // this method writes to the buffer directly instead of using `emit`, b/c we do not want to split long links + ctx.buf.WriteString("\n\n") + + // citations are ordered by link --> bring them into the correct order first + links := make([]string, ctx.citationCount) + + for k, v := range ctx.citationMap { + links[v-1] = k // arrays are 0-based, our citations are 1-based + } + + for i, link := range links { + ctx.buf.WriteString(formatCitation(i + 1)) + ctx.buf.WriteString(link) + ctx.buf.WriteByte('\n') + } +} + // renderEachChild visits each direct child of a node and collects the sequence of // textuual representaitons separated by a single newline. func (ctx *textifyTraverseContext) renderEachChild(node *html.Node) (string, error) { diff --git a/html2text_test.go b/html2text_test.go index 452b45e..e256c68 100644 --- a/html2text_test.go +++ b/html2text_test.go @@ -521,6 +521,58 @@ func TestOmitLinks(t *testing.T) { } } +func TestCitationStyleLinks(t *testing.T) { + testCases := []struct { + input string + output string + }{ + { + ``, + ``, + }, + { + ``, + ``, + }, + { + ``, + "[1] \n\n[1] http://example.com/", + }, + { + `Link`, + "Link", + }, + { + `Link1Link2`, + "Link1 [1] Link2 [2] \n\n[1] http://example1.com/\n[2] http://example2.com/", + }, + { + `Link1Link1 again`, + "Link1 [1] Link1 again [1] \n\n[1] http://example1.com/", + }, + { + `Link`, + "Link [1] \n\n[1] http://example.com/", + }, + { + "\n\tLink\n\t", + "Link [1] \n\n[1] http://example.com/", + }, + { + `Example`, + "Example [1] \n\n[1] http://example.com/", + }, + } + + for _, testCase := range testCases { + if msg, err := wantString(testCase.input, testCase.output, Options{CitationStyleLinks: true}); err != nil { + t.Error(err) + } else if len(msg) > 0 { + t.Log(msg) + } + } +} + func TestImageAltTags(t *testing.T) { testCases := []struct { input string From da410a0926ab5f2f06fd010345f7e16034df18d9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9=20=27Necoro=27=20Neumann?= Date: Sun, 10 May 2020 22:42:36 +0200 Subject: [PATCH 2/3] Do not add spurious spaces after citation reference --- html2text.go | 3 ++- html2text_test.go | 14 +++++++------- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/html2text.go b/html2text.go index 25bb00a..e61ddd0 100644 --- a/html2text.go +++ b/html2text.go @@ -530,7 +530,7 @@ func (ctx *textifyTraverseContext) normalizeHrefLink(link string) string { } func formatCitation(idx int) string { - return fmt.Sprintf("[%d] ", idx) + return fmt.Sprintf("[%d]", idx) } func (ctx *textifyTraverseContext) addCitation(url string) string { @@ -558,6 +558,7 @@ func (ctx *textifyTraverseContext) emitCitations() { for i, link := range links { ctx.buf.WriteString(formatCitation(i + 1)) + ctx.buf.WriteByte(' ') ctx.buf.WriteString(link) ctx.buf.WriteByte('\n') } diff --git a/html2text_test.go b/html2text_test.go index e256c68..71558a5 100644 --- a/html2text_test.go +++ b/html2text_test.go @@ -536,31 +536,31 @@ func TestCitationStyleLinks(t *testing.T) { }, { ``, - "[1] \n\n[1] http://example.com/", + "[1]\n\n[1] http://example.com/", }, { `Link`, "Link", }, { - `Link1Link2`, - "Link1 [1] Link2 [2] \n\n[1] http://example1.com/\n[2] http://example2.com/", + `Link1? Link2!`, + "Link1 [1]? Link2 [2]!\n\n[1] http://example1.com/\n[2] http://example2.com/", }, { `Link1Link1 again`, - "Link1 [1] Link1 again [1] \n\n[1] http://example1.com/", + "Link1 [1] Link1 again [1]\n\n[1] http://example1.com/", }, { `Link`, - "Link [1] \n\n[1] http://example.com/", + "Link [1]\n\n[1] http://example.com/", }, { "\n\tLink\n\t", - "Link [1] \n\n[1] http://example.com/", + "Link [1]\n\n[1] http://example.com/", }, { `Example`, - "Example [1] \n\n[1] http://example.com/", + "Example [1]\n\n[1] http://example.com/", }, } From c5c8650eb0c22131e2c84788040c38869b663d9a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9=20=27Necoro=27=20Neumann?= Date: Sun, 10 May 2020 22:49:18 +0200 Subject: [PATCH 3/3] Do not add spaces in front of certain punctuation marks like '.', '?' or ')'. But '(' would still get its space. --- html2text.go | 25 ++++++++++++++++++++++--- html2text_test.go | 8 ++++++++ 2 files changed, 30 insertions(+), 3 deletions(-) diff --git a/html2text.go b/html2text.go index e61ddd0..880ac07 100644 --- a/html2text.go +++ b/html2text.go @@ -444,6 +444,25 @@ func (ctx *textifyTraverseContext) traverseChildren(node *html.Node) error { return nil } +// Tests r for being a character where no space should be inserted in front of. +func punctNoSpaceBefore(r rune) bool { + switch r { + case '.', ',', ';', '!', '?', ')', ']', '>': + return true + default: + return false + } +} + +// Tests r for being a character where no space should be inserted after. +func punctNoSpaceAfter(r rune) bool { + switch r { + case '(', '[', '<': + return true + default: + return false + } +} func (ctx *textifyTraverseContext) emit(data string) error { if data == "" { return nil @@ -454,14 +473,14 @@ func (ctx *textifyTraverseContext) emit(data string) error { ) for _, line := range lines { runes := []rune(line) - startsWithSpace := unicode.IsSpace(runes[0]) - if !startsWithSpace && !ctx.endsWithSpace && !strings.HasPrefix(data, ".") { + startsWithSpace := unicode.IsSpace(runes[0]) || punctNoSpaceBefore(runes[0]) + if !startsWithSpace && !ctx.endsWithSpace { if err = ctx.buf.WriteByte(' '); err != nil { return err } ctx.lineLength++ } - ctx.endsWithSpace = unicode.IsSpace(runes[len(runes)-1]) + ctx.endsWithSpace = unicode.IsSpace(runes[len(runes)-1]) || punctNoSpaceAfter(runes[len(runes)-1]) for _, c := range line { if _, err = ctx.buf.WriteString(string(c)); err != nil { return err diff --git a/html2text_test.go b/html2text_test.go index 71558a5..92f2575 100644 --- a/html2text_test.go +++ b/html2text_test.go @@ -542,6 +542,14 @@ func TestCitationStyleLinks(t *testing.T) { `Link`, "Link", }, + { + `Link1Link2`, + "Link1 [1] Link2 [2]\n\n[1] http://example1.com/\n[2] http://example2.com/", + }, + { + `Link1 (Link2)`, + "Link1 [1] (Link2 [2])\n\n[1] http://example1.com/\n[2] http://example2.com/", + }, { `Link1? Link2!`, "Link1 [1]? Link2 [2]!\n\n[1] http://example1.com/\n[2] http://example2.com/",