diff --git a/html2text.go b/html2text.go index 8fe9000..880ac07 100644 --- a/html2text.go +++ b/html2text.go @@ -2,6 +2,7 @@ package html2text import ( "bytes" + "fmt" "io" "regexp" "strings" @@ -19,6 +20,7 @@ type Options struct { PrettyTablesOptions *PrettyTablesOptions // Configures pretty ASCII rendering for table elements. OmitLinks bool // Turns on omitting links TextOnly bool // Returns only plain text + CitationStyleLinks bool // Uses citation style links like [1] } // PrettyTablesOptions overrides tablewriter behaviors @@ -71,13 +73,18 @@ func FromHTMLNode(doc *html.Node, o ...Options) (string, error) { } ctx := textifyTraverseContext{ - buf: bytes.Buffer{}, - options: options, + buf: bytes.Buffer{}, + options: options, + citationMap: map[string]int{}, } if err := ctx.traverse(doc); err != nil { return "", err } + if ctx.options.CitationStyleLinks && ctx.citationCount > 0 { + ctx.emitCitations() + } + text := strings.TrimSpace(newlineRe.ReplaceAllString( strings.Replace(ctx.buf.String(), "\n ", "\n", -1), "\n\n"), ) @@ -125,6 +132,8 @@ type textifyTraverseContext struct { blockquoteLevel int lineLength int isPre bool + citationCount int + citationMap map[string]int } // tableTraverseContext holds table ASCII-form related context. @@ -268,7 +277,11 @@ func (ctx *textifyTraverseContext) handleElement(node *html.Node) error { attrVal = ctx.normalizeHrefLink(attrVal) // Don't print link href if it matches link element content or if the link is empty. if (attrVal != "" && linkText != attrVal) && !ctx.options.OmitLinks && !ctx.options.TextOnly { - hrefLink = "( " + attrVal + " )" + if ctx.options.CitationStyleLinks { + hrefLink = ctx.addCitation(attrVal) + } else { + hrefLink = "( " + attrVal + " )" + } } } @@ -431,6 +444,25 @@ func (ctx *textifyTraverseContext) traverseChildren(node *html.Node) error { return nil } +// Tests r for being a character where no space should be inserted in front of. +func punctNoSpaceBefore(r rune) bool { + switch r { + case '.', ',', ';', '!', '?', ')', ']', '>': + return true + default: + return false + } +} + +// Tests r for being a character where no space should be inserted after. +func punctNoSpaceAfter(r rune) bool { + switch r { + case '(', '[', '<': + return true + default: + return false + } +} func (ctx *textifyTraverseContext) emit(data string) error { if data == "" { return nil @@ -441,14 +473,14 @@ func (ctx *textifyTraverseContext) emit(data string) error { ) for _, line := range lines { runes := []rune(line) - startsWithSpace := unicode.IsSpace(runes[0]) - if !startsWithSpace && !ctx.endsWithSpace && !strings.HasPrefix(data, ".") { + startsWithSpace := unicode.IsSpace(runes[0]) || punctNoSpaceBefore(runes[0]) + if !startsWithSpace && !ctx.endsWithSpace { if err = ctx.buf.WriteByte(' '); err != nil { return err } ctx.lineLength++ } - ctx.endsWithSpace = unicode.IsSpace(runes[len(runes)-1]) + ctx.endsWithSpace = unicode.IsSpace(runes[len(runes)-1]) || punctNoSpaceAfter(runes[len(runes)-1]) for _, c := range line { if _, err = ctx.buf.WriteString(string(c)); err != nil { return err @@ -516,6 +548,41 @@ func (ctx *textifyTraverseContext) normalizeHrefLink(link string) string { return link } +func formatCitation(idx int) string { + return fmt.Sprintf("[%d]", idx) +} + +func (ctx *textifyTraverseContext) addCitation(url string) string { + idx, ok := ctx.citationMap[url] + + if !ok { + ctx.citationCount += 1 + idx = ctx.citationCount + ctx.citationMap[url] = idx + } + + return formatCitation(idx) +} + +func (ctx *textifyTraverseContext) emitCitations() { + // this method writes to the buffer directly instead of using `emit`, b/c we do not want to split long links + ctx.buf.WriteString("\n\n") + + // citations are ordered by link --> bring them into the correct order first + links := make([]string, ctx.citationCount) + + for k, v := range ctx.citationMap { + links[v-1] = k // arrays are 0-based, our citations are 1-based + } + + for i, link := range links { + ctx.buf.WriteString(formatCitation(i + 1)) + ctx.buf.WriteByte(' ') + ctx.buf.WriteString(link) + ctx.buf.WriteByte('\n') + } +} + // renderEachChild visits each direct child of a node and collects the sequence of // textuual representaitons separated by a single newline. func (ctx *textifyTraverseContext) renderEachChild(node *html.Node) (string, error) { diff --git a/html2text_test.go b/html2text_test.go index 452b45e..92f2575 100644 --- a/html2text_test.go +++ b/html2text_test.go @@ -521,6 +521,66 @@ func TestOmitLinks(t *testing.T) { } } +func TestCitationStyleLinks(t *testing.T) { + testCases := []struct { + input string + output string + }{ + { + ``, + ``, + }, + { + ``, + ``, + }, + { + ``, + "[1]\n\n[1] http://example.com/", + }, + { + `Link`, + "Link", + }, + { + `Link1Link2`, + "Link1 [1] Link2 [2]\n\n[1] http://example1.com/\n[2] http://example2.com/", + }, + { + `Link1 (Link2)`, + "Link1 [1] (Link2 [2])\n\n[1] http://example1.com/\n[2] http://example2.com/", + }, + { + `Link1? Link2!`, + "Link1 [1]? Link2 [2]!\n\n[1] http://example1.com/\n[2] http://example2.com/", + }, + { + `Link1Link1 again`, + "Link1 [1] Link1 again [1]\n\n[1] http://example1.com/", + }, + { + `Link`, + "Link [1]\n\n[1] http://example.com/", + }, + { + "\n\tLink\n\t", + "Link [1]\n\n[1] http://example.com/", + }, + { + `Example`, + "Example [1]\n\n[1] http://example.com/", + }, + } + + for _, testCase := range testCases { + if msg, err := wantString(testCase.input, testCase.output, Options{CitationStyleLinks: true}); err != nil { + t.Error(err) + } else if len(msg) > 0 { + t.Log(msg) + } + } +} + func TestImageAltTags(t *testing.T) { testCases := []struct { input string