Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
79 changes: 73 additions & 6 deletions html2text.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ package html2text

import (
"bytes"
"fmt"
"io"
"regexp"
"strings"
Expand All @@ -19,6 +20,7 @@ type Options struct {
PrettyTablesOptions *PrettyTablesOptions // Configures pretty ASCII rendering for table elements.
OmitLinks bool // Turns on omitting links
TextOnly bool // Returns only plain text
CitationStyleLinks bool // Uses citation style links like [1]
}

// PrettyTablesOptions overrides tablewriter behaviors
Expand Down Expand Up @@ -71,13 +73,18 @@ func FromHTMLNode(doc *html.Node, o ...Options) (string, error) {
}

ctx := textifyTraverseContext{
buf: bytes.Buffer{},
options: options,
buf: bytes.Buffer{},
options: options,
citationMap: map[string]int{},
}
if err := ctx.traverse(doc); err != nil {
return "", err
}

if ctx.options.CitationStyleLinks && ctx.citationCount > 0 {
ctx.emitCitations()
}

text := strings.TrimSpace(newlineRe.ReplaceAllString(
strings.Replace(ctx.buf.String(), "\n ", "\n", -1), "\n\n"),
)
Expand Down Expand Up @@ -125,6 +132,8 @@ type textifyTraverseContext struct {
blockquoteLevel int
lineLength int
isPre bool
citationCount int
citationMap map[string]int
}

// tableTraverseContext holds table ASCII-form related context.
Expand Down Expand Up @@ -268,7 +277,11 @@ func (ctx *textifyTraverseContext) handleElement(node *html.Node) error {
attrVal = ctx.normalizeHrefLink(attrVal)
// Don't print link href if it matches link element content or if the link is empty.
if (attrVal != "" && linkText != attrVal) && !ctx.options.OmitLinks && !ctx.options.TextOnly {
hrefLink = "( " + attrVal + " )"
if ctx.options.CitationStyleLinks {
hrefLink = ctx.addCitation(attrVal)
} else {
hrefLink = "( " + attrVal + " )"
}
}
}

Expand Down Expand Up @@ -431,6 +444,25 @@ func (ctx *textifyTraverseContext) traverseChildren(node *html.Node) error {
return nil
}

// punctNoSpaceBefore reports whether r is a punctuation character that no
// space should be inserted in front of: one of . , ; ! ? ) ] >
func punctNoSpaceBefore(r rune) bool {
	// Ranging over a string yields its runes one by one.
	for _, closing := range ".,;!?)]>" {
		if r == closing {
			return true
		}
	}
	return false
}

// punctNoSpaceAfter reports whether r is an opening bracket character that no
// space should be inserted after: one of ( [ <
func punctNoSpaceAfter(r rune) bool {
	return r == '(' || r == '[' || r == '<'
}
func (ctx *textifyTraverseContext) emit(data string) error {
if data == "" {
return nil
Expand All @@ -441,14 +473,14 @@ func (ctx *textifyTraverseContext) emit(data string) error {
)
for _, line := range lines {
runes := []rune(line)
startsWithSpace := unicode.IsSpace(runes[0])
if !startsWithSpace && !ctx.endsWithSpace && !strings.HasPrefix(data, ".") {
startsWithSpace := unicode.IsSpace(runes[0]) || punctNoSpaceBefore(runes[0])
if !startsWithSpace && !ctx.endsWithSpace {
if err = ctx.buf.WriteByte(' '); err != nil {
return err
}
ctx.lineLength++
}
ctx.endsWithSpace = unicode.IsSpace(runes[len(runes)-1])
ctx.endsWithSpace = unicode.IsSpace(runes[len(runes)-1]) || punctNoSpaceAfter(runes[len(runes)-1])
for _, c := range line {
if _, err = ctx.buf.WriteString(string(c)); err != nil {
return err
Expand Down Expand Up @@ -516,6 +548,41 @@ func (ctx *textifyTraverseContext) normalizeHrefLink(link string) string {
return link
}

// formatCitation renders the citation number idx as a bracketed marker,
// e.g. 1 becomes "[1]".
func formatCitation(idx int) string {
	const layout = "[%d]"
	return fmt.Sprintf(layout, idx)
}

// addCitation returns the formatted citation marker for url.
//
// A url that is already present in ctx.citationMap reuses its stored number.
// Otherwise ctx.citationCount is incremented, the new count is recorded for
// url in ctx.citationMap, and that number is used for the marker.
func (ctx *textifyTraverseContext) addCitation(url string) string {
	if known, seen := ctx.citationMap[url]; seen {
		return formatCitation(known)
	}

	// First occurrence of this link: hand out the next citation number.
	ctx.citationCount++
	ctx.citationMap[url] = ctx.citationCount

	return formatCitation(ctx.citationCount)
}

// emitCitations appends the citation list to the buffer: a blank-line
// separator followed by one "[n] link" line per recorded citation, in
// ascending citation number.
//
// It writes to ctx.buf directly rather than going through emit, because long
// links must not be split.
func (ctx *textifyTraverseContext) emitCitations() {
	// citationMap is keyed by link, so invert it into a slice indexed by
	// citation number. Citation numbers are 1-based, slice indices 0-based.
	ordered := make([]string, ctx.citationCount)
	for link, number := range ctx.citationMap {
		ordered[number-1] = link
	}

	ctx.buf.WriteString("\n\n")
	for pos := range ordered {
		ctx.buf.WriteString(formatCitation(pos+1) + " " + ordered[pos] + "\n")
	}
}

// renderEachChild visits each direct child of a node and collects the sequence of
// textual representations separated by a single newline.
func (ctx *textifyTraverseContext) renderEachChild(node *html.Node) (string, error) {
Expand Down
60 changes: 60 additions & 0 deletions html2text_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -521,6 +521,66 @@ func TestOmitLinks(t *testing.T) {
}
}

func TestCitationStyleLinks(t *testing.T) {
testCases := []struct {
input string
output string
}{
{
`<a></a>`,
``,
},
{
`<a href=""></a>`,
``,
},
{
`<a href="http://example.com/"></a>`,
"[1]\n\n[1] http://example.com/",
},
{
`<a href="">Link</a>`,
"Link",
},
{
`<a href="http://example1.com/">Link1</a><a href="http://example2.com/">Link2</a>`,
"Link1 [1] Link2 [2]\n\n[1] http://example1.com/\n[2] http://example2.com/",
},
{
`<a href="http://example1.com/">Link1</a> (<a href="http://example2.com/">Link2</a>)`,
"Link1 [1] (Link2 [2])\n\n[1] http://example1.com/\n[2] http://example2.com/",
},
{
`<a href="http://example1.com/">Link1</a>? <a href="http://example2.com/">Link2</a>!`,
"Link1 [1]? Link2 [2]!\n\n[1] http://example1.com/\n[2] http://example2.com/",
},
{
`<a href="http://example1.com/">Link1</a><a href="http://example1.com/">Link1 again</a>`,
"Link1 [1] Link1 again [1]\n\n[1] http://example1.com/",
},
{
`<a href="http://example.com/"><span class="a">Link</span></a>`,
"Link [1]\n\n[1] http://example.com/",
},
{
"<a href='http://example.com/'>\n\t<span class='a'>Link</span>\n\t</a>",
"Link [1]\n\n[1] http://example.com/",
},
{
`<a href="http://example.com/"><img src="http://example.ru/hello.jpg" alt="Example"></a>`,
"Example [1]\n\n[1] http://example.com/",
},
}

for _, testCase := range testCases {
if msg, err := wantString(testCase.input, testCase.output, Options{CitationStyleLinks: true}); err != nil {
t.Error(err)
} else if len(msg) > 0 {
t.Log(msg)
}
}
}

func TestImageAltTags(t *testing.T) {
testCases := []struct {
input string
Expand Down