// Package gosoup offers BeautifulSoup-style lookup helpers (Find, FindAll and
// their variants) on top of parse trees produced by golang.org/x/net/html.
package gosoup
import (
"bytes"
"golang.org/x/net/html"
"strings"
)
// Element wraps a single node of a parsed HTML tree. It is the value handed
// back by ParseAsHTML and by the Find/FindAll family of methods, so it may
// refer to the document root or to any node below it.
// The embedded *html.Node is reachable as element.Node, and its fields
// (for example Attr) are promoted onto Element.
type Element struct {
*html.Node
}
// Attributes maps attribute names to the exact values an element must carry.
// It lets callers of Find and its variants spell out filters the way
// BeautifulSoup does, for example:
//
//	Find("div", Attributes{"class": "exampleClass", "name": "exampleName"})
type Attributes map[string]string
// ParseAsHTML parses input with html.Parse and wraps the resulting root node
// in an Element.
// It returns a nil Element together with the parser's error when html.Parse
// fails.
func ParseAsHTML(input string) (*Element, error) {
	reader := strings.NewReader(input)

	root, parseErr := html.Parse(reader)
	if parseErr != nil {
		return nil, parseErr
	}

	wrapped := &Element{Node: root}
	return wrapped, nil
}
// String renders the subtree rooted at the element back to HTML text using
// html.Render.
// It panics with the error reported by html.Render if rendering fails.
func (element *Element) String() string {
	rendered := new(bytes.Buffer)
	if renderErr := html.Render(rendered, element.Node); renderErr != nil {
		panic(renderErr)
	}
	return rendered.String()
}
// Find returns the first element, searching depth-first from the receiver
// (the receiver itself included), whose tag name equals tagName and which
// carries every key/value pair in attributes.
// It returns nil when no such element exists.
func (element *Element) Find(tagName string, attributes Attributes) *Element {
	const matchTag, matchAttributes = true, true

	// The boolean from find is redundant here: it is true exactly when the
	// returned pointer is non-nil.
	match, _ := find(element.Node, tagName, attributes, matchTag, matchAttributes)
	return match
}
// FindByTag returns the first element, searching depth-first from the
// receiver (the receiver itself included), whose tag name equals tagName.
// Attributes are not inspected. It returns nil when no element matches.
func (element *Element) FindByTag(tagName string) *Element {
	const matchTag, matchAttributes = true, false

	// The boolean from find is redundant here: it is true exactly when the
	// returned pointer is non-nil.
	match, _ := find(element.Node, tagName, nil, matchTag, matchAttributes)
	return match
}
// FindByAttributes returns the first element, searching depth-first from the
// receiver (the receiver itself included), that carries every key/value pair
// in attributes. The tag name is not inspected.
// It returns nil when no element matches.
func (element *Element) FindByAttributes(attributes Attributes) *Element {
	const matchTag, matchAttributes = false, true

	// The boolean from find is redundant here: it is true exactly when the
	// returned pointer is non-nil.
	match, _ := find(element.Node, "", attributes, matchTag, matchAttributes)
	return match
}
// FindAll returns every element at or below the receiver whose tag name
// equals tagName and which carries every key/value pair in attributes.
// The result is empty when nothing matches.
func (element *Element) FindAll(tagName string, attributes Attributes) []*Element {
	const matchTag, matchAttributes = true, true

	matches := findAll(element.Node, tagName, attributes, matchTag, matchAttributes)
	return matches
}
// FindAllByTag returns every element at or below the receiver whose tag name
// equals tagName. Attributes are not inspected.
// The result is empty when nothing matches.
func (element *Element) FindAllByTag(tagName string) []*Element {
	const matchTag, matchAttributes = true, false

	matches := findAll(element.Node, tagName, nil, matchTag, matchAttributes)
	return matches
}
// FindAllByAttributes returns every element at or below the receiver that
// carries every key/value pair in attributes. The tag name is not inspected.
// The result is empty when nothing matches.
func (element *Element) FindAllByAttributes(attributes Attributes) []*Element {
	const matchTag, matchAttributes = false, true

	matches := findAll(element.Node, "", attributes, matchTag, matchAttributes)
	return matches
}
// GetAttribute looks up an attribute of the element the way a map lookup
// would: it returns the attribute's value and true when an attribute whose
// key equals attributeName exists, and "" and false otherwise.
// Keys are compared exactly; if the key occurs more than once, the first
// occurrence in element.Attr wins.
func (element *Element) GetAttribute(attributeName string) (string, bool) {
	for i := range element.Attr {
		if candidate := element.Attr[i]; candidate.Key == attributeName {
			return candidate.Val, true
		}
	}
	return "", false
}
// find performs a depth-first search starting at node (node itself is a
// candidate) and returns the first element node accepted by checkNode.
// Children are visited in sibling order, so the result is the first match in
// document order.
//
// includeTagName and includeAttributes select which of tagName and attributes
// take part in the match. The boolean result reports whether a match was
// found; it is true exactly when the returned *Element is non-nil.
// A nil node is treated as an empty tree and yields (nil, false).
func find(node *html.Node, tagName string, attributes Attributes, includeTagName, includeAttributes bool) (*Element, bool) {
	if node == nil {
		// Guard against an Element whose Node was never set; without this the
		// node.Type access below would panic.
		return nil, false
	}

	// Only element nodes can match; text, comment and document nodes are
	// skipped but their children are still searched.
	if node.Type == html.ElementNode && checkNode(node, tagName, attributes, includeTagName, includeAttributes) {
		return &Element{Node: node}, true
	}

	for child := node.FirstChild; child != nil; child = child.NextSibling {
		if match, ok := find(child, tagName, attributes, includeTagName, includeAttributes); ok {
			return match, true
		}
	}
	return nil, false
}
// findAll walks the tree rooted at node depth-first (node itself is a
// candidate) and returns every element node accepted by checkNode, in
// document order.
//
// includeTagName and includeAttributes select which of tagName and attributes
// take part in the match. The result is a nil slice when nothing matches or
// when node is nil.
func findAll(node *html.Node, tagName string, attributes Attributes, includeTagName, includeAttributes bool) []*Element {
	// A single accumulator shared by the whole walk avoids allocating a
	// temporary slice per visited node and copying it into the parent's.
	var matches []*Element

	var walk func(current *html.Node)
	walk = func(current *html.Node) {
		// Only element nodes can match; other node types are skipped but
		// their children are still visited.
		if current.Type == html.ElementNode && checkNode(current, tagName, attributes, includeTagName, includeAttributes) {
			matches = append(matches, &Element{Node: current})
		}
		for child := current.FirstChild; child != nil; child = child.NextSibling {
			walk(child)
		}
	}

	// Guard against an Element whose Node was never set; walking a nil node
	// would panic on the Type access.
	if node != nil {
		walk(node)
	}
	return matches
}
// checkNode reports whether node satisfies the requested criteria.
//
//   - both flags set: the tag name must equal tagName AND every pair in
//     attributes must be present on the node;
//   - only includeTagName: the tag name must equal tagName;
//   - only includeAttributes: every pair in attributes must be present;
//   - neither flag set: nothing can match and the result is false.
func checkNode(node *html.Node, tagName string, attributes Attributes, includeTagName, includeAttributes bool) bool {
	switch {
	case includeTagName && includeAttributes:
		return node.Data == tagName && checkAttributes(node, attributes)
	case includeTagName:
		return node.Data == tagName
	case includeAttributes:
		return checkAttributes(node, attributes)
	default:
		return false
	}
}
// checkAttributes reports whether node carries every key/value pair listed in
// attributes. Both key and value are compared for exact equality.
// An empty or nil attributes map is satisfied by every node.
func checkAttributes(node *html.Node, attributes Attributes) bool {
nextWanted:
	for wantKey, wantVal := range attributes {
		for _, have := range node.Attr {
			if have.Key == wantKey && have.Val == wantVal {
				continue nextWanted
			}
		}
		// One required pair is missing, so the remaining ones need not be
		// examined.
		return false
	}
	return true
}