-
Notifications
You must be signed in to change notification settings - Fork 2.2k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Implementation of github query helper library.
To make this easier to read, use, and modify, I've abstracted the important parts of the github query api into crawler/github/query.go, which allows callers to describe at a high level what is to be searched without knowing the API syntax.
- Loading branch information
Showing
2 changed files
with
348 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,229 @@ | ||
package github | ||
|
||
import ( | ||
"fmt" | ||
"net/url" | ||
"strings" | ||
) | ||
|
||
// Names of the URL parameters that every request may carry.
const (
	perPageArg     = "per_page"
	accessTokenArg = "access_token"
)

// githubMaxPageSize is presumably the largest per_page value Github accepts.
// NOTE(review): it is not referenced by the code visible in this file.
const githubMaxPageSize = 100
|
||
// Implementation detail, not important to external API. | ||
type queryField struct { | ||
name string | ||
value interface{} | ||
} | ||
|
||
// Formats a query field. | ||
func (qf queryField) String() string { | ||
var value string | ||
switch v := qf.value.(type) { | ||
case string: | ||
value = v | ||
case rangeFormatter: | ||
value = v.RangeString() | ||
default: | ||
value = fmt.Sprint(v) | ||
} | ||
|
||
if qf.name == "" { | ||
return value | ||
} | ||
return fmt.Sprint(qf.name, ":", value) | ||
} | ||
|
||
// Example of formatting a query: | ||
// QueryWith( | ||
// Filename("kustomization.yaml"), | ||
// Filesize(RangeWithin{64, 192}), | ||
// Keyword("copyright"), | ||
// Keyword("2019"), | ||
// ).String() | ||
// | ||
// Outputs "q=filename:kustomization.yaml+size:64..192+copyright+2019" which | ||
// would search for files that have [64, 192] bytes (inclusive range) and that | ||
// contain the keywords 'copyright' and '2019' somewhere in the file. | ||
type Query []queryField | ||
|
||
func QueryWith(qfs ...queryField) Query { | ||
return Query(qfs) | ||
} | ||
|
||
func (q Query) String() string { | ||
strs := make([]string, 0, len(q)) | ||
for _, elem := range q { | ||
str := elem.String() | ||
if str == "" { | ||
continue | ||
} | ||
strs = append(strs, str) | ||
} | ||
|
||
query := strings.Join(strs, "+") | ||
if query == "" { | ||
return query | ||
} | ||
return "q=" + query | ||
} | ||
|
||
// Keyword takes a single word, and formats it according to the Github API. | ||
func Keyword(k string) queryField { | ||
return queryField{value: k} | ||
} | ||
|
||
// Filesize takes a rangeFormatter and formats it according to the Github API. | ||
func Filesize(r rangeFormatter) queryField { | ||
return queryField{name: "size", value: r} | ||
} | ||
|
||
// Filename takes a filename and formats it according to the Github API. | ||
func Filename(f string) queryField { | ||
return queryField{name: "filename", value: f} | ||
} | ||
|
||
// Path takes a filepath and formats it according to the Github API. | ||
func Path(p string) queryField { | ||
return queryField{name: "path", value: p} | ||
} | ||
|
||
// RequestConfig stores the settings shared by every request it builds:
//   - CodeSearchRequests: ask Github to check the code indices given a query.
//   - ContentsRequests: ask Github where to download a resource given a repo
//     and a file path.
//   - CommitsRequests: ask Github to list the commits made on a file. Useful
//     to determine the date of a file.
type RequestConfig struct {
	perPage, retryCount uint64
	accessToken         string
}

// NewRequestConfig returns a RequestConfig holding the given page size, retry
// count, and access token. The values are stored as given.
func NewRequestConfig(
	perPage, retryCount uint64, accessToken string) RequestConfig {

	var rc RequestConfig
	rc.perPage = perPage
	rc.retryCount = retryCount
	rc.accessToken = accessToken
	return rc
}
|
||
// CodeSearchRequestWith given a list of query parameters that specify the | ||
// (patial) query, returns a request object with the (parital) query. Must call | ||
// the URL method to get the string value of the URL. See request.CopyWith, to | ||
// understand why the request object is useful. | ||
func (rc RequestConfig) CodeSearchRequestWith(query Query) request { | ||
req := rc.makeRequest("search/code", query) | ||
req.vals.Set("sort", "indexed") | ||
req.vals.Set("order", "desc") | ||
return req | ||
} | ||
|
||
// ContentsRequest given the repo name, and the filepath returns a formatted | ||
// query for the Github API to find the dowload information of this filepath. | ||
func (rc RequestConfig) ContentsRequest(fullRepoName, path string) string { | ||
uri := fmt.Sprintf("repos/%s/contents/%s", fullRepoName, path) | ||
return rc.makeRequest(uri, Query{}).URL() | ||
} | ||
|
||
// CommitsRequest given the repo name, and a filepath returns a formatted query | ||
// for the Github API to find the commits that affect this file. | ||
func (rc RequestConfig) CommitsRequest(fullRepoName, path string) string { | ||
uri := fmt.Sprintf("repos/%s/commits", fullRepoName) | ||
return rc.makeRequest(uri, Query{Path(path)}).URL() | ||
} | ||
|
||
// How many times to retry the queries before giving up (used by the crawler, | ||
// not Github). | ||
func (rc RequestConfig) RetryCount() uint64 { | ||
return rc.retryCount | ||
} | ||
|
||
func (rc RequestConfig) makeRequest(path string, query Query) request { | ||
vals := url.Values{} | ||
if rc.accessToken != "" { | ||
vals.Set(accessTokenArg, rc.accessToken) | ||
} | ||
vals.Set(perPageArg, fmt.Sprint(rc.perPage)) | ||
|
||
return request{ | ||
url: url.URL{ | ||
Scheme: "https", | ||
Host: "api.github.com", | ||
Path: path, | ||
}, | ||
vals: vals, | ||
query: query, | ||
} | ||
} | ||
|
||
type request struct { | ||
url url.URL | ||
vals url.Values | ||
query Query | ||
} | ||
|
||
// CopyWith copies the requests and adds the extra query parameters. Usefull | ||
// for dynamically adding sizes to a filename only query without modifying it. | ||
func (r request) CopyWith(queryParams ...queryField) request { | ||
cpy := r | ||
cpy.query = append(cpy.query, queryParams...) | ||
return cpy | ||
} | ||
|
||
// URL encodes the variables and the URL representation into a string. | ||
func (r request) URL() string { | ||
// Github does not handle URL encoding properly in its API for the | ||
// q='...', so the query parameter is added without any encoding | ||
// manually. | ||
encoded := r.vals.Encode() | ||
query := r.query.String() | ||
sep := "&" | ||
if query == "" { | ||
sep = "" | ||
} | ||
if encoded == "" && query != "" { | ||
sep = "?" | ||
} | ||
r.url.RawQuery = encoded + sep + query | ||
return r.url.String() | ||
} | ||
|
||
// rangeFormatter is implemented by types that describe a range of numbers and
// can print it in the Github range query format; see
// https://help.github.com/en/articles/understanding-the-search-syntax.
type rangeFormatter interface {
	// RangeString returns the textual form of the range.
	RangeString() string
}
|
||
// RangeLessThan is the range of values strictly less than (<) size.
type RangeLessThan struct {
	size uint64
}

// RangeString renders the range as "<size".
func (r RangeLessThan) RangeString() string {
	return "<" + fmt.Sprint(r.size)
}
|
||
// RangeGreaterThan is the range of values strictly greater than (>) size.
type RangeGreaterThan struct {
	size uint64
}

// RangeString renders the range as ">size".
func (r RangeGreaterThan) RangeString() string {
	return ">" + fmt.Sprint(r.size)
}
|
||
// RangeWithin is the inclusive range from start to end.
type RangeWithin struct {
	start, end uint64
}

// RangeString renders the range as "start..end".
func (r RangeWithin) RangeString() string {
	return fmt.Sprint(r.start) + ".." + fmt.Sprint(r.end)
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,119 @@ | ||
package github | ||
|
||
import ( | ||
"testing" | ||
) | ||
|
||
func TestQueryFields(t *testing.T) { | ||
testCases := []struct { | ||
formatter queryField | ||
expected string | ||
}{ | ||
{ | ||
formatter: Keyword("keyword"), | ||
expected: "keyword", | ||
}, | ||
{ | ||
formatter: Filesize(RangeLessThan{23}), | ||
expected: "size:<23", | ||
}, | ||
{ | ||
formatter: Filesize(RangeWithin{24, 64}), | ||
expected: "size:24..64", | ||
}, | ||
{ | ||
formatter: Filesize(RangeGreaterThan{64}), | ||
expected: "size:>64", | ||
}, | ||
{ | ||
formatter: Path("some/path/to/file"), | ||
expected: "path:some/path/to/file", | ||
}, | ||
{ | ||
formatter: Filename("kustomization.yaml"), | ||
expected: "filename:kustomization.yaml", | ||
}, | ||
} | ||
|
||
for _, test := range testCases { | ||
if result := test.formatter.String(); result != test.expected { | ||
t.Errorf("got (%#v = %s), expected %s", test.formatter, result, test.expected) | ||
} | ||
} | ||
} | ||
|
||
func TestQueryType(t *testing.T) { | ||
testCases := []struct { | ||
query Query | ||
expected string | ||
}{ | ||
{ | ||
query: QueryWith( | ||
Filesize(RangeWithin{24, 64}), | ||
Filename("kustomization.yaml"), | ||
Keyword("keyword1"), | ||
Keyword("keyword2"), | ||
), | ||
expected: "q=size:24..64+filename:kustomization.yaml+keyword1+keyword2", | ||
}, | ||
} | ||
|
||
for _, test := range testCases { | ||
if queryStr := test.query.String(); queryStr != test.expected { | ||
t.Errorf("got (%#v = %s), expected %s", test.query, queryStr, test.expected) | ||
} | ||
|
||
} | ||
} | ||
|
||
func TestGithubSearchQuery(t *testing.T) { | ||
const ( | ||
accessToken = "random_token" | ||
perPage = 100 | ||
) | ||
|
||
testCases := []struct { | ||
rc RequestConfig | ||
codeQuery Query | ||
fullRepoName string | ||
path string | ||
expectedCodeQuery string | ||
expectedContentsQuery string | ||
expectedCommitsQuery string | ||
}{ | ||
{ | ||
rc: RequestConfig{ | ||
perPage: perPage, | ||
accessToken: accessToken, | ||
}, | ||
codeQuery: Query{ | ||
Filename("kustomization.yaml"), | ||
Filesize(RangeWithin{64, 128}), | ||
}, | ||
fullRepoName: "kubernetes-sigs/kustomize", | ||
path: "examples/helloWorld/kustomization.yaml", | ||
|
||
expectedCodeQuery: "https://api.github.com/search/code?" + | ||
"access_token=random_token&order=desc&per_page=100&sort=indexed&q=filename:kustomization.yaml+size:64..128", | ||
|
||
expectedContentsQuery: "https://api.github.com/repos/kubernetes-sigs/kustomize/contents/" + | ||
"examples/helloWorld/kustomization.yaml?access_token=random_token&per_page=100", | ||
|
||
expectedCommitsQuery: "https://api.github.com/repos/kubernetes-sigs/kustomize/commits?" + | ||
"access_token=random_token&per_page=100&q=path:examples/helloWorld/kustomization.yaml", | ||
}, | ||
} | ||
|
||
for _, test := range testCases { | ||
if result := test.rc.CodeSearchRequestWith(test.codeQuery).URL(); result != test.expectedCodeQuery { | ||
t.Errorf("Got code query: %s, expected %s", result, test.expectedCodeQuery) | ||
} | ||
|
||
if result := test.rc.ContentsRequest(test.fullRepoName, test.path); result != test.expectedContentsQuery { | ||
t.Errorf("Got contents query: %s, expected %s", result, test.expectedContentsQuery) | ||
} | ||
if result := test.rc.CommitsRequest(test.fullRepoName, test.path); result != test.expectedCommitsQuery { | ||
t.Errorf("Got commits query: %s, expected %s", result, test.expectedCommitsQuery) | ||
} | ||
} | ||
} |