Skip to content

Commit

Permalink
Implementation of github query helper library.
Browse files Browse the repository at this point in the history
To make this easier to read, use, and modify, I've abstracted the
important parts of the Github query API into crawler/github/queries.go,
which lets callers describe at a high level what is to be searched without
knowing the API syntax.
  • Loading branch information
damienr74 committed Aug 16, 2019
1 parent ca41674 commit ac6918d
Show file tree
Hide file tree
Showing 2 changed files with 348 additions and 0 deletions.
229 changes: 229 additions & 0 deletions internal/search/crawler/github/queries.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,229 @@
package github

import (
"fmt"
"net/url"
"strings"
)

// Names of the URL parameters added to every request, and a page size limit.
const (
// perPageArg is the URL parameter that carries the number of results per page.
perPageArg = "per_page"
// accessTokenArg is the URL parameter that carries the access token.
accessTokenArg = "access_token"

// githubMaxPageSize is presumably the largest per_page value Github accepts;
// it is not referenced anywhere in this file — TODO confirm against callers.
githubMaxPageSize = 100
)

// queryField is one term of a search query: an optional qualifier name and
// the value to search for. Implementation detail, not important to external
// API.
type queryField struct {
// name is the qualifier (e.g. "size", "filename"); empty for a bare keyword.
name string
// value is rendered by String: a string is used as-is, a rangeFormatter
// through RangeString, and anything else through fmt.Sprint.
value interface{}
}

// String renders the field as "name:value", or as just "value" when the
// field has no name. String values are used verbatim, rangeFormatter values
// are rendered with RangeString, and any other value falls back to
// fmt.Sprint.
func (qf queryField) String() string {
	var text string
	if s, ok := qf.value.(string); ok {
		text = s
	} else if rf, ok := qf.value.(rangeFormatter); ok {
		text = rf.RangeString()
	} else {
		text = fmt.Sprint(qf.value)
	}

	if qf.name != "" {
		return qf.name + ":" + text
	}
	return text
}

// Query is an ordered list of query fields that together form one search
// query.
//
// Example of formatting a query:
//	QueryWith(
//		Filename("kustomization.yaml"),
//		Filesize(RangeWithin{64, 192}),
//		Keyword("copyright"),
//		Keyword("2019"),
//	).String()
//
// Outputs "q=filename:kustomization.yaml+size:64..192+copyright+2019" which
// would search for files that have [64, 192] bytes (inclusive range) and that
// contain the keywords 'copyright' and '2019' somewhere in the file.
type Query []queryField

// QueryWith builds a Query out of the given fields, keeping their order.
func QueryWith(qfs ...queryField) Query {
	var q Query = qfs
	return q
}

// String renders the query as "q=" followed by the non-empty fields joined
// with "+". A query without any non-empty field renders as the empty string.
func (q Query) String() string {
	var b strings.Builder
	for _, field := range q {
		term := field.String()
		if term == "" {
			// Empty fields contribute nothing, not even a separator.
			continue
		}
		if b.Len() == 0 {
			b.WriteString("q=")
		} else {
			b.WriteByte('+')
		}
		b.WriteString(term)
	}
	return b.String()
}

// Keyword takes a single word and returns it as an unnamed query field, so
// it is rendered as the bare word.
func Keyword(k string) queryField {
	var field queryField
	field.value = k
	return field
}

// Filesize takes a rangeFormatter and returns a "size" query field whose
// value is rendered with the formatter's RangeString.
func Filesize(r rangeFormatter) queryField {
	field := queryField{name: "size"}
	field.value = r
	return field
}

// Filename takes a filename and returns a "filename" query field for it.
func Filename(f string) queryField {
	field := queryField{name: "filename"}
	field.value = f
	return field
}

// Path takes a filepath and returns a "path" query field for it.
func Path(p string) queryField {
	field := queryField{name: "path"}
	field.value = p
	return field
}

// RequestConfig stores common variables that must be present for the queries.
// It builds three kinds of requests:
// - CodeSearchRequests: ask Github to check the code indices given a query.
// - ContentsRequests: ask Github where to download a resource given a repo and a
// file path.
// - CommitsRequests: ask Github to list commits made on a file. Useful to
// determine the date of a file.
type RequestConfig struct {
// perPage is sent as the per_page parameter of every request.
perPage uint64
// retryCount is the number of retries reported by RetryCount.
retryCount uint64
// accessToken is sent as the access_token parameter when it is non-empty.
accessToken string
}

// NewRequestConfig returns a RequestConfig holding the given page size,
// retry count and access token. The values are stored as provided.
func NewRequestConfig(perPage, retryCount uint64, accessToken string) RequestConfig {
	var rc RequestConfig
	rc.perPage = perPage
	rc.retryCount = retryCount
	rc.accessToken = accessToken
	return rc
}

// CodeSearchRequestWith takes the (partial) query and returns a request
// object for the "search/code" endpoint carrying that query, with results
// sorted by "indexed" in descending order. Call the URL method to get the
// string value of the URL. See request.CopyWith to understand why the
// request object is useful.
func (rc RequestConfig) CodeSearchRequestWith(query Query) request {
	const endpoint = "search/code"

	searchReq := rc.makeRequest(endpoint, query)
	for _, param := range [][2]string{
		{"sort", "indexed"},
		{"order", "desc"},
	} {
		searchReq.vals.Set(param[0], param[1])
	}
	return searchReq
}

// ContentsRequest takes the full repo name and a filepath and returns the
// URL, as a string, of the Github API contents endpoint for that file, which
// is used to find the download information of the filepath.
func (rc RequestConfig) ContentsRequest(fullRepoName, path string) string {
	endpoint := "repos/" + fullRepoName + "/contents/" + path
	contentsReq := rc.makeRequest(endpoint, Query{})
	return contentsReq.URL()
}

// CommitsRequest takes the full repo name and a filepath and returns the URL,
// as a string, of the Github API commits endpoint for that repo, with the
// filepath attached as a "path" query field, to find the commits that affect
// this file.
//
// NOTE(review): the path is sent as a "q=path:..." search term; the Github
// commits endpoint appears to document a plain "path" parameter instead —
// confirm that the filter is actually honored.
func (rc RequestConfig) CommitsRequest(fullRepoName, path string) string {
	endpoint := "repos/" + fullRepoName + "/commits"
	commitsReq := rc.makeRequest(endpoint, Query{Path(path)})
	return commitsReq.URL()
}

// RetryCount reports how many times to retry the queries before giving up
// (used by the crawler, not Github).
func (rc RequestConfig) RetryCount() uint64 {
	retries := rc.retryCount
	return retries
}

// makeRequest builds a request for https://api.github.com/<path> carrying
// the given query. The per_page parameter is always set; the access_token
// parameter is only set when the config has a non-empty token.
func (rc RequestConfig) makeRequest(path string, query Query) request {
	params := url.Values{
		perPageArg: []string{fmt.Sprint(rc.perPage)},
	}
	if token := rc.accessToken; token != "" {
		params.Set(accessTokenArg, token)
	}

	var req request
	req.url.Scheme = "https"
	req.url.Host = "api.github.com"
	req.url.Path = path
	req.vals = params
	req.query = query
	return req
}

// request is a Github API request under construction: the endpoint URL, the
// regular URL parameters, and the search query that URL appends without
// encoding.
type request struct {
// url holds the scheme, host and path of the endpoint.
url url.URL
// vals are the regular parameters; URL encodes them with Values.Encode.
vals url.Values
// query is rendered as the raw "q=..." term at the end of the query string.
query Query
}

// CopyWith copies the request and adds the extra query parameters to the
// copy. Useful for dynamically adding sizes to a filename only query without
// modifying it.
//
// The copy gets its own query slice: appending directly to the receiver's
// slice could write into spare capacity of a shared backing array, so that
// two copies derived from the same request would overwrite each other's
// extra fields. The vals map is still shared with the receiver.
func (r request) CopyWith(queryParams ...queryField) request {
	cpy := r
	cpy.query = make(Query, 0, len(r.query)+len(queryParams))
	cpy.query = append(cpy.query, r.query...)
	cpy.query = append(cpy.query, queryParams...)
	return cpy
}

// URL encodes the variables and the URL representation into a string.
//
// The regular parameters come first, URL-encoded, followed by the search
// query. When only one of the two is present it is used alone, and when
// neither is present the URL has no query string.
func (r request) URL() string {
	// Github does not handle URL encoding properly in its API for the
	// q='...', so the query parameter is added without any encoding
	// manually.
	encoded := r.vals.Encode()
	query := r.query.String()

	rawQuery := encoded
	switch {
	case query == "":
		// Nothing to append.
	case encoded == "":
		// url.URL.String adds the leading "?" itself, so the query must
		// not carry one, otherwise the result would contain "??".
		rawQuery = query
	default:
		rawQuery = encoded + "&" + query
	}

	// r is a value receiver, so this assignment does not affect the caller.
	r.url.RawQuery = rawQuery
	return r.url.String()
}

// rangeFormatter allows to define a range of numbers and print it in the github range
// query format https://help.github.com/en/articles/understanding-the-search-syntax.
type rangeFormatter interface {
// RangeString returns the range in the search syntax; the implementations
// in this file produce "<23", ">64" and "24..64" style strings.
RangeString() string
}

// RangeLessThan is a range of values strictly less than (<) size.
type RangeLessThan struct {
	size uint64
}

// RangeString renders the range as "<size".
func (r RangeLessThan) RangeString() string {
	bound := fmt.Sprint(r.size)
	return "<" + bound
}

// RangeGreaterThan is a range of values strictly greater than (>) size.
type RangeGreaterThan struct {
	size uint64
}

// RangeString renders the range as ">size".
func (r RangeGreaterThan) RangeString() string {
	bound := fmt.Sprint(r.size)
	return ">" + bound
}

// RangeWithin is an inclusive range from start to end.
type RangeWithin struct {
	start uint64
	end   uint64
}

// RangeString renders the range as "start..end".
func (r RangeWithin) RangeString() string {
	lower := fmt.Sprint(r.start)
	upper := fmt.Sprint(r.end)
	return lower + ".." + upper
}
119 changes: 119 additions & 0 deletions internal/search/crawler/github/queries_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
package github

import (
"testing"
)

// TestQueryFields checks the string form of every kind of query field:
// keyword, the three size ranges, path and filename.
func TestQueryFields(t *testing.T) {
	type fieldCase struct {
		formatter queryField
		expected  string
	}

	cases := []fieldCase{
		{Keyword("keyword"), "keyword"},
		{Filesize(RangeLessThan{23}), "size:<23"},
		{Filesize(RangeWithin{24, 64}), "size:24..64"},
		{Filesize(RangeGreaterThan{64}), "size:>64"},
		{Path("some/path/to/file"), "path:some/path/to/file"},
		{Filename("kustomization.yaml"), "filename:kustomization.yaml"},
	}

	for i := range cases {
		tc := cases[i]
		result := tc.formatter.String()
		if result == tc.expected {
			continue
		}
		t.Errorf("got (%#v = %s), expected %s", tc.formatter, result, tc.expected)
	}
}

// TestQueryType checks that a Query made of several fields is rendered as
// "q=" followed by the fields joined with "+", in order.
func TestQueryType(t *testing.T) {
	type queryCase struct {
		query    Query
		expected string
	}

	cases := []queryCase{
		{
			query: QueryWith(
				Filesize(RangeWithin{24, 64}),
				Filename("kustomization.yaml"),
				Keyword("keyword1"),
				Keyword("keyword2"),
			),
			expected: "q=size:24..64+filename:kustomization.yaml+keyword1+keyword2",
		},
	}

	for i := range cases {
		tc := cases[i]
		queryStr := tc.query.String()
		if queryStr == tc.expected {
			continue
		}
		t.Errorf("got (%#v = %s), expected %s", tc.query, queryStr, tc.expected)
	}
}

// TestGithubSearchQuery checks the full URLs produced for a code search, a
// contents request and a commits request built from one RequestConfig.
func TestGithubSearchQuery(t *testing.T) {
	const (
		accessToken = "random_token"
		perPage     = 100
	)

	type urlCase struct {
		rc                    RequestConfig
		codeQuery             Query
		fullRepoName          string
		path                  string
		expectedCodeQuery     string
		expectedContentsQuery string
		expectedCommitsQuery  string
	}

	cases := []urlCase{
		{
			rc: RequestConfig{
				perPage:     perPage,
				accessToken: accessToken,
			},
			codeQuery: Query{
				Filename("kustomization.yaml"),
				Filesize(RangeWithin{64, 128}),
			},
			fullRepoName: "kubernetes-sigs/kustomize",
			path:         "examples/helloWorld/kustomization.yaml",

			expectedCodeQuery: "https://api.github.com/search/code?" +
				"access_token=random_token&order=desc&per_page=100&sort=indexed&q=filename:kustomization.yaml+size:64..128",

			expectedContentsQuery: "https://api.github.com/repos/kubernetes-sigs/kustomize/contents/" +
				"examples/helloWorld/kustomization.yaml?access_token=random_token&per_page=100",

			expectedCommitsQuery: "https://api.github.com/repos/kubernetes-sigs/kustomize/commits?" +
				"access_token=random_token&per_page=100&q=path:examples/helloWorld/kustomization.yaml",
		},
	}

	for i := range cases {
		tc := cases[i]

		codeURL := tc.rc.CodeSearchRequestWith(tc.codeQuery).URL()
		if codeURL != tc.expectedCodeQuery {
			t.Errorf("Got code query: %s, expected %s", codeURL, tc.expectedCodeQuery)
		}

		contentsURL := tc.rc.ContentsRequest(tc.fullRepoName, tc.path)
		if contentsURL != tc.expectedContentsQuery {
			t.Errorf("Got contents query: %s, expected %s", contentsURL, tc.expectedContentsQuery)
		}

		commitsURL := tc.rc.CommitsRequest(tc.fullRepoName, tc.path)
		if commitsURL != tc.expectedCommitsQuery {
			t.Errorf("Got commits query: %s, expected %s", commitsURL, tc.expectedCommitsQuery)
		}
	}
}

0 comments on commit ac6918d

Please sign in to comment.