Skip to content

Commit

Permalink
Move all regexp MustCompiles outside (#510)
Browse files Browse the repository at this point in the history
Lots of CPU time spent compiling reused regexps in CPU profiles
  • Loading branch information
ericvolp12 committed Dec 29, 2023
2 parents 05273c6 + 1c8a8b9 commit 71e21e5
Show file tree
Hide file tree
Showing 6 changed files with 16 additions and 7 deletions.
4 changes: 3 additions & 1 deletion atproto/syntax/cid.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,14 +13,16 @@ import (
// Always use [ParseCID] instead of wrapping strings directly, especially when working with network input.
type CID string

var cidRegex = regexp.MustCompile(`^[a-zA-Z0-9+=]{8,256}$`)

func ParseCID(raw string) (CID, error) {
if len(raw) > 256 {
return "", fmt.Errorf("CID is too long (256 chars max)")
}
if len(raw) < 8 {
return "", fmt.Errorf("CID is too short (8 chars min)")
}
var cidRegex = regexp.MustCompile(`^[a-zA-Z0-9+=]{8,256}$`)

if !cidRegex.MatchString(raw) {
return "", fmt.Errorf("CID syntax didn't validate via regex")
}
Expand Down
7 changes: 5 additions & 2 deletions atproto/syntax/datetime.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,11 +21,13 @@ const (
// Syntax is specified at: https://atproto.com/specs/lexicon#datetime
type Datetime string

var datetimeRegex = regexp.MustCompile(`^[0-9]{4}-[01][0-9]-[0-3][0-9]T[0-2][0-9]:[0-6][0-9]:[0-6][0-9](.[0-9]{1,20})?(Z|([+-][0-2][0-9]:[0-5][0-9]))$`)

func ParseDatetime(raw string) (Datetime, error) {
if len(raw) > 64 {
return "", fmt.Errorf("Datetime too long (max 64 chars)")
}
var datetimeRegex = regexp.MustCompile(`^[0-9]{4}-[01][0-9]-[0-3][0-9]T[0-2][0-9]:[0-6][0-9]:[0-6][0-9](.[0-9]{1,20})?(Z|([+-][0-2][0-9]:[0-5][0-9]))$`)

if !datetimeRegex.MatchString(raw) {
return "", fmt.Errorf("Datetime syntax didn't validate via regex")
}
Expand Down Expand Up @@ -53,6 +55,8 @@ func ParseDatetimeTime(raw string) (time.Time, error) {
// Similar to ParseDatetime, but more flexible about some parsing.
//
// Note that this may mutate the internal string, so a round-trip will fail. This is intended for working with legacy/broken records, not to be used in an ongoing way.
var hasTimezoneRegex = regexp.MustCompile(`^.*(([+-]\d\d:?\d\d)|[a-zA-Z])$`)

func ParseDatetimeLenient(raw string) (Datetime, error) {
// fast path: it is a valid overall datetime
valid, err := ParseDatetime(raw)
Expand All @@ -71,7 +75,6 @@ func ParseDatetimeLenient(raw string) (Datetime, error) {
}

// try adding timezone if it is missing
var hasTimezoneRegex = regexp.MustCompile(`^.*(([+-]\d\d:?\d\d)|[a-zA-Z])$`)
if !hasTimezoneRegex.MatchString(raw) {
withTZ, err := ParseDatetime(raw + "Z")
if nil == err {
Expand Down
3 changes: 2 additions & 1 deletion atproto/syntax/did.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,11 +13,12 @@ import (
// Syntax specification: https://atproto.com/specs/did
type DID string

var didRegex = regexp.MustCompile(`^did:[a-z]+:[a-zA-Z0-9._:%-]*[a-zA-Z0-9._-]$`)

func ParseDID(raw string) (DID, error) {
if len(raw) > 2*1024 {
return "", fmt.Errorf("DID is too long (2048 chars max)")
}
var didRegex = regexp.MustCompile(`^did:[a-z]+:[a-zA-Z0-9._:%-]*[a-zA-Z0-9._-]$`)
if !didRegex.MatchString(raw) {
return "", fmt.Errorf("DID syntax didn't validate via regex")
}
Expand Down
3 changes: 2 additions & 1 deletion atproto/syntax/language.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,12 @@ import (
// The syntax is BCP-47. This is a partial/naive parsing implementation, designed for fast validation and exact-string passthrough with no normalization. For actually working with BCP-47 language specifiers in atproto code bases, we recommend the golang.org/x/text/language package.
type Language string

var langRegex = regexp.MustCompile(`^(i|[a-z]{2,3})(-[a-zA-Z0-9]+)*$`)

func ParseLanguage(raw string) (Language, error) {
if len(raw) > 128 {
return "", fmt.Errorf("Language is too long (128 chars max)")
}
var langRegex = regexp.MustCompile(`^(i|[a-z]{2,3})(-[a-zA-Z0-9]+)*$`)
if !langRegex.MatchString(raw) {
return "", fmt.Errorf("Language syntax didn't validate via regex")
}
Expand Down
3 changes: 2 additions & 1 deletion atproto/syntax/tid.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,11 +24,12 @@ func Base32Sort() *base32.Encoding {
// Syntax specification: https://atproto.com/specs/record-key
type TID string

var tidRegex = regexp.MustCompile(`^[234567abcdefghij][234567abcdefghijklmnopqrstuvwxyz]{12}$`)

func ParseTID(raw string) (TID, error) {
if len(raw) != 13 {
return "", fmt.Errorf("TID is wrong length (expected 13 chars)")
}
var tidRegex = regexp.MustCompile(`^[234567abcdefghij][234567abcdefghijklmnopqrstuvwxyz]{12}$`)
if !tidRegex.MatchString(raw) {
return "", fmt.Errorf("TID syntax didn't validate via regex")
}
Expand Down
3 changes: 2 additions & 1 deletion search/indexing.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,8 @@ func (s *Server) deletePost(ctx context.Context, ident *identity.Identity, rkey
return nil
}

var tidRegex = regexp.MustCompile(`^[234567abcdefghijklmnopqrstuvwxyz]{13}$`)

func (s *Server) indexPost(ctx context.Context, ident *identity.Identity, rec *appbsky.FeedPost, path string, rcid cid.Cid) error {
ctx, span := tracer.Start(ctx, "indexPost")
defer span.End()
Expand All @@ -56,7 +58,6 @@ func (s *Server) indexPost(ctx context.Context, ident *identity.Identity, rec *a
log := s.logger.With("repo", ident.DID, "path", path, "op", "indexPost")
parts := strings.SplitN(path, "/", 3)
// TODO: replace with an atproto/syntax package type for TID
var tidRegex = regexp.MustCompile(`^[234567abcdefghijklmnopqrstuvwxyz]{13}$`)
if len(parts) != 2 || !tidRegex.MatchString(parts[1]) {
log.Warn("skipping index post record with weird path/TID", "did", ident.DID, "path", path)
return nil
Expand Down

0 comments on commit 71e21e5

Please sign in to comment.