Skip to content

Commit

Permalink
add mlutils
Browse files Browse the repository at this point in the history
  • Loading branch information
dogancanbakir committed Jul 10, 2023
1 parent e9ee9ab commit dca79b0
Show file tree
Hide file tree
Showing 5 changed files with 246 additions and 0 deletions.
1 change: 1 addition & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ require (
github.com/go-ole/go-ole v1.2.6 // indirect
github.com/golang/snappy v0.0.1 // indirect
github.com/json-iterator/go v1.1.12 // indirect
github.com/kljensen/snowball v0.8.0 // indirect
github.com/lucasb-eyer/go-colorful v1.2.0 // indirect
github.com/lufia/plan9stats v0.0.0-20211012122336-39d0f177ccd0 // indirect
github.com/mattn/go-colorable v0.1.13 // indirect
Expand Down
2 changes: 2 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,8 @@ github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnr
github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo=
github.com/klauspost/compress v1.4.1/go.mod h1:RyIbtBH6LamlWaDj8nUwkbUhJ87Yi3uG0guNDohfE1A=
github.com/klauspost/cpuid v1.2.0/go.mod h1:Pj4uuM528wm8OyEC2QMXAi2YiTZ96dNQPGgoMS4s3ek=
github.com/kljensen/snowball v0.8.0 h1:WU4cExxK6sNW33AiGdbn4e8RvloHrhkAssu2mVJ11kg=
github.com/kljensen/snowball v0.8.0/go.mod h1:OGo5gFWjaeXqCu4iIrMl5OYip9XUJHGOU5eSkPjVg2A=
github.com/konsorten/go-windows-terminal-sequences v1.0.1/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ=
github.com/kr/pretty v0.1.0 h1:L/CwN0zerZDmRFUapSPitk6f+Q3+0za1rQkzVuMiMFI=
github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo=
Expand Down
1 change: 1 addition & 0 deletions mlutils/mlutils.go
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
package mlutils
200 changes: 200 additions & 0 deletions mlutils/naive_bayes/naive_bayes_classifier.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,200 @@
//ref: https://github.com/sausheong/gonb

package naive_bayes

import (
"bytes"
"encoding/gob"
"io"
"os"
"regexp"
"sort"
"strings"

"github.com/kljensen/snowball"
)

var (
// cleaner matches every character that is neither a word character (\w)
// nor whitespace (\s); cleanDocument uses it to delete such characters.
// Note that \w in Go regular expressions is ASCII-only ([0-9A-Za-z_]),
// so non-ASCII letters are matched (and therefore removed) as well.
cleaner = regexp.MustCompile(`[^\w\s]`)
// stopWords is the set of English stop words, all written in lowercase,
// that countWords consults to decide which tokens are not counted.
stopWords = map[string]struct{}{"a": {}, "able": {}, "about": {}, "above": {}, "abroad": {}, "according": {}, "accordingly": {}, "across": {}, "actually": {}, "adj": {}, "after": {}, "afterwards": {}, "again": {}, "against": {}, "ago": {}, "ahead": {}, "ain't": {}, "all": {}, "allow": {}, "allows": {}, "almost": {}, "alone": {}, "along": {}, "alongside": {}, "already": {}, "also": {}, "although": {}, "always": {}, "am": {}, "amid": {}, "amidst": {}, "among": {}, "amongst": {}, "an": {}, "and": {}, "another": {}, "any": {}, "anybody": {}, "anyhow": {}, "anyone": {}, "anything": {}, "anyway": {}, "anyways": {}, "anywhere": {}, "apart": {}, "appear": {}, "appreciate": {}, "appropriate": {}, "are": {}, "aren't": {}, "around": {}, "as": {}, "a's": {}, "aside": {}, "ask": {}, "asking": {}, "associated": {}, "at": {}, "available": {}, "away": {}, "awfully": {}, "b": {}, "back": {}, "backward": {}, "backwards": {}, "be": {}, "became": {}, "because": {}, "become": {}, "becomes": {}, "becoming": {}, "been": {}, "before": {}, "beforehand": {}, "begin": {}, "behind": {}, "being": {}, "believe": {}, "below": {}, "beside": {}, "besides": {}, "best": {}, "better": {}, "between": {}, "beyond": {}, "both": {}, "brief": {}, "but": {}, "by": {}, "c": {}, "came": {}, "can": {}, "cannot": {}, "cant": {}, "can't": {}, "caption": {}, "cause": {}, "causes": {}, "certain": {}, "certainly": {}, "changes": {}, "clearly": {}, "c'mon": {}, "co": {}, "co.": {}, "com": {}, "come": {}, "comes": {}, "concerning": {}, "consequently": {}, "consider": {}, "considering": {}, "contain": {}, "containing": {}, "contains": {}, "corresponding": {}, "could": {}, "couldn't": {}, "course": {}, "c's": {}, "currently": {}, "d": {}, "dare": {}, "daren't": {}, "definitely": {}, "described": {}, "despite": {}, "did": {}, "didn't": {}, "different": {}, "directly": {}, "do": {}, "does": {}, "doesn't": {}, "doing": {}, "done": {}, "don't": {}, "down": {}, "downwards": {}, "during": {}, "e": {}, "each": {}, "edu": {}, 
"eg": {}, "eight": {}, "eighty": {}, "either": {}, "else": {}, "elsewhere": {}, "end": {}, "ending": {}, "enough": {}, "entirely": {}, "especially": {}, "et": {}, "etc": {}, "even": {}, "ever": {}, "evermore": {}, "every": {}, "everybody": {}, "everyone": {}, "everything": {}, "everywhere": {}, "ex": {}, "exactly": {}, "example": {}, "except": {}, "f": {}, "fairly": {}, "far": {}, "farther": {}, "few": {}, "fewer": {}, "fifth": {}, "first": {}, "five": {}, "followed": {}, "following": {}, "follows": {}, "for": {}, "forever": {}, "former": {}, "formerly": {}, "forth": {}, "forward": {}, "found": {}, "four": {}, "from": {}, "further": {}, "furthermore": {}, "g": {}, "get": {}, "gets": {}, "getting": {}, "given": {}, "gives": {}, "go": {}, "goes": {}, "going": {}, "gone": {}, "got": {}, "gotten": {}, "greetings": {}, "h": {}, "had": {}, "hadn't": {}, "half": {}, "happens": {}, "hardly": {}, "has": {}, "hasn't": {}, "have": {}, "haven't": {}, "having": {}, "he": {}, "he'd": {}, "he'll": {}, "hello": {}, "help": {}, "hence": {}, "her": {}, "here": {}, "hereafter": {}, "hereby": {}, "herein": {}, "here's": {}, "hereupon": {}, "hers": {}, "herself": {}, "he's": {}, "hi": {}, "him": {}, "himself": {}, "his": {}, "hither": {}, "hopefully": {}, "how": {}, "howbeit": {}, "however": {}, "hundred": {}, "i": {}, "i'd": {}, "ie": {}, "if": {}, "ignored": {}, "i'll": {}, "i'm": {}, "immediate": {}, "in": {}, "inasmuch": {}, "inc": {}, "inc.": {}, "indeed": {}, "indicate": {}, "indicated": {}, "indicates": {}, "inner": {}, "inside": {}, "insofar": {}, "instead": {}, "into": {}, "inward": {}, "is": {}, "isn't": {}, "it": {}, "it'd": {}, "it'll": {}, "its": {}, "it's": {}, "itself": {}, "i've": {}, "j": {}, "just": {}, "k": {}, "keep": {}, "keeps": {}, "kept": {}, "know": {}, "known": {}, "knows": {}, "l": {}, "last": {}, "lately": {}, "later": {}, "latter": {}, "latterly": {}, "least": {}, "less": {}, "lest": {}, "let": {}, "let's": {}, "like": {}, "liked": {}, "likely": {}, 
"likewise": {}, "little": {}, "look": {}, "looking": {}, "looks": {}, "low": {}, "lower": {}, "ltd": {}, "m": {}, "made": {}, "mainly": {}, "make": {}, "makes": {}, "many": {}, "may": {}, "maybe": {}, "mayn't": {}, "me": {}, "mean": {}, "meantime": {}, "meanwhile": {}, "merely": {}, "might": {}, "mightn't": {}, "mine": {}, "minus": {}, "miss": {}, "more": {}, "moreover": {}, "most": {}, "mostly": {}, "mr": {}, "mrs": {}, "much": {}, "must": {}, "mustn't": {}, "my": {}, "myself": {}, "n": {}, "name": {}, "namely": {}, "nd": {}, "near": {}, "nearly": {}, "necessary": {}, "need": {}, "needn't": {}, "needs": {}, "neither": {}, "never": {}, "neverf": {}, "neverless": {}, "nevertheless": {}, "new": {}, "next": {}, "nine": {}, "ninety": {}, "no": {}, "nobody": {}, "non": {}, "none": {}, "nonetheless": {}, "noone": {}, "no-one": {}, "nor": {}, "normally": {}, "not": {}, "nothing": {}, "notwithstanding": {}, "novel": {}, "now": {}, "nowhere": {}, "o": {}, "obviously": {}, "of": {}, "off": {}, "often": {}, "oh": {}, "ok": {}, "okay": {}, "old": {}, "on": {}, "once": {}, "one": {}, "ones": {}, "one's": {}, "only": {}, "onto": {}, "opposite": {}, "or": {}, "other": {}, "others": {}, "otherwise": {}, "ought": {}, "oughtn't": {}, "our": {}, "ours": {}, "ourselves": {}, "out": {}, "outside": {}, "over": {}, "overall": {}, "own": {}, "p": {}, "particular": {}, "particularly": {}, "past": {}, "per": {}, "perhaps": {}, "placed": {}, "please": {}, "plus": {}, "possible": {}, "presumably": {}, "probably": {}, "provided": {}, "provides": {}, "q": {}, "que": {}, "quite": {}, "qv": {}, "r": {}, "rather": {}, "rd": {}, "re": {}, "really": {}, "reasonably": {}, "recent": {}, "recently": {}, "regarding": {}, "regardless": {}, "regards": {}, "relatively": {}, "respectively": {}, "right": {}, "round": {}, "s": {}, "said": {}, "same": {}, "saw": {}, "say": {}, "saying": {}, "says": {}, "second": {}, "secondly": {}, "see": {}, "seeing": {}, "seem": {}, "seemed": {}, "seeming": {}, "seems": {}, 
"seen": {}, "self": {}, "selves": {}, "sensible": {}, "sent": {}, "serious": {}, "seriously": {}, "seven": {}, "several": {}, "shall": {}, "shan't": {}, "she": {}, "she'd": {}, "she'll": {}, "she's": {}, "should": {}, "shouldn't": {}, "since": {}, "six": {}, "so": {}, "some": {}, "somebody": {}, "someday": {}, "somehow": {}, "someone": {}, "something": {}, "sometime": {}, "sometimes": {}, "somewhat": {}, "somewhere": {}, "soon": {}, "sorry": {}, "specified": {}, "specify": {}, "specifying": {}, "still": {}, "sub": {}, "such": {}, "sup": {}, "sure": {}, "t": {}, "take": {}, "taken": {}, "taking": {}, "tell": {}, "tends": {}, "th": {}, "than": {}, "thank": {}, "thanks": {}, "thanx": {}, "that": {}, "that'll": {}, "thats": {}, "that's": {}, "that've": {}, "the": {}, "their": {}, "theirs": {}, "them": {}, "themselves": {}, "then": {}, "thence": {}, "there": {}, "thereafter": {}, "thereby": {}, "there'd": {}, "therefore": {}, "therein": {}, "there'll": {}, "there're": {}, "theres": {}, "there's": {}, "thereupon": {}, "there've": {}, "these": {}, "they": {}, "they'd": {}, "they'll": {}, "they're": {}, "they've": {}, "thing": {}, "things": {}, "think": {}, "third": {}, "thirty": {}, "this": {}, "thorough": {}, "thoroughly": {}, "those": {}, "though": {}, "three": {}, "through": {}, "throughout": {}, "thru": {}, "thus": {}, "till": {}, "to": {}, "together": {}, "too": {}, "took": {}, "toward": {}, "towards": {}, "tried": {}, "tries": {}, "truly": {}, "try": {}, "trying": {}, "t's": {}, "twice": {}, "two": {}, "u": {}, "un": {}, "under": {}, "underneath": {}, "undoing": {}, "unfortunately": {}, "unless": {}, "unlike": {}, "unlikely": {}, "until": {}, "unto": {}, "up": {}, "upon": {}, "upwards": {}, "us": {}, "use": {}, "used": {}, "useful": {}, "uses": {}, "using": {}, "usually": {}, "v": {}, "value": {}, "various": {}, "versus": {}, "very": {}, "via": {}, "viz": {}, "vs": {}, "w": {}, "want": {}, "wants": {}, "was": {}, "wasn't": {}, "way": {}, "we": {}, "we'd": {}, 
"welcome": {}, "well": {}, "we'll": {}, "went": {}, "were": {}, "we're": {}, "weren't": {}, "we've": {}, "what": {}, "whatever": {}, "what'll": {}, "what's": {}, "what've": {}, "when": {}, "whence": {}, "whenever": {}, "where": {}, "whereafter": {}, "whereas": {}, "whereby": {}, "wherein": {}, "where's": {}, "whereupon": {}, "wherever": {}, "whether": {}, "which": {}, "whichever": {}, "while": {}, "whilst": {}, "whither": {}, "who": {}, "who'd": {}, "whoever": {}, "whole": {}, "who'll": {}, "whom": {}, "whomever": {}, "who's": {}, "whose": {}, "why": {}, "will": {}, "willing": {}, "wish": {}, "with": {}, "within": {}, "without": {}, "wonder": {}, "won't": {}, "would": {}, "wouldn't": {}, "x": {}, "y": {}, "yes": {}, "yet": {}, "you": {}, "you'd": {}, "you'll": {}, "your": {}, "you're": {}, "yours": {}, "yourself": {}, "yourselves": {}, "you've": {}, "z": {}, "zero": {}}
)

// Sorted pairs a category name with the probability computed for it.
// Classify builds a slice of these values so the categories can be ordered
// by probability.
type Sorted struct {
// Category is the category name.
Category string
// Probability is the score computed for Category.
Probability float64
}

// NaiveBayesClassifier holds the counts gathered during training and the
// decision threshold used by Classify. It is the value that is gob-encoded
// by SaveClassifierToFile and gob-decoded by NewClassifierWithReader.
type NaiveBayesClassifier struct {
// Words maps category -> word -> number of occurrences seen in training.
Words map[string]map[string]int
// TotalWords is the number of word occurrences seen across all categories.
TotalWords int
// CategoriesDocuments maps category -> number of documents trained.
CategoriesDocuments map[string]int
// TotalDocuments is the number of documents trained across all categories.
TotalDocuments int
// CategoriesWords maps category -> number of word occurrences in that category.
CategoriesWords map[string]int
// Threshold is the ratio between the highest and the second highest
// category probability that Classify requires to be exceeded before it
// selects the highest category.
Threshold float64
}

// New returns an empty classifier that is ready to be trained with Fit.
// threshold is stored as the Threshold used by Classify.
func New(threshold float64) *NaiveBayesClassifier {
	// The numeric totals are left at their zero value.
	return &NaiveBayesClassifier{
		Words:               map[string]map[string]int{},
		CategoriesDocuments: map[string]int{},
		CategoriesWords:     map[string]int{},
		Threshold:           threshold,
	}
}

// NewClassifierFromFile opens the file at path and decodes a gob-encoded
// classifier from it via NewClassifierWithReader.
//
// If the file cannot be opened, an empty classifier is returned together
// with the error from os.Open.
func NewClassifierFromFile(path string) (*NaiveBayesClassifier, error) {
	file, openErr := os.Open(path)
	if openErr != nil {
		return &NaiveBayesClassifier{}, openErr
	}
	defer file.Close()

	return NewClassifierWithReader(file)
}

// NewClassifierFromFileData decodes a gob-encoded classifier held in memory.
// data is wrapped in a reader and handed to NewClassifierWithReader.
func NewClassifierFromFileData(data []byte) (*NaiveBayesClassifier, error) {
	source := bytes.NewReader(data)
	return NewClassifierWithReader(source)
}

// NewClassifierWithReader decodes a gob-encoded classifier from reader.
//
// The classifier being decoded into is returned in every case; when the
// returned error is non-nil it may be empty or only partially populated.
func NewClassifierWithReader(reader io.Reader) (*NaiveBayesClassifier, error) {
	decoded := new(NaiveBayesClassifier)
	err := gob.NewDecoder(reader).Decode(decoded)
	return decoded, err
}

// SaveClassifierToFile gob-encodes the classifier into the file at path,
// creating the file or truncating it if it already exists.
//
// It returns the first error encountered while creating, encoding to, or
// closing the file.
func (c *NaiveBayesClassifier) SaveClassifierToFile(path string) (err error) {
	fl, err := os.Create(path)
	if err != nil {
		return err
	}
	// For a file that was written to, a failing Close can mean the data never
	// reached disk, so its error is reported unless an earlier one exists.
	defer func() {
		if closeErr := fl.Close(); err == nil {
			err = closeErr
		}
	}()

	// c is already a pointer; gob encodes the value it points to.
	return gob.NewEncoder(fl).Encode(c)
}

// fit adds a single document to the counts of category.
//
// The word map for category must already exist in c.Words (Fit creates it);
// writing into a missing one panics.
func (c *NaiveBayesClassifier) fit(category string, document string) {
	occurrences := countWords(document)
	categoryWords := c.Words[category]
	for token, n := range occurrences {
		categoryWords[token] += n
		c.CategoriesWords[category] += n
		c.TotalWords += n
	}
	c.TotalDocuments++
	c.CategoriesDocuments[category]++
}

// Fit trains the classifier. data maps a category name to the documents that
// belong to that category.
//
// Fit may be called more than once: counts for categories that were trained
// earlier are kept and extended, so the per-category counts stay consistent
// with TotalWords and TotalDocuments. It also works on a zero-value
// classifier by creating any map that is still nil.
func (c *NaiveBayesClassifier) Fit(data map[string][]string) {
	// Writing to a nil map panics, so make sure the containers exist.
	if c.Words == nil {
		c.Words = make(map[string]map[string]int)
	}
	if c.CategoriesDocuments == nil {
		c.CategoriesDocuments = make(map[string]int)
	}
	if c.CategoriesWords == nil {
		c.CategoriesWords = make(map[string]int)
	}

	for category, documents := range data {
		// Only initialize categories that are new. Resetting an existing one
		// would discard its counts while the global totals still include them.
		if c.Words[category] == nil {
			c.Words[category] = make(map[string]int)
		}
		if _, known := c.CategoriesDocuments[category]; !known {
			c.CategoriesDocuments[category] = 0
		}
		if _, known := c.CategoriesWords[category]; !known {
			c.CategoriesWords[category] = 0
		}

		for _, document := range documents {
			c.fit(category, document)
		}
	}
}

// Classify returns the category that best matches document.
//
// The categories are ranked by the value Probabilities reports for them. The
// top category is returned only when the ratio between its probability and
// the runner-up's exceeds c.Threshold; otherwise "other" is returned.
//
// With no trained category the result is "other". With exactly one trained
// category there is no runner-up to compare against, so that category is
// returned as long as its probability is positive.
func (c *NaiveBayesClassifier) Classify(document string) (category string) {
	const fallback = "other"

	// get all the probabilities of each category
	prob := c.Probabilities(document)
	if len(prob) == 0 {
		return fallback
	}

	// sort the categories according to probabilities; equal probabilities are
	// ordered by name so the result does not depend on map iteration order
	ranked := make([]Sorted, 0, len(prob))
	for name, p := range prob {
		ranked = append(ranked, Sorted{Category: name, Probability: p})
	}
	sort.Slice(ranked, func(i, j int) bool {
		if ranked[i].Probability != ranked[j].Probability {
			return ranked[i].Probability > ranked[j].Probability
		}
		return ranked[i].Category < ranked[j].Category
	})

	best := ranked[0]
	if len(ranked) == 1 {
		if best.Probability > 0 {
			return best.Category
		}
		return fallback
	}

	// if the highest probability is above threshold select that
	if best.Probability/ranked[1].Probability > c.Threshold {
		return best.Category
	}
	return fallback
}

// Probabilities returns, for every category present in c.Words, the value
// computed by pCategoryDocument for document, keyed by category name.
func (c *NaiveBayesClassifier) Probabilities(document string) (p map[string]float64) {
	scores := make(map[string]float64)
	for name := range c.Words {
		scores[name] = c.pCategoryDocument(name, document)
	}
	return scores
}

// pDocumentCategory computes p(document | category) as the product of
// pWordCategory over the distinct words that countWords extracts from
// document. How often a word occurs in document does not affect the result.
func (c *NaiveBayesClassifier) pDocumentCategory(category string, document string) (p float64) {
	likelihood := 1.0
	for token := range countWords(document) {
		likelihood *= c.pWordCategory(category, token)
	}
	return likelihood
}

// pWordCategory computes p(word | category) with add-one smoothing on the
// numerator: (occurrences of word in category + 1) / (words in category).
//
// word must already be normalized the way countWords normalizes the keys of
// c.Words (lowercased and stemmed). It is deliberately not stemmed again
// here: stemming an already stemmed word can change it further, and the
// result would then no longer match the key stored during training.
//
// A category without any counted words yields 0 rather than dividing by
// zero, which would otherwise produce +Inf.
func (c *NaiveBayesClassifier) pWordCategory(category string, word string) float64 {
	total := c.CategoriesWords[category]
	if total == 0 {
		return 0
	}
	return float64(c.Words[category][word]+1) / float64(total)
}

// pCategory computes p(category): the share of all trained documents that
// belong to category.
func (c *NaiveBayesClassifier) pCategory(category string) float64 {
	inCategory := float64(c.CategoriesDocuments[category])
	overall := float64(c.TotalDocuments)
	return inCategory / overall
}

// pCategoryDocument scores category for document as
// p(document | category) * p(category). The product is not divided by
// p(document), so it is only meaningful relative to other categories.
func (c *NaiveBayesClassifier) pCategoryDocument(category string, document string) float64 {
	likelihood := c.pDocumentCategory(category, document)
	prior := c.pCategory(category)
	return likelihood * prior
}

// clean up and split words in document, then stem each word and count the occurrence
func countWords(document string) (wordCount map[string]int) {
cleaned := cleanDocument(document)
words := strings.Split(cleaned, " ")
wordCount = make(map[string]int)
for _, word := range words {
if _, ok := stopWords[word]; !ok {
key := stem(strings.ToLower(word))
wordCount[key]++
}
}
return
}

// cleanDocument returns text with every match of the cleaner pattern
// (characters that are neither word characters nor whitespace) deleted.
func cleanDocument(text string) string {
	const replacement = ""
	return cleaner.ReplaceAllString(text, replacement)
}

// stem reduces word to its stem using the English Snowball stemmer.
// If the stemmer reports an error, word is returned unchanged.
func stem(word string) string {
	stemmed, err := snowball.Stem(word, "english", true)
	if err != nil {
		// Best effort: an unstemmed word is still usable as a key.
		return word
	}
	return stemmed
}
42 changes: 42 additions & 0 deletions mlutils/naive_bayes/naive_bayes_classifier_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
package naive_bayes

import (
"testing"

"github.com/stretchr/testify/assert"
)

// TestNaiveBayesClassifier trains a classifier on a small set of sports
// positions and checks that a known position is assigned to its sport.
func TestNaiveBayesClassifier(t *testing.T) {

	// Create a new Naive Bayes Classifier
	threshold := 1.1
	nb := New(threshold)

	// Create a new training set
	trainingSet := map[string][]string{
		"Baseball": {
			"Pitcher",
			"Shortstop",
			"Outfield",
		},
		"Basketball": {
			"Point Guard",
			"Shooting Guard",
			"Small Forward",
			"Power Forward",
			"Center",
		},
		"Soccer": {
			"Goalkeeper",
			"Defender",
			"Midfielder",
			"Forward",
		},
	}

	// Train the classifier
	nb.Fit(trainingSet)

	// then: assert.Equal takes the expected value first and the actual value
	// second, so a failure message labels the two correctly.
	assert.Equal(t, "Basketball", nb.Classify("Point guard"))
}

0 comments on commit dca79b0

Please sign in to comment.