-
Notifications
You must be signed in to change notification settings - Fork 32
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
e9ee9ab
commit dca79b0
Showing
5 changed files
with
246 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
package mlutils |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,200 @@ | ||
//ref: https://github.com/sausheong/gonb | ||
|
||
package naive_bayes | ||
|
||
import ( | ||
"bytes" | ||
"encoding/gob" | ||
"io" | ||
"os" | ||
"regexp" | ||
"sort" | ||
"strings" | ||
|
||
"github.com/kljensen/snowball" | ||
) | ||
|
||
var ( | ||
cleaner = regexp.MustCompile(`[^\w\s]`) | ||
stopWords = map[string]struct{}{"a": {}, "able": {}, "about": {}, "above": {}, "abroad": {}, "according": {}, "accordingly": {}, "across": {}, "actually": {}, "adj": {}, "after": {}, "afterwards": {}, "again": {}, "against": {}, "ago": {}, "ahead": {}, "ain't": {}, "all": {}, "allow": {}, "allows": {}, "almost": {}, "alone": {}, "along": {}, "alongside": {}, "already": {}, "also": {}, "although": {}, "always": {}, "am": {}, "amid": {}, "amidst": {}, "among": {}, "amongst": {}, "an": {}, "and": {}, "another": {}, "any": {}, "anybody": {}, "anyhow": {}, "anyone": {}, "anything": {}, "anyway": {}, "anyways": {}, "anywhere": {}, "apart": {}, "appear": {}, "appreciate": {}, "appropriate": {}, "are": {}, "aren't": {}, "around": {}, "as": {}, "a's": {}, "aside": {}, "ask": {}, "asking": {}, "associated": {}, "at": {}, "available": {}, "away": {}, "awfully": {}, "b": {}, "back": {}, "backward": {}, "backwards": {}, "be": {}, "became": {}, "because": {}, "become": {}, "becomes": {}, "becoming": {}, "been": {}, "before": {}, "beforehand": {}, "begin": {}, "behind": {}, "being": {}, "believe": {}, "below": {}, "beside": {}, "besides": {}, "best": {}, "better": {}, "between": {}, "beyond": {}, "both": {}, "brief": {}, "but": {}, "by": {}, "c": {}, "came": {}, "can": {}, "cannot": {}, "cant": {}, "can't": {}, "caption": {}, "cause": {}, "causes": {}, "certain": {}, "certainly": {}, "changes": {}, "clearly": {}, "c'mon": {}, "co": {}, "co.": {}, "com": {}, "come": {}, "comes": {}, "concerning": {}, "consequently": {}, "consider": {}, "considering": {}, "contain": {}, "containing": {}, "contains": {}, "corresponding": {}, "could": {}, "couldn't": {}, "course": {}, "c's": {}, "currently": {}, "d": {}, "dare": {}, "daren't": {}, "definitely": {}, "described": {}, "despite": {}, "did": {}, "didn't": {}, "different": {}, "directly": {}, "do": {}, "does": {}, "doesn't": {}, "doing": {}, "done": {}, "don't": {}, "down": {}, "downwards": {}, "during": {}, "e": {}, "each": {}, "edu": {}, 
"eg": {}, "eight": {}, "eighty": {}, "either": {}, "else": {}, "elsewhere": {}, "end": {}, "ending": {}, "enough": {}, "entirely": {}, "especially": {}, "et": {}, "etc": {}, "even": {}, "ever": {}, "evermore": {}, "every": {}, "everybody": {}, "everyone": {}, "everything": {}, "everywhere": {}, "ex": {}, "exactly": {}, "example": {}, "except": {}, "f": {}, "fairly": {}, "far": {}, "farther": {}, "few": {}, "fewer": {}, "fifth": {}, "first": {}, "five": {}, "followed": {}, "following": {}, "follows": {}, "for": {}, "forever": {}, "former": {}, "formerly": {}, "forth": {}, "forward": {}, "found": {}, "four": {}, "from": {}, "further": {}, "furthermore": {}, "g": {}, "get": {}, "gets": {}, "getting": {}, "given": {}, "gives": {}, "go": {}, "goes": {}, "going": {}, "gone": {}, "got": {}, "gotten": {}, "greetings": {}, "h": {}, "had": {}, "hadn't": {}, "half": {}, "happens": {}, "hardly": {}, "has": {}, "hasn't": {}, "have": {}, "haven't": {}, "having": {}, "he": {}, "he'd": {}, "he'll": {}, "hello": {}, "help": {}, "hence": {}, "her": {}, "here": {}, "hereafter": {}, "hereby": {}, "herein": {}, "here's": {}, "hereupon": {}, "hers": {}, "herself": {}, "he's": {}, "hi": {}, "him": {}, "himself": {}, "his": {}, "hither": {}, "hopefully": {}, "how": {}, "howbeit": {}, "however": {}, "hundred": {}, "i": {}, "i'd": {}, "ie": {}, "if": {}, "ignored": {}, "i'll": {}, "i'm": {}, "immediate": {}, "in": {}, "inasmuch": {}, "inc": {}, "inc.": {}, "indeed": {}, "indicate": {}, "indicated": {}, "indicates": {}, "inner": {}, "inside": {}, "insofar": {}, "instead": {}, "into": {}, "inward": {}, "is": {}, "isn't": {}, "it": {}, "it'd": {}, "it'll": {}, "its": {}, "it's": {}, "itself": {}, "i've": {}, "j": {}, "just": {}, "k": {}, "keep": {}, "keeps": {}, "kept": {}, "know": {}, "known": {}, "knows": {}, "l": {}, "last": {}, "lately": {}, "later": {}, "latter": {}, "latterly": {}, "least": {}, "less": {}, "lest": {}, "let": {}, "let's": {}, "like": {}, "liked": {}, "likely": {}, 
"likewise": {}, "little": {}, "look": {}, "looking": {}, "looks": {}, "low": {}, "lower": {}, "ltd": {}, "m": {}, "made": {}, "mainly": {}, "make": {}, "makes": {}, "many": {}, "may": {}, "maybe": {}, "mayn't": {}, "me": {}, "mean": {}, "meantime": {}, "meanwhile": {}, "merely": {}, "might": {}, "mightn't": {}, "mine": {}, "minus": {}, "miss": {}, "more": {}, "moreover": {}, "most": {}, "mostly": {}, "mr": {}, "mrs": {}, "much": {}, "must": {}, "mustn't": {}, "my": {}, "myself": {}, "n": {}, "name": {}, "namely": {}, "nd": {}, "near": {}, "nearly": {}, "necessary": {}, "need": {}, "needn't": {}, "needs": {}, "neither": {}, "never": {}, "neverf": {}, "neverless": {}, "nevertheless": {}, "new": {}, "next": {}, "nine": {}, "ninety": {}, "no": {}, "nobody": {}, "non": {}, "none": {}, "nonetheless": {}, "noone": {}, "no-one": {}, "nor": {}, "normally": {}, "not": {}, "nothing": {}, "notwithstanding": {}, "novel": {}, "now": {}, "nowhere": {}, "o": {}, "obviously": {}, "of": {}, "off": {}, "often": {}, "oh": {}, "ok": {}, "okay": {}, "old": {}, "on": {}, "once": {}, "one": {}, "ones": {}, "one's": {}, "only": {}, "onto": {}, "opposite": {}, "or": {}, "other": {}, "others": {}, "otherwise": {}, "ought": {}, "oughtn't": {}, "our": {}, "ours": {}, "ourselves": {}, "out": {}, "outside": {}, "over": {}, "overall": {}, "own": {}, "p": {}, "particular": {}, "particularly": {}, "past": {}, "per": {}, "perhaps": {}, "placed": {}, "please": {}, "plus": {}, "possible": {}, "presumably": {}, "probably": {}, "provided": {}, "provides": {}, "q": {}, "que": {}, "quite": {}, "qv": {}, "r": {}, "rather": {}, "rd": {}, "re": {}, "really": {}, "reasonably": {}, "recent": {}, "recently": {}, "regarding": {}, "regardless": {}, "regards": {}, "relatively": {}, "respectively": {}, "right": {}, "round": {}, "s": {}, "said": {}, "same": {}, "saw": {}, "say": {}, "saying": {}, "says": {}, "second": {}, "secondly": {}, "see": {}, "seeing": {}, "seem": {}, "seemed": {}, "seeming": {}, "seems": {}, 
"seen": {}, "self": {}, "selves": {}, "sensible": {}, "sent": {}, "serious": {}, "seriously": {}, "seven": {}, "several": {}, "shall": {}, "shan't": {}, "she": {}, "she'd": {}, "she'll": {}, "she's": {}, "should": {}, "shouldn't": {}, "since": {}, "six": {}, "so": {}, "some": {}, "somebody": {}, "someday": {}, "somehow": {}, "someone": {}, "something": {}, "sometime": {}, "sometimes": {}, "somewhat": {}, "somewhere": {}, "soon": {}, "sorry": {}, "specified": {}, "specify": {}, "specifying": {}, "still": {}, "sub": {}, "such": {}, "sup": {}, "sure": {}, "t": {}, "take": {}, "taken": {}, "taking": {}, "tell": {}, "tends": {}, "th": {}, "than": {}, "thank": {}, "thanks": {}, "thanx": {}, "that": {}, "that'll": {}, "thats": {}, "that's": {}, "that've": {}, "the": {}, "their": {}, "theirs": {}, "them": {}, "themselves": {}, "then": {}, "thence": {}, "there": {}, "thereafter": {}, "thereby": {}, "there'd": {}, "therefore": {}, "therein": {}, "there'll": {}, "there're": {}, "theres": {}, "there's": {}, "thereupon": {}, "there've": {}, "these": {}, "they": {}, "they'd": {}, "they'll": {}, "they're": {}, "they've": {}, "thing": {}, "things": {}, "think": {}, "third": {}, "thirty": {}, "this": {}, "thorough": {}, "thoroughly": {}, "those": {}, "though": {}, "three": {}, "through": {}, "throughout": {}, "thru": {}, "thus": {}, "till": {}, "to": {}, "together": {}, "too": {}, "took": {}, "toward": {}, "towards": {}, "tried": {}, "tries": {}, "truly": {}, "try": {}, "trying": {}, "t's": {}, "twice": {}, "two": {}, "u": {}, "un": {}, "under": {}, "underneath": {}, "undoing": {}, "unfortunately": {}, "unless": {}, "unlike": {}, "unlikely": {}, "until": {}, "unto": {}, "up": {}, "upon": {}, "upwards": {}, "us": {}, "use": {}, "used": {}, "useful": {}, "uses": {}, "using": {}, "usually": {}, "v": {}, "value": {}, "various": {}, "versus": {}, "very": {}, "via": {}, "viz": {}, "vs": {}, "w": {}, "want": {}, "wants": {}, "was": {}, "wasn't": {}, "way": {}, "we": {}, "we'd": {}, 
"welcome": {}, "well": {}, "we'll": {}, "went": {}, "were": {}, "we're": {}, "weren't": {}, "we've": {}, "what": {}, "whatever": {}, "what'll": {}, "what's": {}, "what've": {}, "when": {}, "whence": {}, "whenever": {}, "where": {}, "whereafter": {}, "whereas": {}, "whereby": {}, "wherein": {}, "where's": {}, "whereupon": {}, "wherever": {}, "whether": {}, "which": {}, "whichever": {}, "while": {}, "whilst": {}, "whither": {}, "who": {}, "who'd": {}, "whoever": {}, "whole": {}, "who'll": {}, "whom": {}, "whomever": {}, "who's": {}, "whose": {}, "why": {}, "will": {}, "willing": {}, "wish": {}, "with": {}, "within": {}, "without": {}, "wonder": {}, "won't": {}, "would": {}, "wouldn't": {}, "x": {}, "y": {}, "yes": {}, "yet": {}, "you": {}, "you'd": {}, "you'll": {}, "your": {}, "you're": {}, "yours": {}, "yourself": {}, "yourselves": {}, "you've": {}, "z": {}, "zero": {}} | ||
) | ||
|
||
// Sorted pairs a category name with the score computed for it, so that
// categories can be ordered by that score.
type Sorted struct {
	// Category is the category label.
	Category string
	// Probability is the score associated with Category.
	Probability float64
}
|
||
// NaiveBayesClassifier holds the counts gathered during training and the
// threshold used when classifying documents.
type NaiveBayesClassifier struct {
	// Words maps category -> word (as produced by countWords) -> occurrences.
	Words map[string]map[string]int
	// TotalWords is the number of counted words across all categories.
	TotalWords int
	// CategoriesDocuments maps category -> number of training documents.
	CategoriesDocuments map[string]int
	// TotalDocuments is the number of training documents across all categories.
	TotalDocuments int
	// CategoriesWords maps category -> number of counted words in it.
	CategoriesWords map[string]int
	// Threshold is the ratio between the best and second-best category
	// scores that Classify requires before accepting the best category.
	Threshold float64
}
|
||
// create and initialize the classifier | ||
func New(threshold float64) *NaiveBayesClassifier { | ||
classifier := &NaiveBayesClassifier{ | ||
Words: make(map[string]map[string]int), | ||
TotalWords: 0, | ||
CategoriesDocuments: make(map[string]int), | ||
TotalDocuments: 0, | ||
CategoriesWords: make(map[string]int), | ||
Threshold: threshold, | ||
} | ||
return classifier | ||
} | ||
|
||
// create and initialize the classifier from a file | ||
func NewClassifierFromFile(path string) (*NaiveBayesClassifier, error) { | ||
classifier := &NaiveBayesClassifier{} | ||
|
||
fl, err := os.Open(path) | ||
if err != nil { | ||
return classifier, err | ||
} | ||
defer fl.Close() | ||
|
||
return NewClassifierWithReader(fl) | ||
} | ||
|
||
// create and initialize the classifier from a file data | ||
func NewClassifierFromFileData(data []byte) (*NaiveBayesClassifier, error) { | ||
return NewClassifierWithReader(bytes.NewReader(data)) | ||
} | ||
|
||
// create and initialize the classifier from a file data | ||
func NewClassifierWithReader(reader io.Reader) (*NaiveBayesClassifier, error) { | ||
classifier := &NaiveBayesClassifier{} | ||
err := gob.NewDecoder(reader).Decode(classifier) | ||
if err != nil { | ||
return classifier, err | ||
} | ||
|
||
return classifier, nil | ||
} | ||
|
||
// save the classifier to a file | ||
func (c *NaiveBayesClassifier) SaveClassifierToFile(path string) error { | ||
fl, err := os.Create(path) | ||
if err != nil { | ||
return err | ||
} | ||
defer fl.Close() | ||
|
||
err = gob.NewEncoder(fl).Encode(&c) | ||
if err != nil { | ||
return err | ||
} | ||
|
||
return nil | ||
} | ||
|
||
// Train the classifier | ||
func (c *NaiveBayesClassifier) fit(category string, document string) { | ||
for word, count := range countWords(document) { | ||
c.Words[category][word] += count | ||
c.CategoriesWords[category] += count | ||
c.TotalWords += count | ||
} | ||
c.CategoriesDocuments[category]++ | ||
c.TotalDocuments++ | ||
} | ||
|
||
func (c *NaiveBayesClassifier) Fit(data map[string][]string) { | ||
for category, documents := range data { | ||
c.Words[category] = make(map[string]int) | ||
c.CategoriesDocuments[category] = 0 | ||
c.CategoriesWords[category] = 0 | ||
|
||
for _, document := range documents { | ||
c.fit(category, document) | ||
} | ||
} | ||
} | ||
|
||
// Classify a document | ||
func (c *NaiveBayesClassifier) Classify(document string) (category string) { | ||
// get all the probabilities of each category | ||
prob := c.Probabilities(document) | ||
|
||
// sort the categories according to probabilities | ||
var sp []Sorted | ||
for c, p := range prob { | ||
sp = append(sp, Sorted{c, p}) | ||
} | ||
sort.Slice(sp, func(i, j int) bool { | ||
return sp[i].Probability > sp[j].Probability | ||
}) | ||
|
||
// if the highest probability is above threshold select that | ||
if sp[0].Probability/sp[1].Probability > c.Threshold { | ||
category = sp[0].Category | ||
} else { | ||
category = "other" | ||
} | ||
|
||
return | ||
} | ||
|
||
// Probabilities of each category | ||
func (c *NaiveBayesClassifier) Probabilities(document string) (p map[string]float64) { | ||
p = make(map[string]float64) | ||
for category := range c.Words { | ||
p[category] = c.pCategoryDocument(category, document) | ||
} | ||
return | ||
} | ||
|
||
// p (document | category) | ||
func (c *NaiveBayesClassifier) pDocumentCategory(category string, document string) (p float64) { | ||
p = 1.0 | ||
for word := range countWords(document) { | ||
p = p * c.pWordCategory(category, word) | ||
} | ||
return p | ||
} | ||
|
||
func (c *NaiveBayesClassifier) pWordCategory(category string, word string) float64 { | ||
return float64(c.Words[category][stem(word)]+1) / float64(c.CategoriesWords[category]) | ||
} | ||
|
||
// p (category) | ||
func (c *NaiveBayesClassifier) pCategory(category string) float64 { | ||
return float64(c.CategoriesDocuments[category]) / float64(c.TotalDocuments) | ||
} | ||
|
||
// p (category | document) | ||
func (c *NaiveBayesClassifier) pCategoryDocument(category string, document string) float64 { | ||
return c.pDocumentCategory(category, document) * c.pCategory(category) | ||
} | ||
|
||
// clean up and split words in document, then stem each word and count the occurrence | ||
func countWords(document string) (wordCount map[string]int) { | ||
cleaned := cleanDocument(document) | ||
words := strings.Split(cleaned, " ") | ||
wordCount = make(map[string]int) | ||
for _, word := range words { | ||
if _, ok := stopWords[word]; !ok { | ||
key := stem(strings.ToLower(word)) | ||
wordCount[key]++ | ||
} | ||
} | ||
return | ||
} | ||
|
||
func cleanDocument(text string) string { | ||
return cleaner.ReplaceAllString(text, "") | ||
} | ||
|
||
// stem a word using the Snowball algorithm | ||
func stem(word string) string { | ||
stemmed, err := snowball.Stem(word, "english", true) | ||
if err == nil { | ||
return stemmed | ||
} | ||
// fmt.Println("Cannot stem word:", word) | ||
return word | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,42 @@ | ||
package naive_bayes | ||
|
||
import ( | ||
"testing" | ||
|
||
"github.com/stretchr/testify/assert" | ||
) | ||
|
||
func TestNaiveBayesClassifier(t *testing.T) { | ||
|
||
// Create a new Naive Bayes Classifier | ||
threshold := 1.1 | ||
nb := New(threshold) | ||
|
||
// Create a new training set | ||
trainingSet := map[string][]string{ | ||
"Baseball": { | ||
"Pitcher", | ||
"Shortstop", | ||
"Outfield", | ||
}, | ||
"Basketball": { | ||
"Point Guard", | ||
"Shooting Guard", | ||
"Small Forward", | ||
"Power Forward", | ||
"Center", | ||
}, | ||
"Soccer": { | ||
"Goalkeeper", | ||
"Defender", | ||
"Midfielder", | ||
"Forward", | ||
}, | ||
} | ||
|
||
// Train the classifier | ||
nb.Fit(trainingSet) | ||
|
||
//then | ||
assert.Equal(t, nb.Classify("Point guard"), "Basketball") | ||
} |