-
Notifications
You must be signed in to change notification settings - Fork 0
/
Logistic_Word2Vec.py
128 lines (97 loc) · 4.23 KB
/
Logistic_Word2Vec.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
#!/usr/bin/env python
# coding: utf-8
# In[1]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report,accuracy_score
import pandas as pd
from sklearn.linear_model import LogisticRegression
#from gensim.sklearn_api import W2VTransformer
import nltk as nl
import gensim
from nltk.corpus import stopwords
from tqdm import tqdm
from sklearn import utils
from gensim.models.doc2vec import TaggedDocument
# Fetch only the Punkt tokenizer data used by nl.sent_tokenize / nl.word_tokenize
# below. A bare nl.download() opens NLTK's interactive downloader, which
# blocks unattended (batch / CI) runs of this script.
# NOTE(review): recent NLTK releases may additionally need the "punkt_tab"
# resource - confirm against the installed NLTK version.
nl.download("punkt")
# In[2]:
# Load the labelled dataset; the code below reads its "text" and
# "emotions_class" columns.
data_after_outlier1 = pd.read_csv("train_4.csv")
#data_after_outlier1 = data_after_outlier1.reset_index()
# In[3]:
from sklearn.model_selection import train_test_split
# Feature frame (single "text" column) and label series. In the visible part
# of this file they are referenced only by the commented-out experiments below.
X = data_after_outlier1[["text"]]
y = data_after_outlier1['emotions_class']
#y = df_test_under_Likes['bin_class_Retweet']
#X_train, X_test, y_train, y_test = train_test_split(X[:5], y[:5], test_size=0.30, random_state=42)
# In[ ]:
# pipeline = Pipeline([
# ('vect', CountVectorizer()),
# ('tfidf',W2VTransformer(min_count = 2)),
# ('clf', LogisticRegression())
# ])
# grid_search = GridSearchCV(pipeline,param_grid = {'clf__solver': ('newton-cg', 'lbfgs', 'liblinear', 'saga')}, cv=10,
# n_jobs=-1, verbose=1)
# print("Performing grid search...")
# #if type() is str:
# # tweet = tweet.lower()
# grid_obj = grid_search.fit(list(X_train.text), list(y_train))
# print("Best score: %0.3f" % grid_obj.best_score_)
# y_true, y_pred = y_test, grid_obj.predict(list(X_test.text))
# accuracy_score(y_true,y_pred)
# # print('Model best estimator: {}'.format(grid_obj.best_estimator_))
# print(X_train)
# tokens = [nl.word_tokenize(sentences) for sentences in X_train.text]
# print(tokens)
# clf = gensim.models.Word2Vec(tokens, size=2, min_count=1, workers=4)
# print("\n Training the word2vec model...\n")
# # reducing the epochs will decrease the computation time
# #model.train(tokens, total_examples=2, epochs=10)
# clf.LogisticRegression( solver='saga', multi_class='multinomial')
# #print(model.wv.syn0)
# #print((y_train[:len(model.wv.syn0)]))
# clf.fit(X_train, y_train)
# y_true, y_pred = y_test, clf.predict((X_test.text))
# print("accuracy_score",accuracy_score(y_true,y_pred))
# Hold out 30% of the rows for evaluation; random_state=42 keeps the split
# reproducible between runs.
train, test = train_test_split(data_after_outlier1, test_size=0.3, random_state=42)
def tokenize_text(text):
    """Split *text* into lower-cased word tokens using NLTK.

    The text is split into sentences and each sentence into words; tokens
    shorter than two characters are dropped.

    Args:
        text: Raw document text. Anything that is not a non-empty string
            (e.g. ``None`` or the float NaN pandas uses for a missing CSV
            cell) is treated as an empty document.

    Returns:
        list: The surviving tokens, lower-cased, in document order.
    """
    # Missing values are not valid tokenizer input; map them (and the empty
    # string, which tokenizes to nothing anyway) to an empty token list.
    if not isinstance(text, str) or not text:
        return []

    tokens = []
    for sent in nl.sent_tokenize(text):
        tokens.extend(
            word.lower() for word in nl.word_tokenize(sent) if len(word) >= 2
        )
    return tokens
def _as_tagged_document(row):
    """Wrap one DataFrame row as a TaggedDocument tagged with its emotion class."""
    return TaggedDocument(words=tokenize_text(row.text), tags=[row.emotions_class])


# Convert every row of both splits; the row's class label is the document's
# only tag.
train_tagged = train.apply(_as_tagged_document, axis=1)
test_tagged = test.apply(_as_tagged_document, axis=1)
print(train_tagged.values)
# Build and train the Doc2Vec model on the tagged training documents.
#
# NOTE: dm=1 selects the distributed-memory (PV-DM) algorithm even though the
# variable is called "dbow"; the name is kept because later code refers to it.
#
# Training is done with ONE train() call. The previous version called
# train(epochs=1) thirty times while lowering alpha by 0.002 by hand, which is
# the pattern the gensim documentation discourages, and which left
# model.alpha == model.min_alpha ~= 0.005 after training.
# Letting gensim decay the learning rate from alpha=0.065 to min_alpha=0.005
# over epochs=30 covers the same range (0.065 - 30 * 0.002 = 0.005).
# Per-epoch reshuffling is dropped; train_test_split has already shuffled the rows.
# NOTE(review): on gensim versions where infer_vector defaults to the model's
# alpha/min_alpha, inference now also uses 0.065 -> 0.005 - confirm.
model_dbow = gensim.models.Doc2Vec(
    dm=1,
    dm_mean=1,
    vector_size=200,
    window=10,
    negative=5,
    min_count=1,
    workers=5,
    alpha=0.065,
    min_alpha=0.005,
    epochs=30,
)
# Materialise the corpus once (with a progress bar) so vocabulary building
# and training see the same list of documents.
train_corpus = list(tqdm(train_tagged.values))
model_dbow.build_vocab(train_corpus)
model_dbow.train(
    train_corpus,
    total_examples=len(train_corpus),
    epochs=model_dbow.epochs,
)
def vec_for_learning(model, tagged_docs):
    """Infer a feature vector for every tagged document.

    Args:
        model: Object exposing ``infer_vector(words, steps=...)``; in this
            script, the trained Doc2Vec model.
        tagged_docs: Iterable of documents with ``tags`` and ``words``
            attributes. The first tag of each document is used as its label.

    Returns:
        tuple: ``(targets, regressors)`` - two equal-length tuples holding
        the labels and the inferred vectors, in input order. Both are empty
        tuples when ``tagged_docs`` is empty.
    """
    # NOTE(review): gensim 4 renamed infer_vector's ``steps`` argument to
    # ``epochs`` - confirm against the installed gensim version.
    pairs = [
        (doc.tags[0], model.infer_vector(doc.words, steps=20))
        for doc in tagged_docs
    ]
    if not pairs:
        # zip(*[]) yields nothing, so the two-way unpack below would raise
        # ValueError on an empty input.
        return (), ()
    targets, regressors = zip(*pairs)
    return targets, regressors
from sklearn.metrics import accuracy_score, f1_score

# Turn both splits into (labels, inferred document vectors) for the classifier.
y_train, X_train = vec_for_learning(model_dbow, train_tagged)
y_test, X_test = vec_for_learning(model_dbow, test_tagged)

# Large C means very weak regularisation; fit() returns the estimator itself.
logreg = LogisticRegression(n_jobs=1, C=1e5).fit(X_train, y_train)
y_pred = logreg.predict(X_test)

# Report accuracy and class-weighted F1 on the held-out split.
test_accuracy = accuracy_score(y_test, y_pred)
print('Testing accuracy %s' % test_accuracy)
test_f1 = f1_score(y_test, y_pred, average='weighted')
print('Testing F1 score: {}'.format(test_f1))