-
Notifications
You must be signed in to change notification settings - Fork 0
/
Maths.py
104 lines (84 loc) · 4.55 KB
/
Maths.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
#importing basic libraries
import re
import math
import numpy as np
# Angle between two term vectors; the script uses it to rank documents
# against a query (smaller angle = more similar).
def calc_angle(x, y):
    """Return the angle, in degrees, between the vectors ``x`` and ``y``.

    Args:
        x: First vector (any 1-D sequence accepted by ``numpy.dot``).
        y: Second vector, with the same length as ``x``.

    Returns:
        The angle as a float. If either vector has zero magnitude the angle
        is undefined and ``nan`` is returned.
    """
    norm_x = np.linalg.norm(x)
    norm_y = np.linalg.norm(y)
    if norm_x == 0 or norm_y == 0:
        # Dividing 0 by 0 would also yield nan, but only after numpy emits a
        # RuntimeWarning; return the same value explicitly instead.
        return float("nan")
    cos_theta = np.dot(x, y) / (norm_x * norm_y)
    # Rounding can leave the ratio marginally outside [-1, 1] for parallel or
    # anti-parallel vectors, which would make math.acos raise ValueError.
    # np.clip (unlike min/max) still lets a nan input propagate.
    cos_theta = float(np.clip(cos_theta, -1.0, 1.0))
    return math.degrees(math.acos(cos_theta))
# Build the vocabulary of the corpus together with each word's frequency.
def make_dictionary(document):
    """Count how often each word occurs in the text file ``document``.

    The whole file is lower-cased, every run of characters other than
    ``a``-``z``, space and apostrophe is replaced by a single space, and the
    result is split on whitespace.

    Args:
        document: Path of the text file to read.

    Returns:
        A dict mapping each word to its number of occurrences in the file.
    """
    with open(document, 'r') as handle:
        text = handle.read().lower()
    # Raw string: the previous non-raw literal relied on the invalid escape
    # "\ ", which newer Python versions warn about. Digits, punctuation and
    # newlines all fall outside the class and therefore become separators.
    text = re.sub(r"[^a-z ']+", " ", text)
    dictionary = {}
    for word in text.split():
        dictionary[word] = dictionary.get(word, 0) + 1
    return dictionary
# Build the inverted index: word -> ids of the documents that contain it.
def make_invertedIndex(document):
    """Map every word of ``document`` to the documents it appears in.

    Each line of the file is treated as one document; documents are numbered
    from 1 in file order (blank lines still consume a number). Words are the
    whitespace-separated tokens of a line, taken verbatim (no case folding or
    punctuation stripping).

    Args:
        document: Path of the text file holding one document per line.

    Returns:
        A dict mapping each word to a list of document numbers in ascending
        order, without duplicates.
    """
    with open(document, 'r') as f:
        lines = f.read().splitlines()
    inverted = {}
    for idx, doc in enumerate(lines, 1):
        for word in doc.split():
            postings = inverted.setdefault(word, [])
            # Documents are visited in increasing id order, so a repeated
            # word can only duplicate the most recently appended id; checking
            # the last entry avoids scanning the whole list.
            if not postings or postings[-1] != idx:
                postings.append(idx)
    return inverted
# ---------------------------------------------------------------------------
# Script section: for every query in queries.txt, list the documents of
# docs.txt that contain all of the query's words, then print those documents
# ranked by their angle to the query (smallest angle first).
# ---------------------------------------------------------------------------


def _emit(*parts):
    """Write ``parts`` to stdout on one line, separated by single spaces.

    Reproduces the spacing of a Python 2 ``print a, b`` statement while
    handing ``print`` one parenthesised argument, so the line parses and
    prints identically on Python 2 and Python 3.
    """
    print(" ".join(str(part) for part in parts))


def _docs_with_all_words(query_words, index):
    """Return the ids of the documents that contain every query word.

    A word missing from ``index`` matches no document, so the result is an
    empty list rather than a KeyError. Ids keep the order of the first
    word's postings list.
    """
    if not query_words:
        return []
    matches = list(index.get(query_words[0], ()))
    for word in query_words[1:]:
        postings = set(index.get(word, ()))
        matches = [doc for doc in matches if doc in postings]
    return matches


def _term_vector(words, term_positions, binary=False):
    """Return a float vector with one component per vocabulary term.

    ``term_positions`` maps each vocabulary term to its component index.
    Words outside the vocabulary are ignored. With ``binary=True`` a
    component is 1 if the term occurs at all; otherwise it is the number of
    occurrences of the term in ``words``.
    """
    vector = np.zeros(len(term_positions), dtype=float)
    for word in words:
        position = term_positions.get(word)
        if position is None:
            continue
        if binary:
            vector[position] = 1.0
        else:
            vector[position] += 1.0
    return vector


# One document per line; document ids are 1-based line numbers.
with open('docs.txt', 'r') as f:
    lines = f.read().splitlines()

# Vocabulary of the corpus (word -> frequency).
dictionary = make_dictionary('docs.txt')
_emit("Words in dictionary: ", len(dictionary))

# Word -> ids of the documents containing it.
inverted = make_invertedIndex('docs.txt')

# Give every vocabulary term a fixed component index so that query and
# document vectors are guaranteed to share the same component order.
term_positions = {term: position for position, term in enumerate(dictionary)}

# One query per line.
with open('queries.txt', 'r') as f:
    queries = f.read().splitlines()

# NOTE(review): the vocabulary keys come from make_dictionary (lower-cased,
# punctuation removed), while queries and documents below are split on
# whitespace verbatim. Tokens containing upper-case letters or punctuation
# therefore get no vector component - confirm whether that is intended.
for query in queries:
    query_words = query.split()
    if not query_words:
        continue  # blank line in queries.txt: nothing to search for

    _emit("Query: ", query)
    doc_all_query = _docs_with_all_words(query_words, inverted)
    _emit("Relevant documents: ", ' '.join(map(str, doc_all_query)))

    # The query vector is binary (term present or not); document vectors
    # hold term counts.
    queryVec = _term_vector(query_words, term_positions, binary=True)
    angleDict = {}
    for docs in doc_all_query:
        docVec = _term_vector(lines[docs - 1].split(), term_positions)
        angleDict[docs] = calc_angle(queryVec, docVec)

    # Smallest angle (most similar document) first.
    for r in sorted(angleDict, key=angleDict.get):
        _emit(r, '{:.2f}'.format(round(angleDict[r], 2)))