Sei sulla pagina 1di 8

IR Assignment 1 21/08/18, 10(35 PM

In [1]: import glob


docs=[]
content=[]
doc_list=[]
i=0
directory= "../Information Retrieval/"
for files in glob.iglob(directory + "doc*.txt"):
infile = open(files)
a=infile.readline().split(' ')
doc_list.append(i)
i+=1
content+=a
print(a)
docs.append(a)
#for k in range (0,len(a)):
#print(a[0])
infile.close()
#The following printout is the corresponding doc no i.e. doc 0,1,2,3.

['july', 'new', 'home', 'sales', 'rise']


['new', 'home', 'sales', 'top', 'forecasts']
['increase', 'in', 'home', 'sales', 'in', 'july']
['home', 'sales', 'rise', 'in', 'july']

In [2]: #content contains all the words of the corpus


print(content)

['july', 'new', 'home', 'sales', 'rise', 'new', 'home', 'sales', 'to


p', 'forecasts', 'increase', 'in', 'home', 'sales', 'in', 'july', 'h
ome', 'sales', 'rise', 'in', 'july']

In [3]: #list of lists i.e. words occuring in different documents


print(docs)

[['july', 'new', 'home', 'sales', 'rise'], ['new', 'home', 'sales',


'top', 'forecasts'], ['increase', 'in', 'home', 'sales', 'in', 'july
'], ['home', 'sales', 'rise', 'in', 'july']]

In [4]: print(doc_list)

[0, 1, 2, 3]

http://localhost:8888/notebooks/mlworkshop/Information%20Retrieval/IR%20Assignment%201.ipynb# Page 1 of 8
IR Assignment 1 21/08/18, 10(35 PM

In [5]: #dictionary for inverted index


# the data point at index 0 for every word in inverted_index is the freque
#e.g. in('rise', [2, 0, 3])~ 2 is frequency
#and 0,3 are the documents in which it occurs.
inv_indx = {i:[] for i in content}
for word in content:
if not inv_indx[word]:
cnt=0
for i in range(len(docs)):
if word in docs[i]:
inv_indx[word].append(i)
cnt+=1
inv_indx[word].insert(0,cnt)

In [6]: #print inverted index


for k,v in inv_indx.items():
print(k,v)
This is the frequency
('in', [2, 2, 3]) This is postings list
('top', [1, 1])
('rise', [2, 0, 3])
('sales', [4, 0, 1, 2, 3])
('forecasts', [1, 1])
('increase', [1, 2])
('home', [4, 0, 1, 2, 3])
('new', [2, 0, 1])
('july', [3, 0, 2, 3])

http://localhost:8888/notebooks/mlworkshop/Information%20Retrieval/IR%20Assignment%201.ipynb# Page 2 of 8
IR Assignment 1 21/08/18, 10(35 PM

In [7]: #AND - merge algorithm


def intersect(inv_indx,w1,w2):
answer=[]
if w1 in inv_indx:
p1=inv_indx[w1]
print(p1)
else: print(w1+" not present in any doc")
if w2 in inv_indx:
p2=inv_indx[w2]
print(p2)
else: print(w2+" not present in any doc")
i=1
j=1
num=min(len(p1),len(p2))
num=num-1
while i<=num or j<= num:
print(p1[i],p2[j])
if p1[i]==p2[j]:
answer.append(p1[i])
i+=1
j+=1
elif p1[i]<p2[j]:
i+=1
else: j+=1
return answer

In [8]: #question "home and july"


ans=intersect(inv_indx,'home','july')
print(ans)

[4, 0, 1, 2, 3]
[3, 0, 2, 3]
(0, 0)
(1, 2)
(2, 2)
(3, 3)
[0, 2, 3]

http://localhost:8888/notebooks/mlworkshop/Information%20Retrieval/IR%20Assignment%201.ipynb# Page 3 of 8
IR Assignment 1 21/08/18, 10(35 PM

In [9]: def OR (w1,w2):


answer=[]
if w1 in inv_indx:
p1=inv_indx[w1]
print(p1)
else: print(w1+" not present in any doc")
if w2 in inv_indx:
p2=inv_indx[w2]
print(p2)
else: print(w2+" not present in any doc")
i=1
j=1
num=min(len(p1),len(p2))
num=num-1
while i<=num or j<= num:
print(p1[i],p2[j])
if p1[i]==p2[j]:
answer.append(p1[i])
i+=1
j+=1
elif p1[i]<p2[j]:
answer.append(p1[i])
i+=1
else: answer.append(p2[j]);j+=1

if i<j: answer.append(p2[j:])
else: answer.append(p1[i:])
return answer

In [10]: #question"sales or forecasts


ans=OR('sales','forecasts')
print(ans)

[4, 0, 1, 2, 3]
[1, 1]
(0, 1)
(1, 1)
[0, 1, [2, 3]]

http://localhost:8888/notebooks/mlworkshop/Information%20Retrieval/IR%20Assignment%201.ipynb# Page 4 of 8
IR Assignment 1 21/08/18, 10(35 PM

In [11]: def minus (w1,w2):


answer=[]
if w1 in inv_indx:
p1=inv_indx[w1]
print(p1)
else: print(w1+" not present in any doc")
if w2 in inv_indx:
p2=inv_indx[w2]
print(p2)
else: print(w2+" not present in any doc")
i=1
j=1
num=min(len(p1),len(p2))
num=num-1
while i<=num or j<= num:
print(p1[i],p2[j])
if p1[i]==p2[j]:
i+=1
j+=1
elif p1[i]<p2[j]:
answer.append(p1[i])
i+=1
else: answer.append(p2[j]);j+=1

answer.append(p1[i:])
return answer

In [12]: #question "sales-july"


ans=minus('sales','july')
print(ans)

[4, 0, 1, 2, 3]
[3, 0, 2, 3]
(0, 0)
(1, 2)
(2, 2)
(3, 3)
[1, []]

http://localhost:8888/notebooks/mlworkshop/Information%20Retrieval/IR%20Assignment%201.ipynb# Page 5 of 8
IR Assignment 1 21/08/18, 10(35 PM

In [14]: def NOT (w1):


answer=[]
if w1 in inv_indx:
p1=inv_indx[w1]
print(p1)
else: print(w1+" not present in any doc")
num=len(p1)-1
i=1
temp=doc_list
while i<= num:
if p1[i] in doc_list:
temp.remove(p1[i])
i+=1
answer=temp
return answer

In [15]: #question not july


print(NOT('july'))

[3, 0, 2, 3]
[1]

In [16]: l1=[4,6,10,12,14,16,18,20,22,32,47,81,120,122,157,180]
l2=[47]

In [17]: #AND - merge algorithm


import math
def intersect_skip(p1,p2):
answer=[]
i=0
j=0
num=max(len(p1),len(p2))
print(num)
skip=int(math.sqrt(num))
while i<=num:
print(p1[i],p2[j])
if p1[i]==p2[j]:
answer.append(i)
i+=1
return answer
elif p1[i]<p2[j]:
i+=skip
else: i-=1
return answer

http://localhost:8888/notebooks/mlworkshop/Information%20Retrieval/IR%20Assignment%201.ipynb# Page 6 of 8
IR Assignment 1 21/08/18, 10(35 PM

In [18]: print(intersect_skip(l1,l2))
#the number of comparisons here with skip pointers is 6
#due to the fact that lists are sorted and we can easily skip
#and adjust the pointer in accordance with the value being searched.

16
(4, 47)
(14, 47)
(22, 47)
(120, 47)
(81, 47)
(47, 47)
[10] The index of first occurence

In [19]: #AND - merge algorithm


import math
def intersect_normal(p1,p2):
answer=[]
i=0
j=0
num=max(len(p1),len(p2))
print(num)
skip=int(math.sqrt(num))
while i<=num:
print(p1[i],p2[j])
if p1[i]==p2[j]:
answer.append(i)
i+=1
return answer
elif p1[i]<p2[j]:
i+=1
else: i-=1
return answer

In [20]: print(intersect_normal(l1,l2))
#here 11 comparisons needed since we need to move in linear fashion
# no use of the soretd lists is made of

16
(4, 47)
(6, 47)
(10, 47)
(12, 47)
(14, 47)
(16, 47)
(18, 47)
(20, 47)
(22, 47)
(32, 47)
(47, 47)
[10]

http://localhost:8888/notebooks/mlworkshop/Information%20Retrieval/IR%20Assignment%201.ipynb# Page 7 of 8
IR Assignment 1 21/08/18, 10(35 PM

In [ ]:

http://localhost:8888/notebooks/mlworkshop/Information%20Retrieval/IR%20Assignment%201.ipynb# Page 8 of 8

Potrebbero piacerti anche