-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathMostCommonWord.py
119 lines (108 loc) · 4.66 KB
/
MostCommonWord.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
# -*- coding: utf-8 -*-
"""
Created on Thu Nov 3 13:15:51 2016
@author: acp16mh
"""
import re, sys
from random import randint
# Scores two baseline guessers against a question file and writes one CSV row
# per question to ChildTestResults.csv.
#
# 'TestTrain.txt' is consumed in groups of 22 lines:
#   lines 1-20 : context text, each line prefixed with its own line number;
#   line 21    : the question line; it carries the given answer (a word
#                followed by at least two whitespace characters) and ends with
#                a '|'-separated list of candidate answers;
#   line 22    : separator; resets the per-document state.
#
# Baseline 1 picks the candidate that occurs most often in the 20 context
# lines; baseline 2 picks a candidate uniformly at random.

# Raw strings keep the backslash classes out of Python's string-escape handling.
wordSearch = re.compile(r"\w+")
AnswerSearch = re.compile(r"[(\w+)\|]+\w+")
CorrectAnswer = re.compile(r"\s+(\w+)\s\s+")

docwords = {}          # lower-cased context word -> occurrences in this document
possibleAnswers = []
words = []
count = 0              # position of the current line inside its 22-line group
DocCount = 1           # 1-based number of the document being read
DocsScored = 0         # number of question lines actually evaluated
NbFrqCorrect = 0
NbRandCorrect = 0

# Will fail here if you have the file open
with open("ChildTestResults.csv", "w") as ResultsFile:
    ResultsFile.write("Document Number, Correct Answer, Most Frequent Answer, Most Frequent Correct?, Random Answer, Random Number Correct? \n")
    with open('TestTrain.txt', "r") as page:
        for line in page:
            count += 1
            if count <= 20:
                # Context line: count how often each word appears.
                words = wordSearch.findall(line)
                # The lines start with their number; drop only that leading
                # token so the same number inside the text is still counted.
                if words and words[0] == str(count):
                    words = words[1:]
                for word in words:
                    word = word.lower()
                    docwords[word] = docwords.get(word, 0) + 1
            elif count == 21:
                print(line)
                # Tokens of the question line. The first token is the line
                # number and the last is the candidate list; the one before it
                # is presumably the given answer, so the sentence is the rest.
                WordsInLine = AnswerSearch.findall(line)
                question_words = WordsInLine[1:-2]
                for token in question_words:
                    print(token)
                QSentence = "".join(token + " " for token in question_words)
                print(QSentence)

                # The answer given in the text.
                GivenAnswerS = "".join(CorrectAnswer.findall(line))
                row = [str(DocCount), GivenAnswerS]

                # The last token holds every candidate answer, '|'-separated.
                AllPossAns = wordSearch.findall(WordsInLine[-1])
                print(AllPossAns)

                # Most frequent method. docwords keys are lower-cased, so the
                # lookup must be too, otherwise capitalised candidates never
                # match. Strict '>' keeps the first candidate on ties.
                answercount = 0
                FeqAnswer = None
                for word in AllPossAns:
                    occurrences = docwords.get(word.lower(), 0)
                    if occurrences > answercount:
                        answercount = occurrences
                        FeqAnswer = word
                if answercount > 0:
                    row.append(str(FeqAnswer))
                    if FeqAnswer == GivenAnswerS:
                        row.append("1")
                        NbFrqCorrect += 1
                    else:
                        row.append("0")
                else:
                    # None of the possible answers appeared in the text.
                    row.extend(["NA", "NA"])

                # Random number method.
                RandAnswer = AllPossAns[randint(0, len(AllPossAns) - 1)]
                row.append(str(RandAnswer))
                if RandAnswer == GivenAnswerS:
                    row.append("1")
                    NbRandCorrect += 1
                else:
                    row.append("0")

                # Writing the data to the file.
                ExportedData = ",".join(row) + "\n"
                ResultsFile.write(ExportedData)
                DocsScored += 1
            elif count == 22:
                # Separator line: reset the per-document state.
                print(DocCount, "documents done")
                count = 0
                DocCount += 1
                docwords.clear()
            else:
                # It should just never reach this point.
                print("Houston we have a problem")
                sys.exit(1)

# Divide by the number of questions actually scored; DocCount has already been
# advanced past the last document whenever the file ends with a separator line.
FrqProb = NbFrqCorrect / DocsScored if DocsScored else 0.0
RandProb = NbRandCorrect / DocsScored if DocsScored else 0.0
print("The most frequent word got", NbFrqCorrect,"which gives the probablity of being correct as", FrqProb)
print("The random word got", NbRandCorrect, "which gives the probablity of being correct as", RandProb)
print("Thats all folks")