-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathclassification.py
59 lines (45 loc) · 2.65 KB
/
classification.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
import preProcessing
import setProbabilities
import review
import features
import re
import math
# Driver script.
# Runs preprocessing, calculates the conditional probability tables from the
# training data, then runs the testing phase on both the training and the
# testing sets.
# Preprocessing: allPreProcess() is unpacked into the training feature sets,
# the training vocabulary size, and the testing feature sets.
trainingFeatureSets, trainVocabSize, testFeatureSets = preProcessing.allPreProcess()
# Training phase: build the probabilities used for classification from the
# training feature sets and vocabulary size only (the test sets are not passed).
trainingSetProbabilities = setProbabilities.setProbs(trainingFeatureSets, trainVocabSize)
# Carries out classification of an inputted feature set
def getTrainingResults(featureSets, vocabSize, trainingSetProbabilities):
    """Classify every review in featureSets and return the fraction predicted correctly.

    Args:
        featureSets: Sized iterable of review objects. Each exposes ``vector``,
            an indexable whose entries ``0 .. vocabSize - 1`` are word flags
            (compared against 1) and whose entry at index ``vocabSize`` is the
            actual class label (1 for positive, 0 for negative).
        vocabSize: Number of vocabulary features scored for each review.
        trainingSetProbabilities: Object exposing ``cLTrueProb``,
            ``cLFalseProb`` and ``featureProbArray``. Each element of
            ``featureProbArray`` exposes ``PTrueGivenTrue``,
            ``PTrueGivenFalse``, ``PFalseGivenTrue`` and ``PFalseGivenFalse``.

    Returns:
        float: Number of correct predictions divided by ``len(featureSets)``,
        or 0.0 when ``featureSets`` is empty.
    """
    numReviews = len(featureSets)
    if numReviews == 0:
        # Nothing to classify; returning early avoids a ZeroDivisionError.
        return 0.0

    # Loop-invariant lookup, hoisted out of the per-review / per-word loops.
    featureProbs = trainingSetProbabilities.featureProbArray

    numCorrectPredictions = 0  # Number of accurate predictions made
    for reviewSet in featureSets:  # Iterate through every review
        vector = reviewSet.vector
        # Each class score starts from the stored class-label value.
        scoreTrue = trainingSetProbabilities.cLTrueProb
        scoreFalse = trainingSetProbabilities.cLFalseProb
        # NOTE(review): the per-word terms are summed, not multiplied, which
        # presumably means setProbabilities stores log-probabilities -- confirm.
        for j in range(vocabSize):  # Iterate through every vocab word
            wordProbs = featureProbs[j]
            if vector[j] == 1:
                # Word flag is set: use the "feature true" terms.
                scoreTrue = scoreTrue + wordProbs.PTrueGivenTrue
                scoreFalse = scoreFalse + wordProbs.PTrueGivenFalse
            else:
                # Word flag is not set: use the "feature false" terms.
                scoreTrue = scoreTrue + wordProbs.PFalseGivenTrue
                scoreFalse = scoreFalse + wordProbs.PFalseGivenFalse

        # Negative is predicted only on a strictly larger score; ties go to positive.
        predictedLabel = 0 if scoreFalse > scoreTrue else 1
        if vector[vocabSize] == predictedLabel:
            numCorrectPredictions += 1

    # Fraction of correctly guessed reviews
    return float(numCorrectPredictions) / float(numReviews)
# Testing phase: accuracy on the training set and on the testing set, both
# classified with the probabilities built from the training set.
trainTrainPredictions = getTrainingResults(trainingFeatureSets, trainVocabSize, trainingSetProbabilities)
trainTestPredictions = getTrainingResults(testFeatureSets, trainVocabSize, trainingSetProbabilities)
# Print results to results.txt. The with-block closes the file even if a
# write raises, so the output is flushed to disk.
with open("results.txt", "w") as f:
    f.write('Accuracy for trainingSet in training and testing: ')
    f.write(str(trainTrainPredictions))
    f.write("\n")
    f.write('Accuracy for trainingSet in training and testSet in testing: ')
    f.write(str(trainTestPredictions))