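"""simple_train_test.py

Baseline train/test run for sensitive-paragraph classification: paragraphs
read from a sensitive and a non-sensitive file are year-masked, converted to
LDA topic distributions, and used to fit a logistic regression on a growing
training pool, printing held-out accuracy after each refit.
"""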
import re

import numpy as np
from os.path import isfile
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

n_topics = 100      # number of LDA topics used as document features
n_features = 1000   # vocabulary size kept by the count vectorizer

def preprocessing(paragraphs):
    """Mask standalone numbers that could be years with a placeholder token."""
    processed = []
    for paragraph in paragraphs:
        # Replace any standalone 1-3 digit number, or 4-digit number up to
        # 2999, that is not embedded in a longer digit run.
        processed.append(re.sub(r'(?<![0-9])(?:[0-2][0-9]{3}|[0-9]{1,3})(?![0-9])',
                                "possibleYear", paragraph))
    return processed
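
# For example (hypothetical input; the files read below supply the real
# paragraphs):
#   preprocessing(["Redacted in 1994, see section 7."])
#   -> ["Redacted in possibleYear, see section possibleYear."]
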
def get_paragraphs():
    """Read the paragraphs, one per line, from the two input files."""
    docs_non = []
    docs_sens = []
    non_sensitive_file = "test-non"
    sensitive_file = "test-sensitive"
    if isfile(non_sensitive_file):
        with open(non_sensitive_file, 'r') as doc:
            docs_non.extend(doc.read().split('\n'))
    if isfile(sensitive_file):
        with open(sensitive_file, 'r') as doc:
            docs_sens.extend(doc.read().split('\n'))
    docs_non = preprocessing(docs_non)
    docs_sens = preprocessing(docs_sens)
    return docs_non, docs_sens
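
# "test-non" and "test-sensitive" are expected in the working directory; a
# missing file simply yields an empty list for that class.
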
def make_dataset():
    """Build LDA topic features and a train/test split over both classes."""
    docs_non, docs_sen = get_paragraphs()
    # Sensitive paragraphs come first in the combined corpus, so the first
    # len(docs_sen) entries are labelled 1 (sensitive) and the rest 0.
    mixed = np.append(docs_sen, docs_non)
    labels = [0] * len(mixed)
    for i in range(len(docs_sen)):
        labels[i] = 1
    # Bag-of-words counts over the n_features most frequent terms.
    tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2,
                                    max_features=n_features,
                                    stop_words='english')
    tf = tf_vectorizer.fit_transform(mixed)
    # n_components replaced the deprecated n_topics argument in recent
    # scikit-learn releases.
    lda = LatentDirichletAllocation(n_components=n_topics, max_iter=5,
                                    learning_method='online',
                                    learning_offset=50.,
                                    random_state=0)
    # Each paragraph is represented by its topic distribution.
    X = lda.fit_transform(tf)
    X_train, X_test, y_train, y_test = train_test_split(X, labels,
                                                        random_state=12,
                                                        test_size=0.33)
    return X_train, X_test, y_train, y_test
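
# A minimal smoke test of the feature pipeline (assumes at least one input
# file was found):
#   X_train, X_test, y_train, y_test = make_dataset()
#   assert X_train.shape[1] == n_topics   # one LDA topic weight per column
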
def main():
    X_train, X_test, y_train, y_test = make_dataset()
    logRegModel = LogisticRegression()
    temp_X = np.zeros(X_train.shape)
    temp_y = []
    # Feed training examples in one at a time; once both classes are present,
    # refit on everything seen so far and report held-out accuracy.
    for i in range(X_train.shape[0]):
        temp_X[i, :] = X_train[i, :]
        temp_y.append(y_train[i])
        if len(set(temp_y)) >= 2:
            logRegModel.fit(temp_X[0:i + 1, :], temp_y)
            prediction = logRegModel.predict(X_test)
            print(accuracy_score(y_test, prediction))
    input("Press enter to continue...")


if __name__ == '__main__':
    main()