-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathclusterPages.java
161 lines (135 loc) · 5.23 KB
/
clusterPages.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
/**
 * clusterPages
 *
 * Runs Latent Dirichlet Allocation (LDA) over the pages of a document and
 * clusters them by inferred topic.
 *
 * @author kalyan
 * @version 0.01, Dec 8, 2012
 */
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Random;
import java.util.Set;
import com.aliasi.cluster.LatentDirichletAllocation;
import com.aliasi.symbol.MapSymbolTable;
import com.aliasi.symbol.SymbolTable;
import com.aliasi.symbol.SymbolTableCompiler;
import com.aliasi.tokenizer.EnglishStopTokenizerFactory;
import com.aliasi.tokenizer.LowerCaseTokenizerFactory;
import com.aliasi.tokenizer.ModifyTokenTokenizerFactory;
import com.aliasi.tokenizer.RegExTokenizerFactory;
import com.aliasi.tokenizer.StopTokenizerFactory;
import com.aliasi.tokenizer.TokenizerFactory;
public class clusterPages {
/**
*
* sym is the symbolTable which maps each symbol(words) to a token(number)
*/
static SymbolTable sym=new MapSymbolTable();
/**
*
* articleText will have the input corpus with each element being contents of single page
*/
static CharSequence articleText[];
/**
*
* LDASample creates doc Topic Matrix and Word Topic Matrix from the input corpus
*@param file filename of the pdf to be summarized
*@return sample object of GibbsSample which contains 2 probability matrices
*/
static LatentDirichletAllocation.GibbsSample LDASample(String file){
ArrayList<CharSequence> pdfPages=pdfExtractor.fileRead(file);
if(pdfPages!=null){
articleText=pdfPages.toArray(new CharSequence[pdfPages.size()]);
int [][]docTokens=LatentDirichletAllocation.tokenizeDocuments(articleText,WORMBASE_TOKENIZER_FACTORY,sym,1);
/*reporting handler is not necessary for us to display results*/
unnecessaryReportingHandler handler= new unnecessaryReportingHandler(sym);
short numTopics=6;
double alpha=0.1;
double beta=0.01;
int numIterations=100;
Random randomseed=new Random(35L);
LatentDirichletAllocation.GibbsSample sample = LatentDirichletAllocation.gibbsSampler(docTokens,
numTopics, alpha, beta, 0, 1, numIterations, randomseed, handler);
return sample;
}
return null;
}
/*tokenizer copied from lingpipe source*/
/**
* tokenizer copied from lingpipe demo code
* It converts all letters to smallcase, removes stop words
*/
static final TokenizerFactory wormbaseTokenizerFactory() {
TokenizerFactory factory = BASE_TOKENIZER_FACTORY;
factory = new NonAlphaStopTokenizerFactory(factory);
factory = new LowerCaseTokenizerFactory(factory);
factory = new EnglishStopTokenizerFactory(factory);
factory = new StopTokenizerFactory(factory,STOPWORD_SET);
factory = new StemTokenizerFactory(factory);
return factory;
}
/**
* returns if a word is valid stem, Copied from Lingpipe demo
*/
static boolean validStem(String stem) {
if (stem.length() < 2) return false;
for (int i = 0; i < stem.length(); ++i) {
char c = stem.charAt(i);
for (int k = 0; k < VOWELS.length; ++k)
if (c == VOWELS[k])
return true;
}
return false;
}
static final TokenizerFactory BASE_TOKENIZER_FACTORY
= new RegExTokenizerFactory("[\\x2Da-zA-Z0-9]+"); // letter or digit or hyphen (\x2D)
static final char[] VOWELS
= new char[] { 'a', 'e', 'i', 'o', 'u', 'y' };
static final String[] STOPWORD_LIST=stopWordList.stopwords;
static final Set<String> STOPWORD_SET= new HashSet<String>(Arrays.asList(STOPWORD_LIST));
static final TokenizerFactory WORMBASE_TOKENIZER_FACTORY
= wormbaseTokenizerFactory();
// removes tokens that have no letters
static class NonAlphaStopTokenizerFactory extends ModifyTokenTokenizerFactory {
static final long serialVersionUID = -3401639068551227864L;
public NonAlphaStopTokenizerFactory(TokenizerFactory factory) {
super(factory);
}
public String modifyToken(String token) {
return stop(token) ? null : token;
}
public boolean stop(String token) {
if (token.length() < 2) return true;
for (int i = 0; i < token.length(); ++i)
if (Character.isLetter(token.charAt(i)))
return false;
return true;
}
}
static class StemTokenizerFactory extends ModifyTokenTokenizerFactory {
static final long serialVersionUID = -6045422132691926248L;
public StemTokenizerFactory(TokenizerFactory factory) {
super(factory);
}
static final String[] SUFFIXES = new String[] {
"ss", "ies", "sses", "s" // s must be last as its weaker
};
public String modifyToken(String token) {
for (String suffix : SUFFIXES) {
if (token.endsWith(suffix)) {
String stem = token.substring(0,token.length()-suffix.length());
return validStem(stem) ? stem : token;
}
}
return token;
}
}
/**
* Just to test code
* @param args
*/
public static void main(String args[]){
LatentDirichletAllocation.GibbsSample s=LDASample("/home/kalyan/Downloads/ror.pdf");
}
}