-
Notifications
You must be signed in to change notification settings - Fork 22
/
Copy pathconfig_template.properties
160 lines (132 loc) · 4.25 KB
/
config_template.properties
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
##### GENERAL CONFIGURATIONS
# location of elasticsearch
elasticsearchLoc=[replace this with the location of your elasticsearch]
# For example: elasticsearchLoc=/my/location/of/siamese/elasticsearch-2.2.0
# elasticsearch's server name (or IP)
server=localhost
# elasticsearch's cluster name. See cluster.name in your $elasticsearchLoc/config/elasticsearch.yml
cluster=stackoverflow
# index name
index=[replace this with a preferred index name]
# For example: index=test
# type name
type=siamese
# location of the input folder. This is the location of the files to be indexed (if command=index),
# or the location of the queries (if command=search).
inputFolder=[replace this with the location of the folder to use as clone search queries]
# For example: inputFolder=/my/location/of/cloplag/tests
# only for GitHub indexing, leave blank if not needed
subInputFolder=
# output folder to store the search results
outputFolder=[replace this with the location to store the search output]
# For example: outputFolder=search_results
# use DFS mode [true, false; default=false]
dfs=true
writeToFile=true
# source code file extension
extension=java
# minimum clone size (lines)
minCloneSize=6
# command to execute [index,search]
command=index
# print out logging data
isPrint=false
# output format [csv = filename, csvfline = filename#start#end, gcf = general clone format]
outputFormat=csvfline
# indexing mode [sequential, bulk]
indexingMode=bulk
# size of bulk insert
bulkSize=4000
# clone granularity [method, file]
parseMode=method
# print the progress of indexing/querying in every x files
printEvery=10000
# recreate the index if it exists [true, false]
recreateIndexIfExists=true
##### DELETE SETTINGS
deleteField=
deleteWildcard=
deleteAmount=1000
##### PARSER + TOKENIZER + NORMALIZER SETTINGS
methodParser=crest.siamese.language.java.JavaMethodParser
tokenizer=crest.siamese.language.java.JavaTokenizer
normalizer=crest.siamese.language.java.JavaNormalizer
##### MULTI-REPRESENTATION SETTINGS
multirep=true
enableRep=true,true,true,true
##### NORMALIZATION MODE
##### Code normalisation for T2 and T3 representation.
### Java
# Combination of x (none), w (words), d (datatypes), j (Java classes), p (Java packages),
# k (keywords), v (values), s (strings), o (operators), e (escape).
# T2 norm mode: dsvw
# T3 norm mode: djkopsvw
### Python 3
# Combination of k (keywords), v (values), s (strings), o (operators), w (words)
# T2 norm mode: vsw
# T3 norm mode: kvsow
normalizerMode=crest.siamese.language.java.JavaNormalizerMode
t2NormMode=dsvw
t3NormMode=djkopsvw
# turn on ngram
isNgram=true
# size of ngram.
# representation T3
ngramSize=4
# representation T2
t2NgramSize=4
# representation T1
t1NgramSize=4
##### QUERY-RELATED SETTINGS
# starting result offset (usually zero)
resultOffset=0
# the size of the results
resultsSize=100
# tfidf, bm25, dfr, ib, lmd (LM Dirichlet), lmj (LM Jelinek-Mercer)
rankingFunction=tfidf
# QUERY REDUCTION SETTINGS
# turn on query reduction [true/false]
queryReduction=true
# reduction percentile for the T3 layer [0, 100]
QRPercentileNorm=10
# reduction percentile for the T2 layer [0, 100]
QRPercentileT2=10
# reduction percentile for the T1 layer [0, 100]
QRPercentileT1=10
# reduction percentile for the original (T0) layer [0, 100]
QRPercentileOrig=10
# boosting for T3 layer
normBoost=4
# boosting for T2 layer
t2Boost=4
# boosting for T1 layer
t1Boost=4
# boosting for T0 layer
origBoost=1
# ignore the query clones
ignoreQueryClones=true
##### LICENSE EXTRACTION
# extract license [true, false]
license=false
# license extractor [ninka, regexp]
licenseExtractor=regexp
##### EXPERIMENT CONFIGURATIONS
# ONLY USED FOR THE EXPERIMENTS OF SIAMESE
# elasticsearch similarity function + ngram + normalisation [or both]
similarityMode=tfidf_text_both
# prefix of the clone cluster file name [cloplag/soco]
cloneClusterFile=soco
# IR error measure [arp/map]
errorMeasure=map
# delete the index after every run?
deleteIndexAfterUse=true
##### SIMILARITY
# compute similarity of the results [fuzzywuzzy, tokenratio, none]
computeSimilarity=tokenratio
# the similarity threshold for the four representations [T1,T2,T3,T4] respectively
simThreshold=50%,60%,70%,80%
# GitHub indexing? (automatically add URL)
github=false
##### DEPRECATED
# (DEPRECATED) no. of total documents in the index
totalDocuments=100