-
Notifications
You must be signed in to change notification settings - Fork 22
/
Copy pathconfig_template.properties
160 lines (132 loc) · 4.25 KB
/
config_template.properties
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
##### GENERAL CONFIGURATIONS
# location of elasticsearch
elasticsearchLoc=[replace this with the location of your elasticsearch]
# For example: elasticsearchLoc=/my/location/of/siamese/elasticsearch-2.2.0
# elasticsearch's server name (or IP)
server=localhost
# elasticsearch's cluster name. See cluster.name in your $elasticsearchLoc/config/elasticsearch.yml
cluster=stackoverflow
# index name
index=[replace this with a preferred index name]
# For example: index=test
# type name
type=siamese
# location of the input folder. This is the location of the files to be indexed (if command=index),
# or the location of the queries (if command=search).
inputFolder=[replace this with the location of the folder to use as clone search queries]
# For example: inputFolder=/my/location/of/cloplag/tests
# only for GitHub indexing, leave blank if not needed
subInputFolder=
# output folder to store the search results
outputFolder=[replace this with the location to store the search output]
# For example: outputFolder=search_results
# use DFS mode [true, false; default=false]
dfs=true
writeToFile=true
# source code file extension
extension=java
# minimum clone size (lines)
minCloneSize=6
# command to execute [index,search]
command=index
# print out logging data
isPrint=false
# output format [csv = filename, csvfline = filename#start#end, gcf = general clone format]
outputFormat=csvfline
# indexing mode [sequential, bulk]
indexingMode=bulk
# size of bulk insert
bulkSize=4000
# clone granularity [method, file]
parseMode=method
# print the progress of indexing/querying in every x files
printEvery=10000
# recreate the index if it exists [true, false]
recreateIndexIfExists=true
##### DELETE SETTINGS
deleteField=
deleteWildcard=
deleteAmount=1000
##### PARSER + TOKENIZER + NORMALIZER SETTINGS
methodParser=crest.siamese.language.java.JavaMethodParser
tokenizer=crest.siamese.language.java.JavaTokenizer
normalizer=crest.siamese.language.java.JavaNormalizer
##### MULTI-REPRESENTATION SETTINGS
multirep=true
enableRep=true,true,true,true
##### NORMALIZATION MODE
##### Code normalisation for T2 and T3 representation.
### Java
# Combination of x (none), w (words), d (datatypes), j (Java classes), p (Java packages),
# k (keywords), v (values), s (strings), o (operators), e (escape).
# T2 norm mode: dsvw
# T3 norm mode: djkopsvw
### Python 3
# Combination of k (keywords), v (values), s (strings), o (operators), w (words)
# T2 norm mode: vsw
# T3 norm mode: kvsow
normalizerMode=crest.siamese.language.java.JavaNormalizerMode
t2NormMode=dsvw
t3NormMode=djkopsvw
# turn on ngram
isNgram=true
# size of ngram.
# representation T3
ngramSize=4
# representation T2
t2NgramSize=4
# representation T1
t1NgramSize=4
##### QUERY-RELATED SETTINGS
# starting result offset (usually zero)
resultOffset=0
# the size of the results
resultsSize=100
# tfidf, bm25, dfr, ib, lmd (LM Dirichlet), lmj (LM Jelinek-Mercer)
rankingFunction=tfidf
# QUERY REDUCTION SETTINGS
# turn on query reduction [true/false]
queryReduction=true
# reduction percentile for the T3 layer [0, 100]
QRPercentileNorm=10
# reduction percentile for the T2 layer [0, 100]
QRPercentileT2=10
# reduction percentile for the T1 layer [0, 100]
QRPercentileT1=10
# reduction percentile for the original (T0) layer [0, 100]
QRPercentileOrig=10
# boosting for T3 layer
normBoost=4
# boosting for T2 layer
t2Boost=4
# boosting for T1 layer
t1Boost=4
# boosting for T0 layer
origBoost=1
# ignore the query clones
ignoreQueryClones=true
##### LICENSE EXTRACTION
# extract license [true, false]
license=false
# license extractor [ninka, regexp]
licenseExtractor=regexp
##### EXPERIMENT CONFIGURATIONS
# ONLY USED FOR THE EXPERIMENTS OF SIAMESE
# elasticsearch similarity function + ngram + normalisation [or both]
similarityMode=tfidf_text_both
# prefix of the clone cluster file name [cloplag/soco]
cloneClusterFile=soco
# IR error measure [arp/map]
errorMeasure=map
# delete the index after every run?
deleteIndexAfterUse=true
##### SIMILARITY
# compute similarity of the results [fuzzywuzzy, tokenratio, none]
computeSimilarity=tokenratio
# the similarity threshold for the four representations [T1,T2,T3,T4] respectively
simThreshold=50%,60%,70%,80%
# GitHub indexing? (automatically add URL)
github=false
##### DEPRECATED
# (DEPRECATED) no. of total documents in the index
totalDocuments=100