Skip to content

Commit

Permalink
support Groovy 5 and calculate cluster predictions
Browse files Browse the repository at this point in the history
  • Loading branch information
paulk-asert committed May 27, 2024
1 parent 89d1bea commit d3bd299
Show file tree
Hide file tree
Showing 3 changed files with 35 additions and 29 deletions.
1 change: 1 addition & 0 deletions gradle.properties
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ flinkMlVersion = 2.3.0
flinkStatefunVersion = 3.2.0
groovy3Version = 3.0.21
groovy4Version = 4.0.21
groovy5Version = 5.0.0-alpha-8
igniteVersion = 2.16.0
igniteMlVersion = 2.15.0
jacksonVersion = 2.17.1
Expand Down
3 changes: 1 addition & 2 deletions subprojects/WhiskeySpark/build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -31,11 +31,10 @@ tasks.named('run').configure {
}

dependencies {
implementation "org.apache.groovy:groovy:$groovy4Version"
implementation "org.apache.groovy:groovy:$groovy5Version"
implementation "org.apache.spark:spark-mllib_$sparkVariant:$sparkVersion"
implementation "org.apache.spark:spark-sql_$sparkVariant:$sparkVersion"
implementation "com.fasterxml.jackson:jackson-bom:$jacksonVersion"
runtimeOnly "org.apache.spark:spark-core_$sparkVariant:$sparkVersion"
}

tasks.register('versionInfo') {
Expand Down
60 changes: 33 additions & 27 deletions subprojects/WhiskeySpark/src/main/groovy/WhiskeySpark.groovy
Original file line number Diff line number Diff line change
Expand Up @@ -13,44 +13,50 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
//@Grab("com.fasterxml.jackson.module:jackson-module-scala_2.11:2.11.1;transitive=false")
//@Grab('org.apache.spark:spark-sql_2.11:2.4.7')
//@Grab('org.apache.spark:spark-mllib_2.11:2.4.7')
//@GrabExclude("commons-codec:commons-codec:1.10")
//@GrabExclude("javax.xml.stream:stax-api:1.0-2")
//@Grab("commons-io:commons-io:2.10.0")

import org.apache.spark.ml.clustering.KMeans
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.sql.Dataset
import org.apache.spark.sql.Row

import static org.apache.spark.sql.SparkSession.builder

static main(args) {
def spark = builder().config('spark.master', 'local[8]').appName('Whiskey').orCreate
spark.sparkContext().logLevel = 'WARN'
def file = WhiskeySpark.classLoader.getResource('whiskey.csv').file
int k = 5
Dataset<Row> rows = spark.read().format('com.databricks.spark.csv')
.options('header': 'true', 'inferSchema': 'true').load(file)
//def colNames = rows.columns().toList().minus(extras).parallelStream().toArray(String[]::new)
var spark = builder().config('spark.master', 'local[8]').appName('Whiskey').orCreate
spark.sparkContext().logLevel = 'WARN' // quieten logging once we've started
var file = WhiskeySpark.classLoader.getResource('whiskey.csv').file

var rows = spark.read().format('com.databricks.spark.csv')
.options(header: 'true', inferSchema: 'true').load(file)
String[] colNames = rows.columns().toList() - ['RowID', 'Distillery']
def assembler = new VectorAssembler(inputCols: colNames, outputCol: 'features')
Dataset<Row> dataset = assembler.transform(rows)
def clusterer = new KMeans(k: k, seed: 1L)
def model = clusterer.fit(dataset)
var assembler = new VectorAssembler(inputCols: colNames, outputCol: 'features')
var dataset = assembler.transform(rows)
var kmeans = new KMeans(k: 5, seed: 1L)
var model = kmeans.fit(dataset)
println '\nCluster centers:'
model.clusterCenters().each { println it.values().collect { sprintf '%.2f', it }.join(', ') }
println()
spark.sparkContext().logLevel = 'INFO'
var result = model.transform(dataset)
var clusters = result.toLocalIterator().collect { row ->
[row.getAs('prediction'), row.getAs('Distillery')]
}.groupBy { it[0] }.collectValues { it*.get(1) }
clusters.each { k, v -> println "Cluster$k: ${v.join(', ')}"}
println()
spark.sparkContext().logLevel = 'INFO' // logging back to normal
spark.stop()
}
/*
24/05/26 10:55:38 INFO SparkContext: Running Spark version 3.5.1
...
Cluster centers:
1.73, 2.35, 1.58, 0.81, 0.19, 1.15, 1.42, 0.81, 1.23, 1.77, 1.23, 1.31
2.00, 1.00, 3.00, 0.00, 0.00, 0.00, 3.00, 1.00, 0.00, 2.00, 2.00, 2.00
2.86, 2.38, 1.52, 0.05, 0.00, 1.95, 1.76, 2.05, 1.81, 2.05, 2.19, 1.71
1.53, 2.38, 1.06, 0.16, 0.03, 1.09, 1.00, 0.50, 1.53, 1.75, 2.13, 2.28
3.67, 1.50, 3.67, 3.33, 0.67, 0.17, 1.67, 0.50, 1.17, 1.33, 1.17, 0.17
*/
2.89, 2.42, 1.53, 0.05, 0.00, 1.84, 1.58, 2.11, 2.11, 2.11, 2.26, 1.58
1.45, 2.35, 1.06, 0.26, 0.06, 0.84, 1.13, 0.45, 1.26, 1.65, 2.19, 2.10
1.83, 3.17, 1.00, 0.33, 0.17, 1.00, 0.67, 0.83, 0.83, 1.50, 0.50, 1.50
3.00, 1.50, 3.00, 2.80, 0.50, 0.30, 1.40, 0.50, 1.50, 1.50, 1.30, 0.50
1.85, 2.20, 1.70, 0.40, 0.10, 1.85, 1.80, 1.00, 1.35, 2.00, 1.40, 1.85
Cluster0: Aberfeldy, Aberlour, Auchroisk, Balmenach, BenNevis, Benrinnes, BlairAthol, Dailuaine, Dalmore, Edradour, Glendronach, Glendullan, Glenfarclas, Glenrothes, Longmorn, Macallan, Mortlach, RoyalLochnagar, Strathisla
Cluster1: AnCnoc, Auchentoshan, Aultmore, Balblair, Benriach, Bladnoch, Bunnahabhain, Cardhu, Craigganmore, Dufftown, GlenElgin, GlenGrant, GlenKeith, GlenMoray, Glenallachie, Glenfiddich, Glengoyne, Glenkinchie, Glenlossie, Glenmorangie, Linkwood, Loch Lomond, Mannochmore, RoyalBrackla, Speyside, Strathmill, Tamdhu, Tamnavulin, Teaninich, Tobermory, Tullibardine
Cluster3: Ardbeg, Caol Ila, Clynelish, GlenScotia, Isle of Jura, Lagavulin, Laphroig, Oban, OldPulteney, Talisker
Cluster4: Ardmore, Belvenie, Benromach, Bowmore, Bruichladdich, Craigallechie, Dalwhinnie, Deanston, GlenGarioch, GlenOrd, Glenlivet, Glenturret, Highland Park, Inchgower, Knochando, OldFettercairn, Scapa, Springbank, Tomatin, Tomintoul
Cluster2: ArranIsleOf, GlenDeveronMacduff, GlenSpey, Miltonduff, Speyburn, Tomore
...
24/05/26 10:55:51 INFO SparkContext: Successfully stopped SparkContext
*/

0 comments on commit d3bd299

Please sign in to comment.