tutorial/pyspark-examples/dataframes/dataframe-session-2021-05-12-intro.txt

#--------------------
# DataFrame Tutorial:
#--------------------
   https://dzone.com/articles/pyspark-dataframe-tutorial-introduction-to-datafra


#---------------------
# Demo of DataFrames
#---------------------

$ cat /tmp/cats.csv
name,age,gender,weight
cuttie,2,female,6
mono,3,male,9
pishi,2,female,4
zazo,1,male,4
fuzzy,1,female,4

$ ./bin/pyspark
Python 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018, 02:44:43)
Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 3.1.1
      /_/

Using Python version 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018 02:44:43)
Spark context Web UI available at http://10.0.0.93:4040
Spark context available as 'sc' (master = local[*], app id = local-1620755686906).
SparkSession available as 'spark'.

>>>
>>> input_path = '/tmp/cats.csv'
>>> input_path
'/tmp/cats.csv'
>>> cats = spark.read.csv(input_path, inferSchema = True, header = True)

>>>
>>> cats.show(truncate=False)
+------+---+------+------+
|name  |age|gender|weight|
+------+---+------+------+
|cuttie|2  |female|6     |
|mono  |3  |male  |9     |
|pishi |2  |female|4     |
|zazo  |1  |male  |4     |
|fuzzy |1  |female|4     |
+------+---+------+------+

>>> cats.printSchema()
root
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- gender: string (nullable = true)
 |-- weight: integer (nullable = true)

>>> cats.count()
5
>>> cats.columns
['name', 'age', 'gender', 'weight']
>>> cats.describe('weight').show()
+-------+------------------+
|summary|            weight|
+-------+------------------+
|  count|                 5|
|   mean|               5.4|
| stddev|2.1908902300206643|
|    min|                 4|
|    max|                 9|
+-------+------------------+

>>> name_age = cats.select("name", "age")
>>> name_age.show(truncate=False)
+------+---+
|name  |age|
+------+---+
|cuttie|2  |
|mono  |3  |
|pishi |2  |
|zazo  |1  |
|fuzzy |1  |
+------+---+

>>> name_age.printSchema()
root
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)

>>> cats.select('age').distinct().show()
+---+
|age|
+---+
|  1|
|  3|
|  2|
+---+

>>> cats.select('name', 'age').distinct().show()
+------+---+
|  name|age|
+------+---+
|  zazo|  1|
|cuttie|  2|
| fuzzy|  1|
|  mono|  3|
| pishi|  2|
+------+---+

>>> cats.filter(cats.age > 1).show()
+------+---+------+------+
|  name|age|gender|weight|
+------+---+------+------+
|cuttie|  2|female|     6|
|  mono|  3|  male|     9|
| pishi|  2|female|     4|
+------+---+------+------+


>>> cats.orderBy(cats.age).show()
+------+---+------+------+
|  name|age|gender|weight|
+------+---+------+------+
|  zazo|  1|  male|     4|
| fuzzy|  1|female|     4|
|cuttie|  2|female|     6|
| pishi|  2|female|     4|
|  mono|  3|  male|     9|
+------+---+------+------+

>>> age_df = cats.groupby("age").count()
>>> age_df.show()
+---+-----+
|age|count|
+---+-----+
|  1|    2|
|  3|    1|
|  2|    2|
+---+-----+

>>> cats.show()
+------+---+------+------+
|  name|age|gender|weight|
+------+---+------+------+
|cuttie|  2|female|     6|
|  mono|  3|  male|     9|
| pishi|  2|female|     4|
|  zazo|  1|  male|     4|
| fuzzy|  1|female|     4|
+------+---+------+------+

>>> cats.createOrReplaceTempView('cats_table')
>>> spark.sql("select * from cats_table").show()
+------+---+------+------+
|  name|age|gender|weight|
+------+---+------+------+
|cuttie|  2|female|     6|
|  mono|  3|  male|     9|
| pishi|  2|female|     4|
|  zazo|  1|  male|     4|
| fuzzy|  1|female|     4|
+------+---+------+------+

>>> spark.sql("select * from cats_table where age > 1").show()
+------+---+------+------+
|  name|age|gender|weight|
+------+---+------+------+
|cuttie|  2|female|     6|
|  mono|  3|  male|     9|
| pishi|  2|female|     4|
+------+---+------+------+

>>> spark.sql("select  age, count(*) from cats_table group by age").show()
+---+--------+
|age|count(1)|
+---+--------+
|  1|       2|
|  3|       1|
|  2|       2|
+---+--------+

>>> def exec_sql(query):
...   spark.sql(query).show()
...
>>>
>>> exec_sql("select  age, count(*) from cats_table group by age")
+---+--------+
|age|count(1)|
+---+--------+
|  1|       2|
|  3|       1|
|  2|       2|
+---+--------+