dataframe-session-2020-11-04.txt
$ cat /tmp/emps_no_header.txt
1001,alex,67000,SALES
1002,bob,24000,SALES
1003,boby,24000,SALES
1004,jane,69000,SOFTWARE
1005,betty,55000,SOFTWARE
1006,jeff,59000,SOFTWARE
1007,dara,72000,SOFTWARE
1001,al,69000,SALES
1002,bobby,24900,BUSINESS
$ ./bin/pyspark
Python 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018, 02:44:43)
[Clang 6.0 (clang-600.0.57)] on darwin
Type "help", "copyright", "credits" or "license" for more information.
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 3.0.0
      /_/
Using Python version 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018 02:44:43)
SparkSession available as 'spark'.
>>> input_path = '/tmp/emps_no_header.txt'
>>> df = spark.read.csv(input_path)
>>> df.show()
+----+-----+-----+--------+
| _c0| _c1| _c2| _c3|
+----+-----+-----+--------+
|1001| alex|67000| SALES|
|1002| bob|24000| SALES|
|1003| boby|24000| SALES|
|1004| jane|69000|SOFTWARE|
|1005|betty|55000|SOFTWARE|
|1006| jeff|59000|SOFTWARE|
|1007| dara|72000|SOFTWARE|
|1001| al|69000| SALES|
|1002|bobby|24900|BUSINESS|
+----+-----+-----+--------+
>>> df.count()
9
>>> df.printSchema()
root
|-- _c0: string (nullable = true)
|-- _c1: string (nullable = true)
|-- _c2: string (nullable = true)
|-- _c3: string (nullable = true)
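Every column comes back as a string because the CSV was read without a header or an explicit schema. A minimal sketch (not part of the captured session) of reading the same file with typed columns, assuming the same input_path:

from pyspark.sql.types import StructType, StructField, IntegerType, StringType

# hypothetical explicit schema; the names match the renames applied below
emp_schema = StructType([
    StructField("id", IntegerType(), True),
    StructField("name", StringType(), True),
    StructField("salary", IntegerType(), True),
    StructField("dept", StringType(), True),
])
typed_df = spark.read.csv(input_path, schema=emp_schema)
# or let Spark guess the types: spark.read.csv(input_path, inferSchema=True)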
>>> df2 = df.selectExpr("_c0 as id", "_c1 as name", "_c2 as salary", "_c3 as dept")
>>> df2.show()
+----+-----+------+--------+
| id| name|salary| dept|
+----+-----+------+--------+
|1001| alex| 67000| SALES|
|1002| bob| 24000| SALES|
|1003| boby| 24000| SALES|
|1004| jane| 69000|SOFTWARE|
|1005|betty| 55000|SOFTWARE|
|1006| jeff| 59000|SOFTWARE|
|1007| dara| 72000|SOFTWARE|
|1001| al| 69000| SALES|
|1002|bobby| 24900|BUSINESS|
+----+-----+------+--------+
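Side note (not from the original session): the same positional renaming can also be done with toDF, which takes the new column names in order:

df2 = df.toDF("id", "name", "salary", "dept")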
>>> df2.createOrReplaceTempView("emp_table")
>>> df3 = spark.sql("SELECT * FROM emp_table WHERE id > 1002")
>>> df3.show()
+----+-----+------+--------+
| id| name|salary| dept|
+----+-----+------+--------+
|1003| boby| 24000| SALES|
|1004| jane| 69000|SOFTWARE|
|1005|betty| 55000|SOFTWARE|
|1006| jeff| 59000|SOFTWARE|
|1007| dara| 72000|SOFTWARE|
+----+-----+------+--------+
>>> df3.printSchema()
root
|-- id: string (nullable = true)
|-- name: string (nullable = true)
|-- salary: string (nullable = true)
|-- dept: string (nullable = true)
>>> df4 = df2.filter(df2.id > 1002)
>>> df4.show()
+----+-----+------+--------+
| id| name|salary| dept|
+----+-----+------+--------+
|1003| boby| 24000| SALES|
|1004| jane| 69000|SOFTWARE|
|1005|betty| 55000|SOFTWARE|
|1006| jeff| 59000|SOFTWARE|
|1007| dara| 72000|SOFTWARE|
+----+-----+------+--------+
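df4 reproduces df3 through the DataFrame API instead of SQL. As a sketch (not run in this session), the same filter can also be written with the col function, which avoids attribute-style column access:

from pyspark.sql.functions import col
df4 = df2.filter(col("id") > 1002)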
>>> df5 = spark.sql("SELECT id, salary FROM emp_table WHERE id > 1002")
>>> df5.show()
+----+------+
| id|salary|
+----+------+
|1003| 24000|
|1004| 69000|
|1005| 55000|
|1006| 59000|
|1007| 72000|
+----+------+
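A DataFrame-API equivalent of the df5 query (a sketch, not part of the captured session):

df5 = df2.filter(df2.id > 1002).select("id", "salary")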
>>>
>>> df6 = spark.sql("SELECT name, salary FROM emp_table WHERE salary > 55000 ORDER BY salary")
>>> df6.show()
+----+------+
|name|salary|
+----+------+
|jeff| 59000|
|alex| 67000|
|jane| 69000|
| al| 69000|
|dara| 72000|
+----+------+
>>> df6 = spark.sql("SELECT name, salary FROM emp_table WHERE salary > 55000 ORDER BY salary DESC")
>>> df6.show()
+----+------+
|name|salary|
+----+------+
|dara| 72000|
| al| 69000|
|jane| 69000|
|alex| 67000|
|jeff| 59000|
+----+------+
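The descending sort can also be expressed with the DataFrame API (a sketch, not part of the captured session). Note that salary is still a string column here, so ORDER BY sorts it lexicographically; the result only looks numeric because every salary has the same number of digits:

from pyspark.sql.functions import desc
df6 = df2.filter(df2.salary > 55000).orderBy(desc("salary")).select("name", "salary")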
>>> df7 = spark.sql("SELECT dept, COUNT(*) as count FROM emp_table GROUP BY dept")
>>> df7.show()
+--------+-----+
| dept|count|
+--------+-----+
| SALES| 4|
|BUSINESS| 1|
|SOFTWARE| 4|
+--------+-----+
>>>
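The same aggregation through the DataFrame API (a sketch, not part of the captured session):

df7 = df2.groupBy("dept").count()
df7.show()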