-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy path: window function
More file actions
69 lines (45 loc) · 1.49 KB
/
window function
File metadata and controls
69 lines (45 loc) · 1.49 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
# Databricks notebook source
# DBTITLE 1,Window Partitioning in Pyspark
# Distribute the integers 0..3 across the cluster as an RDD.
numbers_rdd = sc.parallelize(range(4))
# COMMAND ----------
# DBTITLE 1,Getting the number of partitions being created
# Report how many partitions Spark chose for this RDD.
numbers_rdd.getNumPartitions()
# COMMAND ----------
# Sample (product, category, revenue) rows to demonstrate window functions.
data = [
    ("Thin", "Cell Phone", 6000),
    ("Normal", "Tablet", 1500),
    ("Mini", "Tablet", 5500),
    ("Ultra thin", "Cell Phone", 5500),
    ("Very thin", "Cell Phone", 6000),
    ("Big", "Tablet", 2500),
    ("Bendable", "Cell Phone", 3000),
    ("Foldable", "Cell Phone", 3000),
    ("Pro", "Tablet", 4500),
    ("Pro2", "Tablet", 6500),
]
df = sqlContext.createDataFrame(data, ["product", "category", "revenue"])
# registerTempTable is deprecated since Spark 2.0; use createOrReplaceTempView
# instead (this notebook already uses it for the 'atble' view below).
df.createOrReplaceTempView("productRevenue")
# COMMAND ----------
# Databricks-only helper: render the DataFrame as a table in the notebook UI.
display(df)
# COMMAND ----------
# DBTITLE 1,creating windows
# Window lets aggregate functions be evaluated over a group of rows
# ("window partition") instead of the whole DataFrame.
from pyspark.sql.window import Window
# COMMAND ----------
# DBTITLE 1,Creating the partition by category
# Window spec grouping rows by product category: any aggregate applied
# with .over(a) is computed separately within each category.
a = Window.partitionBy(df['category'])
# Commented-out experiment with a value-range frame; left for reference.
#a.rowsBetween(2500,6000)
# COMMAND ----------
# DBTITLE 1,creating the table
# Register df under the name 'atble' so it can be queried from SQL.
# NOTE: createOrReplaceTempView returns None, so the original
# `dataFrame = ...` assignment only bound None and has been dropped.
df.createOrReplaceTempView('atble')
# COMMAND ----------
# MAGIC %sql select * from atble
# COMMAND ----------
# Read the registered view back as a DataFrame and preview its rows.
dataframe = sqlContext.table('atble')
dataframe.show()
# COMMAND ----------
# DBTITLE 1,retrieving data according to the partition data
import pyspark.sql.functions as func
# Maximum revenue within each product category, evaluated per window
# partition defined by `a`. (Renamed from `diff`: the subtraction that
# would have made it a difference — `- df['revenue']` — was commented
# out, so the expression is a per-category max, not a diff.)
max_revenue = func.max(dataframe['revenue']).over(a)
# COMMAND ----------
# DBTITLE 1,selecting only the desired data
# One output row per input row: the category-wide max revenue.
# Alias typo fixed ("partitiion" -> "partition").
dataframe.select(max_revenue.alias("partition data")).show()