Transform Data with Spark

# Remove any existing local dp-203 directory so the clone below starts clean.
rm -r dp-203 -f

# Clone the lab repository into a directory named dp-203.
git clone https://github.com/MicrosoftLearning/dp-203-azure-data-engineer dp-203

# Move into the folder holding the files for lab 06.
cd dp-203/Allfiles/labs/06

# Run the lab's setup script (presumably provisions the lab resources; its contents are not shown here).
./setup.ps1

https://github.com/MicrosoftLearning/dp-203-azure-data-engineer/tree/master/Allfiles/labs/06/notebooks

# Read every CSV file under /data into a single DataFrame, using the first
# row of each file as column names and letting Spark infer the column types.
order_details = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv('/data/*.csv')
)

# Preview the first five rows.
display(order_details.limit(5))

 

from pyspark.sql.functions import split, col

# Split CustomerName on single spaces once and reuse the resulting expression.
name_parts = split(col("CustomerName"), " ")

# Take the first token as FirstName and the second as LastName, then drop
# the original CustomerName column now that it has been split.
transformed_df = (
    order_details
    .withColumn("FirstName", name_parts.getItem(0))
    .withColumn("LastName", name_parts.getItem(1))
    .drop("CustomerName")
)

# Preview the first five transformed rows.
display(transformed_df.limit(5))

# Save the transformed orders as Parquet, replacing any previous output at
# the same location.
orders_writer = transformed_df.write.mode("overwrite")
orders_writer.parquet('/transformed_data/orders.parquet')
print("Transformed data saved!")

from pyspark.sql.functions import year, month, col

# Derive Year and Month columns from OrderDate.
order_date = col("OrderDate")
dated_df = (
    transformed_df
    .withColumn("Year", year(order_date))
    .withColumn("Month", month(order_date))
)

# Preview the first five rows with the new columns.
display(dated_df.limit(5))

# Write the data as Parquet partitioned by Year and Month, replacing any
# previous output under /partitioned_data.
(
    dated_df.write
    .partitionBy("Year", "Month")
    .mode("overwrite")
    .parquet("/partitioned_data")
)
print("Transformed data saved!")

# Read only the Year=2020 partition folders (all months) of the partitioned
# output.
# NOTE(review): because the path points inside the partition folders, Spark's
# partition discovery is expected to omit the Year and Month columns from the
# result - confirm against the displayed output.
partition_glob = '/partitioned_data/Year=2020/Month=*'
orders_2020 = spark.read.parquet(partition_glob)

# Preview the first five rows.
display(orders_2020.limit(5))

# Register the raw order data as the table 'sales_orders', stored as Parquet
# at /sales_orders_table and replacing any existing table contents.
(
    order_details.write
    .format('parquet')
    .mode('overwrite')
    .option('path', '/sales_orders_table')
    .saveAsTable('sales_orders')
)

# Use Spark SQL to add Year and Month columns derived from OrderDate to every
# row of the sales_orders table.
derive_year_month_query = "SELECT *, YEAR(OrderDate) AS Year, MONTH(OrderDate) AS Month FROM sales_orders"
sql_transform = spark.sql(derive_year_month_query)

# Preview the first five rows of the query result.
display(sql_transform.limit(5))

# Save the result as the table 'transformed_orders', partitioned by Year and
# Month, stored as Parquet at /transformed_orders_table and replacing any
# existing table contents.
(
    sql_transform.write
    .partitionBy("Year", "Month")
    .format('parquet')
    .mode('overwrite')
    .option('path', '/transformed_orders_table')
    .saveAsTable('transformed_orders')
)

%%sql

-- Return the transformed orders for January 2021.
SELECT *
FROM transformed_orders
WHERE Month = 1
  AND Year = 2021

%%sql

-- Clean up: drop both tables created in the earlier cells.
DROP TABLE transformed_orders;
DROP TABLE sales_orders;

相关推荐

最近更新

  1. docker php8.1+nginx base 镜像 dockerfile 配置

    2024-07-09 19:32:04       67 阅读
  2. Could not load dynamic library ‘cudart64_100.dll‘

    2024-07-09 19:32:04       71 阅读
  3. 在Django里面运行非项目文件

    2024-07-09 19:32:04       58 阅读
  4. Python语言-面向对象

    2024-07-09 19:32:04       69 阅读

热门阅读

  1. kafka--发布-订阅消息系统

    2024-07-09 19:32:04       27 阅读
  2. 3160. 所有球里面不同颜色的数目

    2024-07-09 19:32:04       27 阅读
  3. go语言hassuffix的简单使用

    2024-07-09 19:32:04       31 阅读
  4. Vim常用整理快捷键

    2024-07-09 19:32:04       24 阅读
  5. Elasticsearch 分析器(Analyzer)的作用和配置

    2024-07-09 19:32:04       20 阅读
  6. html5 video去除边框

    2024-07-09 19:32:04       17 阅读
  7. 机器学习模型运用在机器人上

    2024-07-09 19:32:04       23 阅读
  8. 在网站存在漏洞的情况下强化安全防御

    2024-07-09 19:32:04       23 阅读