from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType
spark = SparkSession.builder.appName('test').getOrCreate()
1. Create a DataFrame from a list
data = [(1, "Alice"), (2, "Bob"), (3, "Charlie")]
columns = ["id", "name"]
df = spark.createDataFrame(data, schema=columns)
df.show()
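As a variant not in the original notes, the schema can also be passed as a DDL-style string, which fixes the column types up front; this sketch reuses the same data list:

# Sketch: same data, but "id" is explicitly typed via a DDL schema string
df_typed = spark.createDataFrame(data, schema="id INT, name STRING")
df_typed.printSchema()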
2. Create from a list of dictionaries
data1 = [{'name': 'Alice', 'age': 25}, {'name': 'Bob', 'age': 30}]
df1 = spark.createDataFrame(data1)
df1.show()
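Older PySpark versions emit a deprecation warning when inferring the schema from plain dicts; building Row objects is an equivalent approach (a sketch, not from the original notes):

from pyspark.sql import Row

# Sketch: the same two records expressed as Row objects
rows = [Row(name='Alice', age=25), Row(name='Bob', age=30)]
df1_rows = spark.createDataFrame(rows)
df1_rows.show()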
3. Read from a file
df2 = spark.read.csv("911.csv", header=True, inferSchema=True)
df2.show(5)
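The same read can be written with the generic DataFrameReader interface; a minimal sketch against the same 911.csv file, using standard CSV reader options:

# Sketch: equivalent CSV read via format()/option()/load()
df2_alt = (spark.read.format("csv")
           .option("header", "true")
           .option("inferSchema", "true")
           .load("911.csv"))
df2_alt.printSchema()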
4. Create with an explicitly defined schema
schema = StructType([
    StructField("id", IntegerType(), nullable=False),
    StructField("name", StringType(), nullable=False),
    StructField("age", IntegerType(), nullable=False),
])
data3 = [(1, "Alice", 28), (2, "Bob", 33), (3, "Charlie", 26)]
df3 = spark.createDataFrame(data3, schema=schema)
df3.show()
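To confirm that the explicit schema (types and nullability) was applied, printSchema() can be called on the result:

# Shows each column's declared type and nullable = false
df3.printSchema()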
5. Create from a pandas DataFrame
import pandas as pd

pandas_df = pd.DataFrame(data={'name': ['alice', 'bob'], 'age': [23, 24]})
pandas_df.head()
df4 = spark.createDataFrame(pandas_df)
df4.show()
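Going the other way is symmetric: toPandas() collects a Spark DataFrame back into a pandas DataFrame (only sensible for data small enough to fit in driver memory); a brief sketch:

# Sketch: convert back to pandas; this collects all rows to the driver
pandas_back = df4.toPandas()
print(pandas_back)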
6. Read JSON (each line is a JSON object)
df5 = spark.read.json('info.json')
df5.show()
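By default spark.read.json expects JSON Lines, i.e. one object per line; if the file instead holds a single JSON document spread across multiple lines, the multiLine option handles it (a sketch, assuming the same info.json file):

# Sketch: use multiLine for a pretty-printed / multi-line JSON document
df5_multi = spark.read.option("multiLine", True).json("info.json")
df5_multi.show()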