import pandas as pd


def calculate_goods_covariance():
    """Return the covariance matrix of the per-period goods sales figures.

    The sales table is built in place; only its numeric columns take part
    in the computation.

    Returns:
        pandas.DataFrame: square matrix indexed and labelled by the goods
        names ("蘋果", "橘子", "石榴").
    """
    # Sales per period; "時期" is a text label column, the rest are numeric.
    goods_sales_data = {
        "時期": ["一期", "二期", "三期", "四期"],
        "蘋果": [15, 16, 3, 2],
        "橘子": [12, 14, 16, 18],
        "石榴": [11, 8, 7, 1],
    }
    goods_dataframe = pd.DataFrame(goods_sales_data)
    # Covariance is only defined for numbers, so drop the text column first.
    numerical_columns = goods_dataframe.select_dtypes(include=["number"])
    return numerical_columns.cov()


# Call the function to compute the covariance matrix
result = calculate_goods_covariance()
# Print the covariance matrix
print(result)
蘋果 橘子 石榴
蘋果 56.666667 -17.333333 24.333333
橘子 -17.333333 6.666667 -10.333333
石榴 24.333333 -10.333333 17.583333
import pandas as pd# 示例數據
data = {
    "x": [1, 2, 3, 4, 5],
    "y": [5, 4, 3, 2, 1],
}
df = pd.DataFrame(data)

# Covariance between the two columns
x_column, y_column = df["x"], df["y"]
result = x_column.cov(y_column)
print("使用 Pandas 計算的協方差:", result)

# Manual derivation of the same value:
# mean of x = (1 + 2 + 3 + 4 + 5) / 5 = 3
# mean of y = (5 + 4 + 3 + 2 + 1) / 5 = 3
# w = (1-3)*(5-3) + (2-3)*(4-3) + (3-3)*(3-3) + (4-3)*(2-3) + (5-3)*(1-3)
# covariance = w / (n - 1), with n = 5
# ((1-3)*(5-3) + (2-3)*(4-3) + (3-3)*(3-3) + (4-3)*(2-3) + (5-3)*(1-3)) / 4
# => -2.5
樣本協方差公式:
$$\mathrm{Cov}(X,Y) = \frac{1}{n-1} \sum_{i=1}^{n} (x_i - \bar{x})(y_i - \bar{y})$$
總體協方差公式:
$$\mathrm{Cov}(X,Y) = \frac{1}{n} \sum_{i=1}^{n} (x_i - \mu_x)(y_i - \mu_y)$$
樣本協方差公式
樣本協方差是衡量兩個變量之間相關性的統計量,其公式為:
$$s_{XY} = \frac{1}{n-1} \sum_{i=1}^{n} (X_i - \bar{X})(Y_i - \bar{Y})$$
其中:
- $X_i$ 和 $Y_i$ 是樣本數據點。
- $\bar{X}$ 和 $\bar{Y}$ 是樣本 $X$ 和 $Y$ 的平均值。
- $n$ 是樣本數據點的數量。
- $s_{XY}$ 是樣本協方差。
樣本協方差的意義:
- 正協方差表示兩個變量傾向于同方向變動。
- 負協方差表示兩個變量傾向于反方向變動。
- 協方差接近零表示兩個變量之間無線性相關關系。
from pandas import DataFrame
import numpy as np
# Per-period sales; "時期" is a text label, the other columns are numeric.
paints = {
    "時期": ["一期", "二期", "三期", "四期"],
    "蘋果": [15, 16, 3, 2],
    "橘子": [12, 14, 16, 18],
    "石榴": [11, 8, 7, 1],
}
goods_in = DataFrame(paints)

# Only numeric columns can be correlated, so filter out the label column.
numerical_columns = goods_in.select_dtypes(include=["number"])

# Pairwise correlation matrix of the goods columns.
goods_sum = numerical_columns.corr()
print(goods_sum)
蘋果 橘子 石榴
蘋果 1.000000 -0.891793 0.770881
橘子 -0.891793 1.000000 -0.954411
石榴 0.770881 -0.954411 1.000000
from pandas import DataFrame
# One row per painting: its name, reserve price and auction mark-up.
paints = {
    "字畫名稱": ["旭日東升", "富水長流", "招財進寶", "鴻運當頭"],
    "字畫底價": [2860, 498, 1068, 598],
    "字畫拍賣加價": [1000, 2000, 500, 1500],
}
goods_in = DataFrame(
    paints,
    index=["第一幅", "第二幅", "第三幅", "第四幅"],
)
# Write the table, including its row labels, to paint.csv.
goods_in.to_csv("paint.csv")
from pandas import DataFrame
import numpy as np
paints = {
    "電話號碼": [
        "138xxxx1111",
        "189xxxx1111",
        "139xxxx1111",
        "130xxxx1111",
        "131xxxx1111",
    ],
}
goods_in = DataFrame(paints)
# The first three characters of each phone number become a new
# "carrier prefix" column.
goods_in["運營商前綴"] = goods_in["電話號碼"].str[:3]
print(goods_in)
電話號碼 運營商前綴
0 138xxxx1111 138
1 189xxxx1111 189
2 139xxxx1111 139
3 130xxxx1111 130
4 131xxxx1111 131
import pandas as pd
from pandas import DataFrame
# Two tables describe the same members: login credentials and profile
# information. Both carry a "會員Id" column whose values correspond one to
# one, so merging on that column yields one combined row per member.
login = {
    "會員Id": [110, 111, 112, 113],
    "會員名稱": ["劉一", "趙二", "薛三", "陸四"],
    "會員密碼": ["admin", "123456", "000000", "888888"],
}
info = {
    "會員Id": [110, 111, 112, 113],
    "會員地址": ["北京朝陽", "北京豐臺", "北京大興", "河北廊坊"],
    "會員會費": [250, 360, 470, 550],
}
login_member = DataFrame(login, index=[1, 2, 3, 4])
member_info = DataFrame(info, index=[1, 2, 3, 4])
# `on` names the shared column used as the join key.
member = login_member.merge(member_info, on="會員Id")
print(member)
會員Id 會員名稱 會員密碼 會員地址 會員會費
0 110 劉一 admin 北京朝陽 250
1 111 趙二 123456 北京豐臺 360
2 112 薛三 000000 北京大興 470
3 113 陸四 888888 河北廊坊 550
import pandas as pd
from pandas import DataFrame
# The key column is named differently in each table, so the merge names
# the left and right key columns separately.
login = {
    "會員Number號碼": [110, 111, 112, 113],
    "會員名稱": ["劉一", "趙二", "薛三", "陸四"],
    "會員密碼": ["admin", "123456", "000000", "888888"],
}
info = {
    "會員Card": [110, 111, 112, 113],
    "會員地址": ["北京朝陽", "北京豐臺", "北京大興", "河北廊坊"],
    "會員會費": [250, 360, 470, 550],
}
login_member = DataFrame(login, index=[1, 2, 3, 4])
member_info = DataFrame(info, index=[1, 2, 3, 4])
member = login_member.merge(
    member_info,
    left_on="會員Number號碼",
    right_on="會員Card",
)
print(member)
會員Number號碼 會員名稱 會員密碼 會員Card 會員地址 會員會費
0 110 劉一 admin 110 北京朝陽 250
1 111 趙二 123456 111 北京豐臺 360
2 112 薛三 000000 112 北京大興 470
3 113 陸四 888888 113 河北廊坊 550
import pandas as pd
from pandas import DataFrame
# The last login key (114) has no counterpart among the card numbers (113),
# so the default inner merge keeps only the three keys present in both tables.
login = {
    "會員Number號碼": [110, 111, 112, 114],
    "會員名稱": ["劉一", "趙二", "薛三", "陸四"],
    "會員密碼": ["admin", "123456", "000000", "888888"],
}
info = {
    "會員Card": [110, 111, 112, 113],
    "會員地址": ["北京朝陽", "北京豐臺", "北京大興", "河北廊坊"],
    "會員會費": [250, 360, 470, 550],
}
login_member = DataFrame(login, index=[1, 2, 3, 4])
member_info = DataFrame(info, index=[1, 2, 3, 4])
# The right key is written as one literal; the original spelled it as two
# adjacent literals ("會員" "Card"), which only worked through implicit
# string concatenation.
member = pd.merge(
    login_member,
    member_info,
    left_on="會員Number號碼",
    right_on="會員Card",
)
print(member)
會員Number號碼 會員名稱 會員密碼 會員Card 會員地址 會員會費
0 110 劉一 admin 110 北京朝陽 250
1 111 趙二 123456 111 北京豐臺 360
2 112 薛三 000000 112 北京大興 470
$$\mathrm{Cov}(X, Y) = \frac{1}{n-1} \sum_{i=1}^{n} (X_i - \bar{X})(Y_i - \bar{Y})$$
import pandas as pd
from pandas import DataFrame
# Same mismatched keys as an inner merge would drop, but how="outer" keeps
# the unmatched rows from both sides and fills the missing cells with NaN.
login = {
    "會員Number號碼": [110, 111, 112, 114],
    "會員名稱": ["劉一", "趙二", "薛三", "陸四"],
    "會員密碼": ["admin", "123456", "000000", "888888"],
}
info = {
    "會員Card": [110, 111, 112, 113],
    "會員地址": ["北京朝陽", "北京豐臺", "北京大興", "河北廊坊"],
    "會員會費": [250, 360, 470, 550],
}
login_member = DataFrame(login, index=[1, 2, 3, 4])
member_info = DataFrame(info, index=[1, 2, 3, 4])
member = login_member.merge(
    member_info,
    how="outer",
    left_on="會員Number號碼",
    right_on="會員Card",
)
print(member)
會員Number號碼 會員名稱 會員密碼 會員Card 會員地址 會員會費
0 110.0 劉一 admin 110.0 北京朝陽 250.0
1 111.0 趙二 123456 111.0 北京豐臺 360.0
2 112.0 薛三 000000 112.0 北京大興 470.0
3 NaN NaN NaN 113.0 河北廊坊 550.0
4 114.0 陸四 888888 NaN NaN NaN
import pandas as pd
from pandas import DataFrame
# Neither table has a key column; rows are matched on their index labels.
login = {
    "會員名稱": ["劉一", "趙二", "薛三", "陸四"],
    "會員密碼": ["admin", "123456", "000000", "888888"],
}
info = {
    "會員地址": ["北京朝陽", "北京豐臺", "北京大興", "河北廊坊"],
    "會員會費": [250, 360, 470, 550],
}
login_member = DataFrame(login, index=[1, 2, 3, 4])
member_info = DataFrame(info, index=[1, 2, 3, 4])
member = login_member.merge(member_info, left_index=True, right_index=True)
print(member)
會員名稱 會員密碼 會員地址 會員會費
1 劉一 admin 北京朝陽 250
2 趙二 123456 北京豐臺 360
3 薛三 000000 北京大興 470
4 陸四 888888 河北廊坊 550
import pandas as pd
from pandas import DataFrame
# join() aligns on the index and keeps every row of the calling frame.
# Label 5 exists only on the left, so its right-hand columns come back NaN.
login = {
    "會員名稱": ["劉一", "趙二", "薛三", "陸四"],
    "會員密碼": ["admin", "123456", "000000", "888888"],
}
info = {
    "會員地址": ["北京朝陽", "北京豐臺", "北京大興", "河北廊坊"],
    "會員會費": [250, 360, 470, 550],
}
login_member = DataFrame(login, index=[1, 2, 3, 5])
member_info = DataFrame(info, index=[1, 2, 3, 4])
member = login_member.join(member_info, how="left")
print(member)
會員名稱 會員密碼 會員地址 會員會費
1 劉一 admin 北京朝陽 250.0
2 趙二 123456 北京豐臺 360.0
3 薛三 000000 北京大興 470.0
5 陸四 888888 NaN NaN
import pandas as pd
from pandas import Series
# Three Series with disjoint labels are stacked end to end into one Series.
member1 = Series([1, 350], index=["會員級別", "會員最低消費"])
member2 = Series(
    [2, 100, 10],
    index=["會員購買產品次數", "會員卡最低存額", "會員活動次數"],
)
member3 = Series([2], index=["會員推薦人數"])
member = pd.concat((member1, member2, member3))
print(member)
會員級別 1
會員最低消費 350
會員購買產品次數 2
會員卡最低存額 100
會員活動次數 10
會員推薦人數 2
dtype: int64
import pandas as pd
from pandas import Series
# With axis=1 each Series becomes a column, so the result is a DataFrame.
# The row index is the ordered union of all labels (an outer join); since
# the three Series share no labels, every row has a value in exactly one
# column and NaN in the others.
member1 = Series([1, 350], index=["會員級別", "會員最低消費"])
member2 = Series(
    [2, 100, 10],
    index=["會員購買產品次數", "會員卡最低存額", "會員活動次數"],
)
member3 = Series([2], index=["會員推薦人數"])
member = pd.concat((member1, member2, member3), axis=1)
print(member)
0 1 2
會員級別 1.0 NaN NaN
會員最低消費 350.0 NaN NaN
會員購買產品次數 NaN 2.0 NaN
會員卡最低存額 NaN 100.0 NaN
會員活動次數 NaN 10.0 NaN
會員推薦人數 NaN NaN 2.0
import pandas as pd
from pandas import Series
# join="inner" keeps only the labels present in every Series; here that
# is just "會員級別".
member1 = Series([1, 350], index=["會員級別", "會員最低消費"])
member2 = Series(
    [1, 100, 10, 2],
    index=["會員級別", "會員卡最低存額", "會員活動次數", "會員推薦人數"],
)
member3 = Series(
    [1, 350, 2],
    index=["會員級別", "會員最低消費", "會員推薦人數"],
)
member = pd.concat((member1, member2, member3), join="inner", axis=1)
print(member)
0 1 2
會員級別 1 1 1
# groupby() splits the DataFrame made of "會員級別" (member level) and
# "會員消費情況" (member spending) into groups keyed by "會員級別".
# sum() then totals the spending inside each group, giving the overall
# spending per member level.
import pandas as pd
from pandas import DataFrame

member = DataFrame(
    {
        "會員級別": [1, 2, 5, 3, 1, 1, 2, 5, 2, 3, 1, 1, 2, 3, 5, 4],
        "會員消費情況": [
            100, 500, 2500, 1427, 90, 90, 490, 2498,
            486, 1315, 89, 97, 490, 1489, 2389, 1900,
        ],
    }
)
member_group = member.groupby("會員級別").sum()
print(member_group)
會員消費情況
會員級別
1 466
2 1966
3 4231
4 1900
5 7387
import pandas as pd
from pandas import DataFrame
# Group by member level and by number of activities joined, then total
# the spending inside each (level, activities) pair.
member = DataFrame(
    {
        "會員級別": [1, 2, 5, 3, 1, 1, 2, 5, 2, 3, 1, 1, 2, 3, 5, 4],
        "會員消費情況": [
            100, 500, 2500, 1427, 90, 90, 490, 2498,
            486, 1315, 89, 97, 490, 1489, 2389, 1900,
        ],
        "會員參與活動數目": [1, 3, 10, 5, 3, 3, 6, 8, 4, 2, 3, 3, 6, 5, 4, 1],
    }
)
grouped_by_level_and_activity = member.groupby(["會員級別", "會員參與活動數目"])
member_group = grouped_by_level_and_activity.sum()
print(member_group)
               會員消費情況
會員級別 會員參與活動數目
1    1            100
     3            366
2    3            500
     4            486
     6            980
3    2           1315
     5           2916
4    1           1900
5    4           2389
     8           2498
     10          2500
import pandas as pd
import numpy as np
from pandas import DataFrame
# Per member level, compute the sum, mean and standard deviation of every
# remaining column.
member = DataFrame(
    {
        "會員級別": [1, 2, 5, 3, 1, 1, 2, 5, 2, 3, 1, 1, 2, 3, 5, 4],
        "會員消費情況": [
            100, 500, 2500, 1427, 90, 90, 490, 2498,
            486, 1315, 89, 97, 490, 1489, 2389, 1900,
        ],
        "會員參與活動數目": [1, 3, 10, 5, 3, 3, 6, 8, 4, 2, 3, 3, 6, 5, 4, 1],
    }
)
# Aggregations are named by string rather than passed as np.sum / np.mean /
# np.std: handing NumPy callables to agg() is deprecated in recent pandas,
# and "std" pins the sample standard deviation (ddof=1) shown in the
# expected output, whereas np.std on its own defaults to ddof=0.
member_group = member.groupby("會員級別").agg(["sum", "mean", "std"])
print(member_group)
     會員消費情況                        會員參與活動數目
      sum         mean        std      sum      mean       std
會員級別
1 466 93.200000 4.969909 13 2.600000 0.894427
2 1966 491.500000 5.972158 19 4.750000 1.500000
3 4231 1410.333333 88.189191 12 4.000000 1.732051
4 1900 1900.000000 NaN 1 1.000000 NaN
5 7387 2462.333333 63.516402 22 7.333333 3.055050
# 代碼中pivot_table后面的參數,第一個參數是需要進行透視表操
# 作的DataFrame數據,第二個參數是建立透視表時以“會員級別”維度
# 作為索引,第三個參數是統計的時候的運算方法,如是求和還是求平
# 均數等。這里是求和
import pandas as pd
import numpy as np
from pandas import DataFrame
# Pivot table indexed by member level, summing every other column.
member = DataFrame(
    {
        "會員級別": [1, 2, 5, 3, 1, 1, 2, 5, 2, 3, 1, 1, 2, 3, 5, 4],
        "會員消費情況": [
            100, 500, 2500, 1427, 90, 90, 490, 2498,
            486, 1315, 89, 97, 490, 1489, 2389, 1900,
        ],
        "會員參與活動數目": [1, 3, 10, 5, 3, 3, 6, 8, 4, 2, 3, 3, 6, 5, 4, 1],
    }
)
# The aggregation is named by string instead of np.sum: passing NumPy
# callables as aggfunc is deprecated in recent pandas. It stays wrapped in
# a list so the result keeps the outer "sum" column level.
member_table = pd.pivot_table(member, index=["會員級別"], aggfunc=["sum"])
print(member_table)
          sum
     會員參與活動數目 會員消費情況
會員級別
1 13 466
2 19 1966
3 12 4231
4 1 1900
5 22 7387