E-commerce Data Modeling
1. Analysis Background and Objectives
1.1 Background
E-commerce platform analysis is one of the most typical data-analysis domains, and it has relatively mature analysis models, such as the people-goods-place (人貨場) model. In this article I analyze user behavior on Taobao, China's largest e-commerce platform, to consolidate my data-analysis skills and thinking. The goal of analyzing user behavior is to enable precision marketing, identify existing problems, and drive business growth.
1.2 Data Description
The dataset contains all of one day's (2023-05-23) user shopping-behavior data, organized along the three people-goods-place dimensions. The user, product, and region dimensions together make up a fact table of user order behavior.
1.3 Data Analysis Workflow
Raise business questions → confirm the granularity → process and clean the data → build the model → visualize the data → use the visualizations to analyze and solve the problems.
Business questions:
1. How can brand sales be strengthened?
2. How can the consumption potential of regional markets be stimulated?
3. How can user spending be stimulated?
4. How can product costs be reduced?
Confirming the granularity (a sketch of how these tables join into the fact table follows the list):
User info table, order table, order detail table
Product info table, brand info table, level-1/2/3 category info tables
Province info table, region info table
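As a minimal sketch of this people-goods-place star schema, assuming hypothetical file and column names (orders.csv, order_items.csv, users.csv, products.csv, regions.csv and their id columns are illustrative, not taken from the original data), the fact table at order-line granularity could be assembled roughly as follows:

import pandas as pd

# Hypothetical file and column names, used only for illustration.
orders      = pd.read_csv('orders.csv')       # order table: order_id, user_id, region_id, order_date
order_items = pd.read_csv('order_items.csv')  # order detail table: order_id, product_id, quantity, amount
users       = pd.read_csv('users.csv')        # user info table: user_id, age, gender, ...
products    = pd.read_csv('products.csv')     # product info table: product_id, brand_id, category_id, ...
regions     = pd.read_csv('regions.csv')      # region info table: region_id, province_id, ...

# Granularity: one row per order line, joined to the user, product and region dimensions.
fact = (order_items
        .merge(orders,   on='order_id',   how='left')
        .merge(users,    on='user_id',    how='left')
        .merge(products, on='product_id', how='left')
        .merge(regions,  on='region_id',  how='left'))

print(fact.head())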
The core algorithm code is shared below:
import sys
import numpy as np
from torch.utils.data import DataLoader
from torch import nn
import torch.nn.functional as F
import torch
from sklearn.metrics import precision_score, recall_score, accuracy_score
import dataloader


class ALS_MLP(nn.Module):
    def __init__(self, n_users, n_items, dim):
        '''
        :param n_users: number of users
        :param n_items: number of items
        :param dim: embedding dimension
        '''
        super(ALS_MLP, self).__init__()
        # Randomly initialise the user embeddings
        self.users = nn.Embedding(n_users, dim, max_norm=1)
        # Randomly initialise the item embeddings
        self.items = nn.Embedding(n_items, dim, max_norm=1)
        # Hidden layers for the user vector
        self.u_hidden_layer1 = self.dense_layer(dim, dim // 2)
        self.u_hidden_layer2 = self.dense_layer(dim // 2, dim // 4)
        # Hidden layers for the item vector
        self.i_hidden_layer1 = self.dense_layer(dim, dim // 2)
        self.i_hidden_layer2 = self.dense_layer(dim // 2, dim // 4)
        self.sigmoid = nn.Sigmoid()

    def dense_layer(self, in_features, out_features):
        # Each MLP unit is a linear layer followed by an activation;
        # the activation used here is the Tanh function.
        return nn.Sequential(
            nn.Linear(in_features, out_features),
            nn.Tanh()
        )

    def forward(self, u, v, isTrain=True):
        '''
        :param u: user index ids, shape [batch_size]
        :param v: item index ids, shape [batch_size]
        :return: sigmoid of the user-item vector dot product, shape [batch_size]
        '''
        u = self.users(u)
        v = self.items(v)
        u = self.u_hidden_layer1(u)
        u = self.u_hidden_layer2(u)
        v = self.i_hidden_layer1(v)
        v = self.i_hidden_layer2(v)
        # Apply dropout during training to reduce overfitting
        if isTrain:
            u = F.dropout(u)
            v = F.dropout(v)
        uv = torch.sum(u * v, axis=1)
        logit = self.sigmoid(uv * 3)
        return logit


def doEva(net, d):
    d = torch.LongTensor(d)
    u, i, r = d[:, 0], d[:, 1], d[:, 2]
    with torch.no_grad():
        out = net(u, i, False)
    y_pred = np.array([1 if score >= 0.5 else 0 for score in out])
    y_true = r.detach().numpy()
    p = precision_score(y_true, y_pred)
    r = recall_score(y_true, y_pred)
    acc = accuracy_score(y_true, y_pred)
    return p, r, acc


def train(epochs=10, batchSize=1024, lr=0.001, dim=256, eva_per_epochs=1):
    '''
    :param epochs: number of training epochs
    :param batchSize: mini-batch size
    :param lr: learning rate
    :param dim: dimension of the user/item vectors
    :param eva_per_epochs: evaluate every this many epochs
    '''
    # Load the data
    user_set, item_set, train_set, test_set = \
        dataloader.readRecData(test_ratio=0.1)
    # Initialise the ALS_MLP model
    net = ALS_MLP(len(user_set), len(item_set), dim)
    # Define the optimiser
    optimizer = torch.optim.AdamW(net.parameters(), lr=lr, weight_decay=0.2)
    # Define the loss function
    criterion = torch.nn.BCELoss()
    # Training loop
    for e in range(epochs):
        all_lose = 0
        # Read the data one batch at a time
        for u, i, r in DataLoader(train_set, batch_size=batchSize, shuffle=True):
            optimizer.zero_grad()
            r = torch.FloatTensor(r.detach().numpy())
            result = net(u, i)
            loss = criterion(result, r)
            all_lose += loss.item()
            loss.backward()
            optimizer.step()
        print('epoch {}, avg_loss = {:.4f}'.format(e, all_lose / (len(train_set) // batchSize)))
        # Evaluate the model
        if e % eva_per_epochs == 0:
            p, r, acc = doEva(net, train_set)
            print('train: Precision {:.4f} | Recall {:.4f} | accuracy {:.4f}'.format(p, r, acc))
            p, r, acc = doEva(net, test_set)
            print('test: Precision {:.4f} | Recall {:.4f} | accuracy {:.4f}'.format(p, r, acc))


def als_mlp_predict(userId=1, itemSize=100, count=4, dim=64):
    # Load the data (only the user/item vocabularies are needed here)
    user_set, item_set, train_set, test_set = \
        dataloader.readRecData(test_ratio=0.1)
    # Build (user, item, 0) triples so that every candidate item is scored for this user
    candidates = []
    for i in range(1, itemSize):
        candidates.append((userId, i, 0))
    # Initialise the ALS_MLP model (in practice, trained weights should be loaded here)
    net = ALS_MLP(len(user_set), len(item_set), dim)
    d = torch.LongTensor(candidates)
    u, i, r = d[:, 0], d[:, 1], d[:, 2]
    with torch.no_grad():
        out = net(u, i, False)  # disable dropout at inference time
    predict = []
    preds = out.tolist()
    # Repeatedly take the current maximum to collect the top `count` items
    for _ in range(count):
        m = max(preds)
        idx = preds.index(m)
        # Map the position back to the candidate's item id; mask the score instead of
        # deleting it so later positions stay aligned with the candidate list
        predict.append(dict(iid=candidates[idx][1], score=m))
        preds[idx] = float('-inf')
    return predict


def test(dim=64):
    result = als_mlp_predict(1, 2000, 5)
    print(result)


if __name__ == '__main__':
    # train()
    # test()
    param1 = sys.argv[1]
    # param1 = "1"
    result = als_mlp_predict(int(param1), 55, 4)
    recs = []  # avoid shadowing the built-in name `list`
    for r in result:
        recs.append(dict(iid=r['iid'], rate=r['score']))
    print(recs)
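The script imports a local dataloader module that is not included above. As a rough sketch of the interface the training code assumes (the file name ratings.csv, the column order, and the positive/negative labelling threshold are assumptions, not the original implementation), readRecData would need to return the user id list, the item id list, and train/test splits of (user_id, item_id, label) triples:

# dataloader.py: a minimal sketch of the interface assumed by the code above.
# File name, column layout and labelling rule are illustrative assumptions.
# Assumes the ids in the file are already contiguous integers that fit the embedding tables.
import random

def readRecData(path='ratings.csv', test_ratio=0.1):
    user_set, item_set, triples = set(), set(), []
    with open(path) as f:
        for line in f:
            u, i, r = line.strip().split(',')[:3]
            u, i = int(u), int(i)
            user_set.add(u)
            item_set.add(i)
            # Binarise the feedback: 1 = positive interaction, 0 = negative
            triples.append((u, i, 1 if float(r) >= 4 else 0))
    random.shuffle(triples)
    split = int(len(triples) * (1 - test_ratio))
    return list(user_set), list(item_set), triples[:split], triples[split:]

With such a module in place, training is started by uncommenting train() in the main block, and recommendations for a single user can then be produced from the command line, for example: python main.py 1 (the script file name here is only an example), which prints the top-scoring item ids and scores for user 1.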