day10_會員主題域開發
會員主題_DWS和ADS層
DWS層開發
門店會員分類天表:
維度指標:
指標:新增注冊會員數、累計注冊會員數、新增消費會員數、累計消費會員數、新增復購會員數、累計復購會員數、活躍會員數、沉睡會員數、會員消費金額 維度: 時間維度(天、周、月) ? 涉及表: 門店會員分類天表 ? 表字段的組成: 維度字段 + 指標結果字段
建表語句:
CREATE TABLE IF NOT EXISTS dws.dws_mem_store_member_classify_day_i(trade_date ? ? ? ? ? ? ? ? ? STRING COMMENT '統計時間',week_trade_date ? ? ? ? ? ? STRING COMMENT '周一日期',month_trade_date ? ? ? ? ? STRING COMMENT '月一日期',
?store_no ? ? ? ? ? ? ? ? ? STRING COMMENT '店鋪編碼',store_name ? ? ? ? ? ? ? ? STRING COMMENT '店鋪名稱',store_sale_type ? ? ? ? ? ? BIGINT COMMENT '店鋪銷售類型',store_type_code ? ? ? ? ? ? BIGINT COMMENT '分店類型',city_id ? ? ? ? ? ? ? ? ? ? BIGINT COMMENT '城市ID',city_name ? ? ? ? ? ? ? ? ? STRING COMMENT '城市名稱',region_code ? ? ? ? ? ? ? ? STRING COMMENT '區域編碼',region_name ? ? ? ? ? ? ? ? STRING COMMENT '區域名稱',is_day_clear ? ? ? ? ? ? ? ?BIGINT COMMENT '是否日清:0否,1是',
?reg_num_add ? ? ? ? ? ? ? ? BIGINT COMMENT '新增注冊會員數',reg_num_sum ? ? ? ? ? ? ? ? BIGINT COMMENT '累計注冊會員數',consume_num_add ? ? ? ? ? ? BIGINT COMMENT '新增消費會員數',consume_num_sum ? ? ? ? ? ? BIGINT COMMENT '累計消費會員數',repurchase_num_add ? ? ? ? ?BIGINT COMMENT '新增復購會員數',repurchase_num_sum ? ? ? ? ?BIGINT COMMENT '累計復購會員數',active_member_num ? ? ? ? ? BIGINT COMMENT '活躍會員數',sleep_member_num ? ? ? ? ? ?BIGINT COMMENT '沉睡會員數',sale_amount_bind ? ? ? ? ? ?DECIMAL(27, 2) COMMENT '會員消費金額'
)
comment '門店會員分類天表'
partitioned by (dt STRING COMMENT '統計日期')
row format delimited fields terminated by ','
stored as orc
tblproperties ('orc.compress'='SNAPPY');
數據導入:
活躍會員:30天內有消費
沉睡會員:90天內有消費,30天內沒有消費
這個需求的難點在于計算累計值。思路是 使用天進行聚合,得到每天的值,然后使用sum () over 窗口,得到累加值,對于每天的累積情況,這里需要使用拉鏈表的思想,即構造一個生效日期,這里使用lead() over 窗口函數,取到下一個日期,然后再用這個當日時間去卡,即可得到當日對應的累計值。
-- DWS層: 門店會員分類天
-- 注意: 以下內容僅僅以2023-11-14為例,實際需要把14-20日的所有數據都要導入對應表中
with t1 as (selecttrade_date as start_date,store_no,reg_num_add, -- 新增注冊會員數sum(reg_num_add) over(partition by store_no order by trade_date) as reg_num_sum, -- 累計注冊會員數lead(trade_date,1,'9999-99-99') over (partition by store_no order by trade_date) as end_datefrom( -- 先統計每天注冊會員數selecttrade_date,reg_md as store_no,count(1) as reg_num_addfrom dwm.dwm_mem_member_behavior_day_iwhere is_register = 1group bytrade_date, reg_md) temp1
),
t2 as (selecttrade_date as start_date,store_no,consume_num_add, -- 新增消費會員數sum(consume_num_add) over(partition by store_no order by trade_date) as consume_num_sum, -- 累計消費會員數lead(trade_date,1,'9999-99-99') over (partition by store_no order by trade_date) as end_datefrom (selecttrade_date,store_no,count(1) as consume_num_addfrom dwm.dwm_mem_first_buy_igroup by trade_date, store_no) temp2
),
t3 as (selecttrade_date as start_date,store_no,repurchase_num_add, -- 新增充值會員數sum(repurchase_num_add) over(partition by store_no order by trade_date) as repurchase_num_sum, -- 累計充值會員數lead(trade_date,1,'9999-99-99') over (partition by store_no order by trade_date) as end_datefrom (selecttrade_date,store_no,count(1) as repurchase_num_addfrom dwm.dwm_mem_second_buy_igroup by trade_date, store_no) temp2
),
t4 as (-- 活躍會員數(最近30天有消費) 2023-11-14select'2023-11-14' as trade_date,bind_md as store_no,count(distinct zt_id) as active_member_numfrom dwm.dwm_mem_member_behavior_day_iwhere trade_date <= '2023-11-14' and trade_date >= date_sub('2023-11-14',30) and is_consume = 1group by bind_md
),
t5 as (-- 沉睡會員數: 最近90天有消費 , 但是最近30天無消費select'2023-11-14' as trade_date,temp3.bind_md as store_no,count(temp3.zt_id) as sleep_member_numfrom(selectbind_md,zt_idfrom dwm.dwm_mem_member_behavior_day_iwhere trade_date <= '2023-11-14' and trade_date >= date_sub('2023-11-14',90) and is_consume = 1group by bind_md,zt_id) temp3LEFT JOIN(selectbind_md,zt_idfrom dwm.dwm_mem_member_behavior_day_iwhere trade_date <= '2023-11-14' and trade_date >= date_sub('2023-11-14',30) and is_consume = 1group by bind_md,zt_id) temp4 on temp3.bind_md = temp4.bind_md and temp3.zt_id = temp4.zt_idwhere temp4.zt_id is nullgroup by temp3.bind_md
),
t6 as (selecttrade_date,store_no,sum(real_paid_amount) as sale_amount_bindfrom dwm.dwm_mem_sell_order_iwhere trade_date = '2023-11-14'group by trade_date,store_no
),
t7 as (select'2023-11-14' as trade_date,store_no,if(start_date = '2023-11-14',reg_num_add,0) as reg_num_add,reg_num_sum,0 as consume_num_add,0 as consume_num_sum,0 as repurchase_num_add,0 as repurchase_num_sum,0 as active_member_num,0 as sleep_member_num,0 as sale_amount_bindfrom t1where start_date <= '2023-11-14' and end_date >= '2023-11-14'union allselect'2023-11-14' as trade_date,store_no,0 reg_num_add,0 as reg_num_sum,if( start_date = '2023-11-14',consume_num_add,0) as consume_num_add,consume_num_sum,0 as repurchase_num_add,0 as repurchase_num_sum,0 as active_member_num,0 as sleep_member_num,0 as sale_amount_bindfrom t2where start_date <= '2023-11-14' and end_date >= '2023-11-14'union allselect'2023-11-14' as trade_date,store_no,0 reg_num_add,0 as reg_num_sum,0 as consume_num_add,0 as consume_num_sum,if(start_date = '2023-11-14',repurchase_num_add,0) as repurchase_num_add,repurchase_num_sum,0 as active_member_num,0 as sleep_member_num,0 as sale_amount_bindfrom t3where start_date <= '2023-11-14' and end_date >= '2023-11-14'union allselecttrade_date,store_no,0 reg_num_add,0 as reg_num_sum,0 as consume_num_add,0 as consume_num_sum,0 as repurchase_num_add,0 as repurchase_num_sum,active_member_num,0 as sleep_member_num,0 as sale_amount_bindfrom t4union allselecttrade_date,store_no,0 reg_num_add,0 as reg_num_sum,0 as consume_num_add,0 as consume_num_sum,0 as repurchase_num_add,0 as repurchase_num_sum,0 as active_member_num,sleep_member_num,0 as sale_amount_bindfrom t5union allselecttrade_date,store_no,0 reg_num_add,0 as reg_num_sum,0 as consume_num_add,0 as consume_num_sum,0 as repurchase_num_add,0 as repurchase_num_sum,0 as active_member_num,0 as sleep_member_num,sale_amount_bindfrom t6
)
-- insert overwrite table dws.dws_mem_store_member_classify_day_i partition (dt)
selectt7.trade_date,t8.week_trade_date,t8.month_trade_date,t7.store_no,t9.store_name,t9.store_sale_type,t9.store_type_code,t9.city_id,t9.city_name,t9.region_code,t9.region_name,t9.is_day_clear,sum(t7.reg_num_add) as reg_num_add,sum(t7.reg_num_sum) as reg_num_sum,sum(t7.consume_num_add) as consume_num_add,sum(t7.consume_num_sum) as consume_num_sum,sum(t7.repurchase_num_add) as repurchase_num_add,sum(t7.repurchase_num_sum) as repurchase_num_sum,sum(t7.active_member_num) as active_member_num,sum(t7.sleep_member_num) as sleep_member_num,sum(t7.sale_amount_bind) as sale_amount_bind,t7.trade_date as dt
from t7left join dim.dwd_dim_date_f t8 on t7.trade_date = t8.trade_date-- 注意: 一定要檢查自己的dwd_dim_store_i分區目錄,此處填寫自己的分區目錄時間left join dim.dwd_dim_store_i t9 on t7.store_no = t9.store_no and t9.dt ='2023-11-23'
group byt7.trade_date,t8.week_trade_date,t8.month_trade_date,t7.store_no,t9.store_name,t9.store_sale_type,t9.store_type_code,t9.city_id,t9.city_name,t9.region_code,t9.region_name,t9.is_day_clear;
門店會員統計天表:
維度指標:
指標: 門店銷售額、門店總訂單量、當日注冊人數、累計注冊會員數、當日注冊且充值會員數、當日注冊且充值且消費會員數、當日注冊且消費會員數、充值會員數、充值金額、累計會員充值金額、當日有余額的會員人數、當日會員余額、余額消費人數/單量、余額支付金額、余額消費金額、會員消費人數/單量、會員消費金額、會員首單人數/訂單量/銷售額、會員非首單人數/訂單量/銷售額 維度: 時間維度(天、周、月)涉及表:門店會員統計天表表字段的組成: 維度字段 + 指標結果字段
建表語句:
CREATE TABLE IF NOT EXISTS dws.dws_mem_store_member_statistics_day_i(trade_date STRING COMMENT '統計時間',week_trade_date STRING COMMENT '周一日期',month_trade_date STRING COMMENT '月一日期',store_no STRING COMMENT '店鋪編碼',store_name STRING COMMENT '店鋪名稱',store_sale_type BIGINT COMMENT '店鋪銷售類型',store_type_code BIGINT COMMENT '分店類型',city_id BIGINT COMMENT '城市ID',city_name STRING COMMENT '城市名稱',region_code STRING COMMENT '區域編碼',region_name STRING COMMENT '區域名稱',is_day_clear BIGINT COMMENT '是否日清:0否,1是',store_sale_amount DECIMAL(27, 2) COMMENT '門店銷售金額',store_orders_number BIGINT COMMENT '門店總訂單量',register_member_num BIGINT COMMENT '當日注冊人數',register_member_num_all BIGINT COMMENT '累計注冊會員數',register_recharge_num BIGINT COMMENT '當日注冊且充值會員數',rg_rc_td_num BIGINT COMMENT '當日注冊且充值且消費會員數',register_trade_num BIGINT COMMENT '當日注冊且消費會員數',recharge_member_num BIGINT COMMENT '充值會員數',recharge_amount DECIMAL(27, 2) COMMENT '充值金額',recharge_amount_all DECIMAL(27, 2) COMMENT '累計會員充值金額',remain_member_num BIGINT COMMENT '當日有余額的會員人數',remain_member_amount DECIMAL(27, 2) COMMENT '當日會員余額',balance_member_num BIGINT COMMENT '余額消費人數',balance_member_order_num BIGINT COMMENT '余額消費單量',balance_pay_amount DECIMAL(27, 2) COMMENT '余額支付金額',balance_member_amount DECIMAL(27, 2) COMMENT '余額消費金額',member_num BIGINT COMMENT '會員消費人數',member_order_num BIGINT COMMENT '會員消費單量',member_amount DECIMAL(27, 2) COMMENT '會員消費金額',member_first_num BIGINT COMMENT '會員首單人數',member_first_order_num BIGINT COMMENT '會員首單訂單量',member_first_amount DECIMAL(27, 2) COMMENT '會員首單銷售額',member_nofirst_num BIGINT COMMENT '會員非首單人數',member_nofirst_order_num BIGINT COMMENT '會員非首單訂單量',member_nofirst_amount DECIMAL(27, 2) COMMENT '會員非首單銷售額'
)
comment '門店會員統計日表'
partitioned by (dt STRING COMMENT '統計日期')
row format delimited fields terminated by ','
stored as orc
tblproperties ('orc.compress'='SNAPPY');
數據導入:
門店的消費情況可以從dwm_sell_o2o_order_i表中出,
注冊、充值、消費這些數據可以從dwm_mem_member_behavior_day_i中出,
余額數據可以從dwd_mem_balance_online_i中出。
需要注意的是,這里有新增的指標還有累計的指標,為了方便計算,可以分開求解。
新增指標可以大部分從dwm_mem_member_behavior_day_i中出,因為 dwm_mem_member_behavior_day_i是會員粒度的表,記錄了會員的各種行為。在計算會員指標的時候,很多需要count()來計算的指標,可以轉化成sum(1),根據條件進行判斷即可。
-- DWS 門店會員統計寬表
-- 注意: 以下內容僅僅以2023-11-14為例,實際需要把14-20日的所有數據都要導入對應表中
with t1 as (selecttrade_date,store_no,sum(real_paid_amount) as store_sale_amount,count(if(trade_type = 0,parent_order_no,NULL)) - count(if(trade_type = 5,parent_order_no,NULL)) as store_orders_number,0 as register_member_num,0 as register_member_num_all,0 as register_recharge_num,0 as rg_rc_td_num,0 as register_trade_num,0 as recharge_member_num,0 as recharge_amount,0 as recharge_amount_all,0 as remain_member_num,0 as remain_member_amount,0 as balance_member_num,0 as balance_member_order_num,0 as balance_pay_amount,0 as balance_member_amount,0 as member_num,0 as member_order_num,0 as member_amount,0 as member_first_num,0 as member_first_order_num,0 as member_first_amount,0 as member_nofirst_num,0 as member_nofirst_order_num,0 as member_nofirst_amountfrom dwm.dwm_sell_o2o_order_i where dt = '2023-11-14'group by trade_date,store_nounion allselecttrade_date,bind_md as store_no,0 as store_sale_amount, -- 門店銷售額0 as store_orders_number, -- 門店總訂單量sum(is_register) as register_member_num, -- 當日注冊人數0 as register_member_num_all, -- 累計注冊人數sum(if(is_register = 1 and is_recharge = 1, 1,0)) as register_recharge_num, -- 當日注冊且充值會員數sum(if(is_register = 1 and is_recharge = 1 and is_consume = 1, 1,0)) as rg_rc_td_num, -- 當日注冊 且充值且消費會員數sum(if(is_register = 1 and is_consume = 1, 1,0)) as register_trade_num, -- 當日注冊且消費會員數sum(is_recharge) as recharge_member_num, -- 充值會員數sum(if( is_recharge = 1,recharge_amount,0) ) as recharge_amount, -- 充值金額0 as recharge_amount_all, -- 累計會員充值金額0 as remain_member_num, -- 當日有余額的會員人數0 as remain_member_amount, -- 當日會員余額sum(is_balance_consume) as balance_member_num, --余額消費人數sum(if(is_balance_consume = 1, balance_consume_times, 0)) as balance_member_order_num, --余額消費單量sum(if(is_balance_consume = 1, balance_pay_amount, 0)) as balance_pay_amount, -- 余額支付金額sum(if(is_balance_consume = 1, balance_consume_amount, 0)) as balance_member_amount, -- 余額消費金額sum(is_consume) as member_num, -- 會員消費人數sum(if(is_consume = 1, consume_times, 0)) as member_order_num, -- 會員消費單量sum(if(is_consume = 1, consume_amount, 0)) as member_amount, -- 會員消費金額sum(is_first_consume) as member_first_num, -- 會員首單人數sum(is_first_consume) as member_first_order_num, -- 會員首單訂單量sum(if(is_first_consume = 1, first_consume_amount,0)) as member_first_amount, -- 會員首單銷售額sum(is_consume) - sum(is_first_consume) as member_nofirst_num, -- 會員非首單人數sum(if(is_consume = 1, consume_times, 0)) - sum(is_first_consume) as member_nofirst_order_num, -- 會員非首單訂單量sum(if(is_consume = 1, consume_amount, 0)) - sum(if(is_first_consume = 1, first_consume_amount,0)) as member_nofirst_amount -- 會員非首單銷售額from dwm.dwm_mem_member_behavior_day_i where dt = '2023-11-14'group by trade_date,bind_mdunion allselecttrade_date,store_no,0 as store_sale_amount,0 as store_orders_number,0 as register_member_num,0 as register_member_num_all,0 as register_recharge_num,0 as rg_rc_td_num,0 as register_trade_num,0 as recharge_member_num,0 as recharge_amount,0 as recharge_amount_all,count(1) as remain_member_num,sum(balance_amount) as remain_member_amount,0 as balance_member_num,0 as balance_member_order_num,0 as balance_pay_amount,0 as balance_member_amount,0 as member_num,0 as member_order_num,0 as member_amount,0 as member_first_num,0 as member_first_order_num,0 as member_first_amount,0 as member_nofirst_num,0 as member_nofirst_order_num,0 as member_nofirst_amountfrom dwd.dwd_mem_balance_online_i where dt = '2023-11-14'group by trade_date,store_nounion allselectstart_date as trade_date,store_no,0 as store_sale_amount,0 as store_orders_number,0 as register_member_num,register_member_num_all,0 as register_recharge_num,0 as rg_rc_td_num,0 as register_trade_num,0 as recharge_member_num,0 as recharge_amount,recharge_amount_all,0 as remain_member_num,0 as remain_member_amount,0 as balance_member_num,0 as balance_member_order_num,0 as balance_pay_amount,0 as balance_member_amount,0 as member_num,0 as member_order_num,0 as member_amount,0 as member_first_num,0 as member_first_order_num,0 as member_first_amount,0 as member_nofirst_num,0 as member_nofirst_order_num,0 as member_nofirst_amountfrom(selecttrade_date as start_date,store_no,sum(reg_num_add) over(partition by store_no order by trade_date) as register_member_num_all, -- 累計注冊會員數sum(recharge_amount) over(partition by store_no order by trade_date) as recharge_amount_all, -- 累計充值金額lead(trade_date,1,'9999-99-99') over (partition by store_no order by trade_date) as end_datefrom( -- 先統計每天注冊會員數selecttrade_date,bind_md as store_no,sum(is_register) as reg_num_add,sum(if(is_recharge = 1,recharge_amount,0)) as recharge_amountfrom dwm.dwm_mem_member_behavior_day_igroup bytrade_date, bind_md) temp1) twhere start_date <= '2023-11-14' and end_date >= '2023-11-14'
)
insert overwrite table dws.dws_mem_store_member_statistics_day_i partition(dt)
selectt1.trade_date,t2.week_trade_date,t2.month_trade_date,t1.store_no,t3.store_name,t3.store_sale_type,t3.store_type_code,t3.city_id,t3.city_name,t3.region_code,t3.region_name,t3.is_day_clear,sum(t1.store_sale_amount) as store_sale_amount,sum(t1.store_orders_number) as store_orders_number,sum(t1.register_member_num) as register_member_num,sum(t1.register_member_num_all) as register_member_num_all,sum(t1.register_recharge_num) as register_recharge_num,sum(t1.rg_rc_td_num) as rg_rc_td_num,sum(t1.register_trade_num) as register_trade_num,sum(t1.recharge_member_num) as recharge_member_num,sum(t1.recharge_amount) as recharge_amount,sum(t1.recharge_amount_all) as recharge_amount_all,sum(t1.remain_member_num) as remain_member_num,sum(t1.remain_member_amount) as remain_member_amount,sum(t1.balance_member_num) as balance_member_num,sum(t1.balance_member_order_num) as balance_member_order_num,sum(t1.balance_pay_amount) as balance_pay_amount,sum(t1.balance_member_amount) as balance_member_amount,sum(t1.member_num) as member_num,sum(t1.member_order_num) as member_order_num,sum(t1.member_amount) as member_amount,sum(t1.member_first_num) as member_first_num,sum(t1.member_first_order_num) as member_first_order_num,sum(t1.member_first_amount) as member_first_amount,sum(t1.member_nofirst_num) as member_nofirst_num,sum(t1.member_nofirst_order_num) as member_nofirst_order_num,sum(t1.member_nofirst_amount) as member_nofirst_amount,t1.trade_date as dt
from t1left join dim.dwd_dim_date_f t2 on t1.trade_date = t2.trade_date-- 注意: 一定要檢查自己的dwd_dim_store_i分區目錄,此處填寫自己的分區目錄時間left join dim.dwd_dim_store_i t3 on t1.store_no = t3.store_no and t3.dt ='2023-11-23'
group byt1.trade_date,t2.week_trade_date,t2.month_trade_date,t1.store_no,t3.store_name,t3.store_sale_type,t3.store_type_code,t3.city_id,t3.city_name,t3.region_code,t3.region_name,t3.is_day_clear;
ADS層開發
回顧dayofweek函數
-- dayofweek
-- 注意: dayofweek是老外從周日算,所以返回的結果和咱們中國人思路差1天
select dayofweek('2023-12-7');
-- 需求: 獲取到2023-12-7所在的周中的周一日期
select date_sub('2023-12-7',if(dayofweek('2023-12-7')=1,6,dayofweek('2023-12-7')-2));-- 需求: 獲取到2023-12-7所在的周中的周日日期
select date_sub('2023-12-8',if(dayofweek('2023-12-8')=1,0,dayofweek('2023-12-8')-8));-- day0fmonth
select dayofmonth('2023-12-07');
-- 需求: 獲取2023-12-7所在月的第一天的日期
select date_sub('2023-12-07',dayofmonth('2023-12-07')-1);
-- 需求: 獲取2023-12-7所在月的最后一天的日期
select last_day('2023-12-07');
各類會員數量統計分析
維度指標:
指標:新增注冊會員數、累計注冊會員數、新增消費會員數、累計消費會員數、新增復購會員數、累計復購會員數、活躍會員數、沉睡會員數、會員消費金額 維度: 時間維度(天、周、月)涉及ADS表:門店會員分類月表 和 門店會員分類周表表字段的組成: 維度字段 + 指標結果字段
門店會員分類周表
建表語句:
CREATE TABLE IF NOT EXISTS ads.ads_mem_store_member_classify_week_i(trade_date STRING COMMENT '周一日期',store_no STRING COMMENT '店鋪編碼',store_name STRING COMMENT '店鋪名稱',store_sale_type BIGINT COMMENT '店鋪銷售類型',store_type_code BIGINT COMMENT '分店類型',city_id BIGINT COMMENT '城市ID',city_name STRING COMMENT '城市名稱',region_code STRING COMMENT '區域編碼',region_name STRING COMMENT '區域名稱',is_day_clear BIGINT COMMENT '是否日清:0否,1是',reg_num_add BIGINT COMMENT '新增注冊會員數',reg_num_sum BIGINT COMMENT '累計注冊會員數',consume_num_add BIGINT COMMENT '新增消費會員數',consume_num_sum BIGINT COMMENT '累計消費會員數',repurchase_num_add BIGINT COMMENT '新增復購會員數',repurchase_num_sum BIGINT COMMENT '累計復購會員數',active_member_num BIGINT COMMENT '活躍會員數',sleep_member_num BIGINT COMMENT '沉睡會員數',sale_amount_bind DECIMAL(27, 2) COMMENT '會員消費金額'
)
comment '門店會員分類周表'
partitioned by (dt STRING COMMENT '統計日期')
row format delimited fields terminated by ','
stored as orc
tblproperties ('orc.compress'='SNAPPY');
數據導入:
指標分為累計值和新增值,累計值可以取當周最后一天的數值、新增值可以進行聚合得到。 ? 需要注意的是,這里計算的是一張周表,所以當考慮到數據的場景時,需要取到當周所有的數據進行聚合,以及取到當周最后一天進行取累加值。
思考: 當計算某一天對應這一周的指標, 如果獲取這一周相關的數據呢?
where t.dt in (select max(dt) from dws.dws_mem_store_member_classify_day_iwhere dt>=date_sub('${inputdate}', if (dayofweek('${inputdate}') = 1, 6, dayofweek('${inputdate}') - 2)) and dt<=date_sub('${inputdate}', if (dayofweek('${inputdate}') = 1, 0, dayofweek('${inputdate}') - 8))
)
代碼實現:
with t1 as (
-- 計算非累加值
selectweek_trade_date,store_no,sum(reg_num_add) as reg_num_add,sum(consume_num_add) as consume_num_add,sum(repurchase_num_add) as repurchase_num_add,sum(sale_amount_bind) as sale_amount_bind
from dws.dws_mem_store_member_classify_day_i
where dt >= date_sub('2023-11-14',if(dayofweek('2023-11-14') = 1,6, dayofweek('2023-11-14')-2 ))and dt <= date_add('2023-11-14',if(dayofweek('2023-11-14') = 1,0,-dayofweek('2023-11-14')+8))
group by week_trade_date,store_no
),
t2 as (
-- 計算 累計值
-- 如果獲取這一周的最后一天呢?selectweek_trade_date as trade_date,store_no,store_name,store_sale_type,store_type_code,city_id,city_name,region_code,region_name,is_day_clear,reg_num_sum,consume_num_sum,repurchase_num_sum,active_member_num,sleep_member_numfrom dws.dws_mem_store_member_classify_day_i where dt in (selectmax(dt) as c1from dws.dws_mem_store_member_classify_day_i as twhere dt >= date_sub('2023-11-14',if(dayofweek('2023-11-14') = 1,6, dayofweek('2023-11-14')-2 ))and dt <= date_add('2023-11-14',if(dayofweek('2023-11-14') = 1,0,-dayofweek('2023-11-14')+8)))
)
insert overwrite table ads.ads_mem_store_member_classify_week_i partition (dt)
selectt2.trade_date,t2.store_no,t2.store_name,t2.store_sale_type,t2.store_type_code,t2.city_id,t2.city_name,t2.region_code,t2.region_name,t2.is_day_clear,t1.reg_num_add,t2.reg_num_sum,t1.consume_num_add,t2.consume_num_sum,t1.repurchase_num_add,t2.repurchase_num_sum,t2.active_member_num,t2.sleep_member_num,t1.sale_amount_bind,t2.trade_date as dt
from t2 left join t1 on t2.trade_date = t1.week_trade_date and t2.store_no = t1.store_no;
門店會員分類月表
建表語句:
CREATE TABLE IF NOT EXISTS ads.ads_mem_store_member_classify_month_i(trade_date STRING COMMENT '月一日期',store_no STRING COMMENT '店鋪編碼',store_name STRING COMMENT '店鋪名稱',store_sale_type BIGINT COMMENT '店鋪銷售類型',store_type_code BIGINT COMMENT '分店類型',city_id BIGINT COMMENT '城市ID',city_name STRING COMMENT '城市名稱',region_code STRING COMMENT '區域編碼',region_name STRING COMMENT '區域名稱',is_day_clear BIGINT COMMENT '是否日清:0否,1是',reg_num_add BIGINT COMMENT '新增注冊會員數',reg_num_sum BIGINT COMMENT '累計注冊會員數',consume_num_add BIGINT COMMENT '新增消費會員數',consume_num_sum BIGINT COMMENT '累計消費會員數',repurchase_num_add BIGINT COMMENT '新增復購會員數',repurchase_num_sum BIGINT COMMENT '累計復購會員數',active_member_num BIGINT COMMENT '活躍會員數',sleep_member_num BIGINT COMMENT '沉睡會員數',sale_amount_bind DECIMAL(27, 2) COMMENT '會員消費金額'
)
comment '門店會員分類月表'
partitioned by (dt STRING COMMENT '統計日期')
row format delimited fields terminated by ','
stored as orc
tblproperties ('orc.compress'='SNAPPY');
數據導入:
處理思路: 同周表ads_mem_store_member_classify_week_i,改變下范圍即可
思考: 如果獲取一個月范圍的數據呢?
select date_sub('2023-09-30',dayofmonth('2023-09-30')-1), last_day('2023-09-30')
門店會員分析
維度指標:
指標: 門店銷售額、門店總訂單量、當日注冊人數、累計注冊會員數、當日注冊且充值會員數、當日注冊且充值且消費會員數、當日注冊且消費會員數、充值會員數、充值金額、累計會員充值金額、當日有余額的會員人數、當日會員余額、余額消費人數/單量、余額支付金額、余額消費金額、會員消費人數/單量、會員消費金額、會員首單人數/訂單量/銷售額、會員非首單人數/訂單量/銷售額 維度: 時間維度(天、周、月)涉及表: 門店會員統計周表 和 門店會員統計月表涉及表字段: 維度字段 + 指標結果字段
門店會員統計周表
建表語句:
CREATE TABLE IF NOT EXISTS ads.ads_mem_store_member_statistics_week_i(trade_date STRING COMMENT '周一日期',store_no STRING COMMENT '店鋪編碼',store_name STRING COMMENT '店鋪名稱',store_sale_type BIGINT COMMENT '店鋪銷售類型',store_type_code BIGINT COMMENT '分店類型',city_id BIGINT COMMENT '城市ID',city_name STRING COMMENT '城市名稱',region_code STRING COMMENT '區域編碼',region_name STRING COMMENT '區域名稱',is_day_clear BIGINT COMMENT '是否日清:0否,1是',store_sale_amount DECIMAL(27, 2) COMMENT '門店銷售金額',store_orders_number BIGINT COMMENT '門店總訂單量',register_member_num BIGINT COMMENT '當日注冊人數',register_member_num_all BIGINT COMMENT '累計注冊會員數',register_recharge_num BIGINT COMMENT '當日注冊且充值會員數',rg_rc_td_num BIGINT COMMENT '當日注冊且充值且消費會員數',register_trade_num BIGINT COMMENT '當日注冊且消費會員數',recharge_member_num BIGINT COMMENT '充值會員數',recharge_amount DECIMAL(27, 2) COMMENT '充值金額',recharge_amount_all DECIMAL(27, 2) COMMENT '累計會員充值金額',remain_member_num BIGINT COMMENT '當周最后一天有余額的會員人數',remain_member_amount DECIMAL(27, 2) COMMENT '當周最后一天會員余額',balance_member_num BIGINT COMMENT '余額消費人數',balance_member_order_num BIGINT COMMENT '余額消費單量',balance_pay_amount DECIMAL(27, 2) COMMENT '余額支付金額',balance_member_amount DECIMAL(27, 2) COMMENT '余額消費金額',member_num BIGINT COMMENT '會員消費人數',member_order_num BIGINT COMMENT '會員消費單量',member_amount DECIMAL(27, 2) COMMENT '會員消費金額',member_first_num BIGINT COMMENT '會員首單人數',member_first_order_num BIGINT COMMENT '會員首單訂單量',member_first_amount DECIMAL(27, 2) COMMENT '會員首單銷售額',member_nofirst_num BIGINT COMMENT '會員非首單人數',member_nofirst_order_num BIGINT COMMENT '會員非首單訂單量',member_nofirst_amount DECIMAL(27, 2) COMMENT '會員非首單銷售額'
)
comment '門店會員統計周表'
partitioned by (dt STRING COMMENT '統計日期')
row format delimited fields terminated by ','
stored as orc
tblproperties ('orc.compress'='SNAPPY');
數據導入:
指標分為三種情況:一種是狀態值,比如說累計指標,register_member_num_all等,還有狀態指標,remain_member_num等;另一種情況是可累加的指標,比如金額和單量等;還有一種情況是不可累積指標,比如人數。狀態值可以從最新的天表中dws_mem_store_member_statistics_day_i獲取。 然后以這張表作為主表,關聯其他表。 可累加的指標直接從dws_mem_store_member_statistics_day_i中進行聚合得到。 不可累加的指標從dwm_mem_member_behavior_day_i中進行計算得到。
代碼實現:
-- ads 門店會員統計周表
with t1 as (
-- 第一部分: 基于DWS層門店會員統計天表 獲取指定天的對應這一周的數據, 對這一周進行聚合統計
selectweek_trade_date as trade_date,store_no,sum(store_sale_amount) as store_sale_amount,sum(store_orders_number) as store_orders_number,sum(register_member_num) as register_member_num,sum(register_recharge_num) as register_recharge_num,sum(rg_rc_td_num) as rg_rc_td_num,sum(register_trade_num) as register_trade_num,sum(recharge_amount) as recharge_amount,sum(balance_member_order_num) as balance_member_order_num,sum(balance_pay_amount) as balance_pay_amount,sum(balance_member_amount) as balance_member_amount,sum(member_order_num) as member_order_num,sum(member_amount) as member_amount,sum(member_first_num) as member_first_num,sum(member_first_order_num) as member_first_order_num,sum(member_first_amount) as member_first_amount,sum(member_nofirst_order_num) as member_nofirst_order_num,sum(member_nofirst_amount) as member_nofirst_amountfrom dws.dws_mem_store_member_statistics_day_i
where dt >= date_sub('2023-11-14',if(dayofweek('2023-11-14') = 1,6, dayofweek('2023-11-14')-2 ))and dt <= date_add('2023-11-14',if(dayofweek('2023-11-14') = 1,0,-dayofweek('2023-11-14')+8))
group by week_trade_date,store_no
),t2 as (selectweek_trade_date as trade_date,store_no,store_name,store_sale_type,store_type_code,city_id,city_name,region_code,region_name,is_day_clear,register_member_num_all,recharge_amount_all,remain_member_num,remain_member_amountfrom dws.dws_mem_store_member_statistics_day_i where dt in (selectmax(dt)from dws.dws_mem_store_member_statistics_day_i twhere dt >= date_sub('2023-11-14',if(dayofweek('2023-11-14') = 1,6, dayofweek('2023-11-14')-2 ))and dt <= date_add('2023-11-14',if(dayofweek('2023-11-14') = 1,0,-dayofweek('2023-11-14')+8)))
),t3 as (selectweek_trade_date as trade_date,bind_md as store_no,count( DISTINCT if(is_recharge = 1,zt_id,NULL) ) AS recharge_member_num,count( DISTINCT if(is_balance_consume = 1,zt_id,NULL) ) AS balance_member_num,count( DISTINCT if(is_consume = 1,zt_id,NULL) ) AS member_num,count( DISTINCT if(is_first_consume = 0 and consume_times > 0,zt_id,NULL) ) AS member_nofirst_numfrom dwm.dwm_mem_member_behavior_day_iwhere dt >= date_sub('2023-11-14',if(dayofweek('2023-11-14') = 1,6, dayofweek('2023-11-14')-2 ))and dt <= date_add('2023-11-14',if(dayofweek('2023-11-14') = 1,0,-dayofweek('2023-11-14')+8))group by week_trade_date,bind_md
)insert overwrite table ads.ads_mem_store_member_statistics_week_i partition (dt)
selectt2.trade_date,t2.store_no,t2.store_name,t2.store_sale_type,t2.store_type_code,t2.city_id,t2.city_name,t2.region_code,t2.region_name,t2.is_day_clear,t1.store_sale_amount,t1.store_orders_number,t1.register_member_num,t2.register_member_num_all,t1.register_recharge_num,t1.rg_rc_td_num,t1.register_trade_num,t3.recharge_member_num,t1.recharge_amount,t2.recharge_amount_all,t2.remain_member_num,t2.remain_member_amount,t3.balance_member_num,t1.balance_member_order_num,t1.balance_pay_amount,t1.balance_member_amount,t3.member_num,t1.member_order_num,t1.member_amount,t1.member_first_num,t1.member_first_order_num,t1.member_first_amount,t3.member_nofirst_num,t1.member_nofirst_order_num,t1.member_nofirst_amount,t2.trade_date as dt
from t2 left join t1 on t2.trade_date = t1.trade_date and t2.store_no = t1.store_noleft join t3 on t2.trade_date = t3.trade_date and t2.store_no = t3.store_no;
門店會員統計月表
建表語句:
CREATE TABLE IF NOT EXISTS ads.ads_mem_store_member_statistics_month_i(trade_date STRING COMMENT '月一日期',store_no STRING COMMENT '店鋪編碼',store_name STRING COMMENT '店鋪名稱',store_sale_type BIGINT COMMENT '店鋪銷售類型',store_type_code BIGINT COMMENT '分店類型',city_id BIGINT COMMENT '城市ID',city_name STRING COMMENT '城市名稱',region_code STRING COMMENT '區域編碼',region_name STRING COMMENT '區域名稱',is_day_clear BIGINT COMMENT '是否日清:0否,1是',store_sale_amount DECIMAL(27, 2) COMMENT '門店銷售金額',store_orders_number BIGINT COMMENT '門店總訂單量',register_member_num BIGINT COMMENT '當日注冊人數',register_member_num_all BIGINT COMMENT '累計注冊會員數',register_recharge_num BIGINT COMMENT '當日注冊且充值會員數',rg_rc_td_num BIGINT COMMENT '當日注冊且充值且消費會員數',register_trade_num BIGINT COMMENT '當日注冊且消費會員數',recharge_member_num BIGINT COMMENT '充值會員數',recharge_amount DECIMAL(27, 2) COMMENT '充值金額',recharge_amount_all DECIMAL(27, 2) COMMENT '累計會員充值金額',remain_member_num BIGINT COMMENT '當月最后一天有余額的會員人數',remain_member_amount DECIMAL(27, 2) COMMENT '當月最后一天會員余額',balance_member_num BIGINT COMMENT '余額消費人數',balance_member_order_num BIGINT COMMENT '余額消費單量',balance_pay_amount DECIMAL(27, 2) COMMENT '余額支付金額',balance_member_amount DECIMAL(27, 2) COMMENT '余額消費金額',member_num BIGINT COMMENT '會員消費人數',member_order_num BIGINT COMMENT '會員消費單量',member_amount DECIMAL(27, 2) COMMENT '會員消費金額',member_first_num BIGINT COMMENT '會員首單人數',member_first_order_num BIGINT COMMENT '會員首單訂單量',member_first_amount DECIMAL(27, 2) COMMENT '會員首單銷售額',member_nofirst_num BIGINT COMMENT '會員非首單人數(非去重)',member_nofirst_order_num BIGINT COMMENT '會員非首單訂單量',member_nofirst_amount DECIMAL(27, 2) COMMENT '會員非首單銷售額'
)
comment '門店會員統計月表'
partitioned by (dt STRING COMMENT '統計日期')
row format delimited fields terminated by ','
stored as orc
tblproperties ('orc.compress'='SNAPPY');
數據導入:
同 ads_mem_store_member_statistics_week_i,改變下范圍即可
ADS層其他需求(基于Presto實現)
Presto--分布式SQL查詢引擎
Presto-簡介
-
背景
大數據分析類軟件發展歷程。
-
==Apache Hadoop-MapReduce==
-
優點:統一、通用、簡單的編程模型,分而治之思想處理海量數據。
-
缺點:java學習成本高、MR執行慢、內部過程繁瑣
-
-
==Apache Hive==
-
優點:SQL on Hadoop。sql語言上手方便。學習成本低。
-
缺點:底層默認還是MapReduce引擎、慢、延遲高
-
-
各種SQL類計算引擎開始出現,主要追求的就是一個問題:==計算如何更快,延遲如何降低==。
-
==Presto/trino==
-
Spark On Hive、Spark SQL
-
Flink
-
.......
-
FaceBook維護的原始版本: presto, 也叫prestoDBPresto創始人團隊離職后研發并維護的: PrestoSQL 因為版權更名為Trino已經給大家整理好了對應網址如下:FaceBook維護的, Presto的官網: https://prestodb.io/ 創始人團隊維護的, Trino的官網: https://trino.io/ Presto創始人團隊維護的, 因為版權更名為Trino: http://github.com/trinodb/trino 相關文章如下:Presto在有贊的實踐之路: https://cloud.tencent.com/developer/news/606849 Presto更名為-Trino: https://www.sohu.com/a/441836081_106784
-
-
介紹
Presto是一個開源的==分布式SQL查詢引擎==,適用于==交互式查詢==,數據量支持GB到PB字節。
Presto的設計和編寫完全是為了解決==Facebook==這樣規模的商業數據倉庫交互式分析和處理速度的問題。
presto簡介: 一條Presto查詢可以將多個數據源進行合并,可以跨越整個組織進行分析;presto特點: Presto以分析師的需求作為目標,他們期望響應速度小于1秒到幾分鐘;
-
優缺點
# 優點 1)Presto與Hive對比,都能夠處理PB級別的海量數據分析,但Presto是基于內存運算,減少沒必要的硬盤IO,所以更快。2)能夠連接多個數據源,跨數據源連表查,如從Hive查詢大量網站訪問記錄,然后從Mysql中匹配出設備信息。3)部署也比Hive簡單,因為Hive是基于HDFS的,需要先部署HDFS。# 缺點 1)雖然能夠處理PB級別的海量數據分析,但不是代表Presto把PB級別都放在內存中計算的。而是根據場景,如count,avg等聚合運算,是邊讀數據邊計算,再清內存,再讀數據再計算,這種耗的內存并不高。但是連表查,就可能產生大量的臨時數據,因此速度會變慢,反而Hive此時會更擅長。2)為了達到實時查詢,可能會想到用它直連MySql來操作查詢,這效率并不會提升,瓶頸依然在MySql,此時還引入網絡瓶頸,所以會比原本直接操作數據庫要慢。
Presto-架構、相關術語
-
架構圖
Presto是一個運行在多臺服務器上的分布式系統。 完整安裝包括==一個coordinator和多個worker==。 由客戶端提交查詢,從Presto命令行CLI提交到coordinator; coordinator進行解析,分析并執行查詢計劃,然后分發處理隊列到worker。
整個presto是一個 M-S架構 (主從架構):coordinator: 主節點 作用: 負責接收客戶端發送的SQL, 對SQL進行編譯, 形成執行計劃, 根據執行計劃, 分發給各個從節點進行執行操作 discovery service: 附屬節點作用: 一般內嵌在主節點中, 主要負責維護從節點列表, 當從節點啟動后, 都需要到 discovery 節點進行注冊操作 worker節點: 從節點作用: 負責接收coordinator傳遞過來任務, 對任務進行具體處理工作(讀取數據, 處理數據, 將處理后結果數據返回給coordinator)
-
==Connector== 連接器
1、Presto通過Connector連接器來連接訪問不同數據源,例如Hive或mysql。連接器功能類似于數據庫的驅動程序。允許Presto使用標準API與資源進行交互。2、Presto包含幾個內置連接器:JMX連接器,可訪問內置系統表的System連接器,Hive連接器和旨在提供TPC-H基準數據的TPCH連接器。許多第三方開發人員都貢獻了連接器,因此Presto可以訪問各種數據源中的數據,比如:ES、Kafka、MongoDB、Redis、Postgre、Druid、Cassandra等。
-
==Catalog== 連接目錄: hive或者mysql等數據源
1、Presto Catalog是數據源schema的上一級,并通過連接器訪問數據源。2、例如,可以配置Hive Catalog以通過Hive Connector連接器提供對Hive信息的訪問。3、在Presto中使用表時,標準表名始終是被支持的。 例如,hive.test_data.test的標準表名將引用hive catalog中test_data schema中的test table。 Catalog需要在Presto的配置文件中進行配置。
-
==schema== 庫
Schema是組織表的一種方式。Catalog和Schema共同定義了一組可以查詢的表。當使用Presto訪問Hive或關系數據庫(例如MySQL)時,Schema會轉換為目標數據庫中的對應Schema(database)。= schema通俗理解就是我們所講的database. = 想一下在hive中,下面這兩個sql是否相等。 show databases; -- presto不支持 show schemas;
-
==table== 表
...
Presto-集群啟停
[root@hadoop01 ~]# /export/server/presto/bin/launcher start Started as 89560# 可以使用jps 配合kill -9命令 關閉進程
-
web UI頁面
鏈接: http://192.168.88.80:8090/ui/
Presto-Datagrip連接使用
JDBC 驅動:==presto-jdbc-0.245.1.jar==
JDBC 地址:==jdbc:presto://192.168.88.80:8090/hive==
-
step1:創建連接
由于驅動比較大,好多人經常下載失敗,可以按照下圖關聯資料中提供的包: presto-jdbc-0.245.1.jar
Presto--時間日期類型注意事項
-
==date_format==(timestamp, format) ==> varchar
-
作用: 將指定的日期對象轉換為字符串操作
-
-
==date_parse==(string, format) → timestamp
-
作用: 用于將字符串的日期數據轉換為日期對象
select date_format( timestamp '2020-10-10 12:50:50' , '%Y/%m/%d %H:%i:%s'); select date_format( date_parse('2020:10:10 12-50-50','%Y:%m:%d %H-%i-%s') ,'%Y/%m/%d %H:%i:%s');---- 注意: 參數一必須是日期對象所以如果傳遞的是字符串, 必須將先轉換為日期對象: 方式一: 標識為日期對象, 但是格式必須為標準日期格式timestamp '2020-10-10 12:50:50'date '2020-10-10'方式二: 如果不標準,先用date_parse解析成為標準date_parse('2020-10-10 12:50:50','%Y-%m-%d %H:%i:%s') 擴展說明: 日期format格式說明年:%Y月:%m日:%d時:%H分:%i 秒:%s周幾:%w(0..6)
-
-
==date_add==(unit, value, timestamp) → [same as input]
-
作用: 用于對日期數據進行 加 減 操作
-
-
==date_diff==(unit, timestamp1, timestamp2) → bigint
-
作用: 用于比對兩個日期之間差值
select date_add('hour',3,timestamp '2021-09-02 15:59:50'); select date_add('day',-1,timestamp '2021-09-02 15:59:50'); select date_add('month',-1,timestamp '2021-09-02 15:59:50');select date_diff('year',timestamp '2020-09-02 06:30:30',timestamp '2021-09-02 15:59:50') select date_diff('month',timestamp '2021-06-02 06:30:30',timestamp '2021-09-02 15:59:50') select date_diff('day',timestamp '2021-08-02 06:30:30',timestamp '2021-09-02 15:59:50')
-
Presto-常規優化
-
數據存儲優化
--1)合理設置分區與Hive類似,Presto會根據元信息讀取分區數據,合理的分區能減少Presto數據讀取量,提升查詢性能。--2)使用列式存儲Presto對ORC文件讀取做了特定優化,因此在Hive中創建Presto使用的表時,建議采用ORC格式存儲。相對于Parquet,Presto對ORC支持更好。Parquet和ORC一樣都支持列式存儲,但是Presto對ORC支持更好,而Impala對Parquet支持更好。在數倉設計時,要根據后續可能的查詢引擎合理設置數據存儲格式。--3)使用壓縮數據壓縮可以減少節點間數據傳輸對IO帶寬壓力,對于需要快速解壓的,建議采用Snappy壓縮。--4)預先排序對于已經排序的數據,在查詢的數據過濾階段,ORC格式支持跳過讀取不必要的數據。比如對于經常需要過濾的字段可以預先排序。
-
SQL優化
-
列裁剪
-
分區裁剪
-
group by優化
-
按照數據量大小降序排列
-
-
order by使用limit
-
==join時候大表放置在左邊==
...
-
-
替換非ORC格式的Hive表
Presto-內存調優
-
內存管理機制--內存分類
Presto管理的內存分為兩大類:==user memory==和==system memory==
-
user memory用戶內存
跟用戶數據相關的,比如讀取用戶輸入數據會占據相應的內存,這種內存的占用量跟用戶底層數據量大小是強相關的
-
system memory系統內存
執行過程中衍生出的副產品,比如tablescan表掃描,write buffers寫入緩沖區,跟查詢輸入的數據本身不強相關的內存。
-
-
內存管理機制--內存池
==內存池中來實現分配user memory和system memory==。
內存池為常規內存池GENERAL_POOL、預留內存池RESERVED_POOL。
1、GENERAL_POOL:在一般情況下,一個查詢執行所需要的user/system內存都是從general pool中分配的,reserved pool在一般情況下是空閑不用的。2、RESERVED_POOL:大部分時間里是不參與計算的,但是當集群中某個Worker節點的general pool消耗殆盡之后,coordinator會選擇集群中內存占用最多的查詢,把這個查詢分配到reserved pool,這樣這個大查詢自己可以繼續執行,而騰出來的內存也使得其它的查詢可以繼續執行,從而避免整個系統阻塞。注意: reserved pool到底多大呢?這個是沒有直接的配置可以設置的,他的大小上限就是集群允許的最大的查詢的大小(query.total-max-memory-per-node)。reserved pool也有缺點,一個是在普通模式下這塊內存會被浪費掉了,二是大查詢可以用Hive來替代。因此也可以禁用掉reserved pool(experimental.reserved-pool-enabled設置為false),那系統內存耗盡的時候沒有reserved pool怎么辦呢?它有一個OOM Killer的機制,對于超出內存限制的大查詢SQL將會被系統Kill掉,從而避免影響整個presto。
-
內存相關參數
1、user memory用戶內存參數 query.max-memory-per-node:單個query操作在單個worker上user memory能用的最大值 query.max-memory:單個query在整個集群中允許占用的最大user memory2、user+system總內存參數 query.max-total-memory-per-node:單個query操作可在單個worker上使用的最大(user + system)內存 query.max-total-memory:單個query在整個集群中允許占用的最大(user + system) memory當這些閾值被突破的時候,query會以insufficient memory(內存不足)的錯誤被終結。3、協助阻止機制 在高內存壓力下保持系統穩定。當general pool常規內存池已滿時,操作會被置為blocked阻塞狀態,直到通用池中的內存可用為止。此機制可防止激進的查詢填滿JVM堆并引起可靠性問題。4、其他參數 memory.heap-headroom-per-node:這個內存是JVM堆中預留給第三方庫的內存分配,presto無法跟蹤統計,默認值是-Xmx * 0.35、結論 GeneralPool = 服務器總內存 - ReservedPool - memory.heap-headroom-per-node - Linux系統內存常規內存池內存大小=服務器物理總內存-服務器linux操作系統內存-預留內存池大小-預留給第三方庫內存
-
內存優化建議
-
常見的報錯解決
1、Query exceeded per-node total memory limit of xx 適當增加query.max-total-memory-per-node。2、Query exceeded distributed user memory limit of xx 適當增加query.max-memory。3、Could not communicate with the remote task. The node may have crashed or be under too much load 內存不夠,導致節點crash,可以查看/var/log/message。
-
建議參數設置
1、query.max-memory-per-node和query.max-total-memory-per-node是query操作使用的主要內存配置,因此這兩個配置可以適當加大。 memory.heap-headroom-per-node是三方庫的內存,默認值是JVM-Xmx * 0.3,可以手動改小一些。1) 各節點JVM內存推薦大小: 當前節點剩余內存*80%2) 對于heap-headroom-pre-node第三方庫的內存配置: 建議jvm內存的%15左右3) 在配置的時候, 不要正正好好, 建議預留一點點, 以免出現問題數據量在35TB , presto節點數量大約在30臺左右 (128GB內存 + 8核CPU) 注意: 1、query.max-memory-per-node小于query.max-total-memory-per-node。 2、query.max-memory小于query.max-total-memory。 3、query.max-total-memory-per-node 與memory.heap-headroom-per-node 之和必須小于 jvm max memory,也就是jvm.config 中配置的-Xmx。
-
ADS層開發_其他需求(Presto實現)
維度指標
需求一: 會員首次充值(統計每個會員首次充值的時間, 交易單ID以及對應門店和充值金額) 需求二: 門店新老會員消費(統計每個門店每個月新會員、老會員、全部會員、非會員的數量、消費金額、消費單量(新會員指的首次消費后30天內, 老會員指的首次消費后大于30天)) 需求三: 會員復購統計(留存)(統計的指標為統計日期用戶量、一日后用戶量、二日后用戶量、三日后用戶量、四日后用戶量、五日后用戶量、六日后用戶量) 需求四: 會員貢獻(統計各個會員每天在各個門店消費單量、消費金額、消費成本、線上訂單量、線上消費金額、線上消費成本、線下訂單量、線下消費金額、線下消費成本)涉及表:會員首次充值表 和 門店新老會員消費月表 和 會員復購統計天表 以及 會員貢獻天表
會員首次充值表
建表語句:
CREATE TABLE IF NOT EXISTS ads.ads_mem_member_first_recharge_i(trade_date_time STRING COMMENT '交易時間',trade_date STRING COMMENT '日期',trade_order_id STRING COMMENT '對應的交易單id',zt_id BIGINT COMMENT '中臺 會員id',store_no STRING COMMENT '門店編號',city_id BIGINT COMMENT '城市ID',recharge_amount DECIMAL(27, 2) COMMENT '充值金額'
)
comment '會員首次充值表'
partitioned by (dt STRING COMMENT '統計日期')
row format delimited fields terminated by ','
stored as orc
tblproperties ('orc.compress'='SNAPPY');
數據導入:
說明: 同dwm_mem_first_buy_i
insert into hive.ads.ads_mem_member_first_recharge_i
with t1 as (selectdate_format(trade_date,'%Y-%m-%d %H:%i:%s') as trade_date_time,date_format(trade_date,'%Y-%m-%d') as trade_date,trade_order_id,zt_id,store_no,city_id,amount as recharge_amount,row_number() over(partition by zt_id order by trade_date) as rnfrom hive.ods.ods_mem_store_amount_record_i where record_type = 2 and date_format(trade_date,'%Y-%m-%d') = '2023-11-20'
)
selectt1.trade_date_time,t1.trade_date,t1.trade_order_id,t1.zt_id,t1.store_no,t1.city_id,t1.recharge_amount,'2023-11-20' as dt
from t1left join hive.ads.ads_mem_member_first_recharge_i ton t1.zt_id = t.zt_id and t1.store_no = t.store_no and t.dt < '2023-11-20'
where rn = 1 and t.zt_id is null;
門店新老會員消費月表
建表語句:
CREATE TABLE IF NOT EXISTS ads.ads_mem_store_new_old_member_month_i(trade_date STRING COMMENT '月一時間',store_no STRING COMMENT '店鋪編碼',store_name STRING COMMENT '店鋪名稱',store_sale_type BIGINT COMMENT '店鋪銷售類型',store_type_code BIGINT COMMENT '分店類型',city_id BIGINT COMMENT '城市ID',city_name STRING COMMENT '城市名稱',region_code STRING COMMENT '區域編碼',region_name STRING COMMENT '區域名稱',is_day_clear BIGINT COMMENT '是否日清:0否,1是',member_type BIGINT COMMENT '會員類型:1新會員,2老會員,3會員,4非會員',member_num BIGINT COMMENT '消費會員數',sale_amount DECIMAL(27, 2) COMMENT '消費金額',order_num BIGINT COMMENT '消費單量'
)
comment '門店新老會員消費月表'
partitioned by (dt STRING COMMENT '消費日期')
row format delimited fields terminated by ','
stored as orc
tblproperties ('orc.compress'='SNAPPY');
數據導入:
新會員:首次消費后30天內的; 老會員:首次消費后大于30天;需要統計每個門店每個月新會員、老會員、全部會員、非會員的數量、消費金額、消費單量。 注意:這里是一個月表,在判斷新老會員的時候,按照當月最后一天為標準,往前推30天,30天內的為新會員。比如今天是5月28日,在計算5月份的數據時,4月29日——5月28日這30天的都是新會員。而在計算4月份數據時,因為4月份已經過去了,所以以4月30日為最后一天,4月1日——4月30日為4月份的新會員。
1)在計算月表時,需要取到當月最后一天,然后以最后一天為標準,取到前30天
selecta.trade_date,date_sub(a.trade_date, 30) as day30 -- 前30天,a.month_trade_date -- 對應的月一時間
from dim.dwd_dim_date_f a
inner join(select max(dt) mdt from dws.dws_mem_store_member_statistics_day_i -- 取到最大的分區where dt>=date_sub('${inputdate}', dayofmonth('${inputdate}') - 1)and dt<=last_day('${inputdate}') ) b
on a.trade_date = b.mdt
2)新會員:30天內首次消費的會員
取新會員,可以使用首次消費表,取前30天到當月最大一天的會員即可。
因為在hive的where語句中不能使用子查詢,所以這里使用join的方式解決
with dtt as (selecta.trade_date,date_sub(a.trade_date, 30) as day30 -- 前30天,a.month_trade_date -- 對應的月一時間from dim.dwd_dim_date_f ainner join(select max(dt) mdt from dws.dws_mem_store_member_statistics_day_i -- 取到最大的分區where dt>=date_sub('${inputdate}', dayofmonth('${inputdate}') - 1)and dt<=last_day('${inputdate}') ) bon a.trade_date = b.mdt),
zt as (select s.zt_id from dwm.dwm_mem_first_buy_i s -- 取到最大分區與其前30天的數據cross join dttwhere s.dt >= dtt.day30 and s.dt <= dtt.trade_date)
代碼實現:
insert into hive.ads.ads_mem_store_new_old_member_month_i
with t1 as (selecttrade_date,date_format(date_add('day',-30,date '2023-11-20'),'%Y-%m-%d') as day30,month_trade_date,month_end_datefrom hive.dim.dwd_dim_date_f where trade_date = (selectmax(dt)from hive.dws.dws_mem_store_member_statistics_day_iwhere dt >= date_format(date_add('day', -day(date '2023-11-20') + 1 ,date '2023-11-20'),'%Y-%m-%d')and dt < date_format(date_add('month',1,date_add('day', -day(date '2023-11-20') + 1 ,date '2023-11-20')),'%Y-%m-%d'))
),
t2 as (-- 獲取最近30天有過消費的新用戶selecttemp1.trade_date,temp1.zt_id,temp1.store_nofrom hive.dwm.dwm_mem_first_buy_i temp1cross join t1where temp1.dt >= t1.day30 and temp1.dt <= t1.trade_date
),
t3 as (-- 獲取 最近30天新用戶的消費select1 as member_type,temp2.bind_md as store_no,count(distinct temp2.zt_id) as member_num,sum(consume_amount) as sale_amount,sum(consume_times) as order_numfrom hive.dwm.dwm_mem_member_behavior_day_i temp2join t2 on temp2.zt_id = t2.zt_id and temp2.bind_md = t2.store_nowhere dt >= date_format(date_add('day', -day(date '2023-11-20') + 1 ,date '2023-11-20'),'%Y-%m-%d')and dt < date_format(date_add('month',1,date_add('day', -day(date '2023-11-20') + 1 ,date '2023-11-20')),'%Y-%m-%d')and consume_times > 0group bytemp2.bind_mdunion all-- 獲取 老會員select2 as member_type,temp2.bind_md as store_no,count(distinct temp2.zt_id) as member_num,sum(consume_amount) as sale_amount,sum(consume_times) as order_numfrom hive.dwm.dwm_mem_member_behavior_day_i temp2left join t2 on temp2.zt_id = t2.zt_id and temp2.bind_md = t2.store_nowhere dt >= date_format(date_add('day', -day(date '2023-11-20') + 1 ,date '2023-11-20'),'%Y-%m-%d')and dt < date_format(date_add('month',1,date_add('day', -day(date '2023-11-20') + 1 ,date '2023-11-20')),'%Y-%m-%d')and consume_times > 0 and t2.zt_id is nullgroup bytemp2.bind_mdunion all-- 獲取 全部會員select3 as member_type,temp2.bind_md as store_no,count(distinct temp2.zt_id) as member_num,sum(consume_amount) as sale_amount,sum(consume_times) as order_numfrom hive.dwm.dwm_mem_member_behavior_day_i temp2where dt >= date_format(date_add('day', -day(date '2023-11-20') + 1 ,date '2023-11-20'),'%Y-%m-%d')and dt < date_format(date_add('month',1,date_add('day', -day(date '2023-11-20') + 1 ,date '2023-11-20')),'%Y-%m-%d')and consume_times > 0group bytemp2.bind_mdunion all-- 非會員數據select4 as member_type,store_no,0 as member_num,sum(real_paid_amount) as sale_amount,count(if(trade_type = 0,parent_order_no,NULL)) - count(if(trade_type = 5,parent_order_no,NULL)) as order_numfrom hive.dwm.dwm_sell_o2o_order_iwhere dt >= date_format(date_add('day', -day(date '2023-11-20') + 1 ,date '2023-11-20'),'%Y-%m-%d')and dt < date_format(date_add('month',1,date_add('day', -day(date '2023-11-20') + 1 ,date '2023-11-20')),'%Y-%m-%d')and member_type = 0group by store_no
)
selectt1.month_trade_date as trade_date,t3.store_no,t4.store_name,t4.store_sale_type,t4.store_type_code,t4.city_id,t4.city_name,t4.region_code,t4.region_name,t4.is_day_clear,t3.member_type,t3.member_num,cast(t3.sale_amount as decimal(27,2)),t3.order_num,t1.month_trade_date as dt
from t3 cross join t1-- 注意: 一定要檢查自己的dwd_dim_store_i分區目錄,此處填寫自己的分區目錄時間left join hive.dim.dwd_dim_store_i t4 on t3.store_no = t4.store_no and t4.dt = '2023-11-23';
會員復購統計天表
建表語句:
CREATE TABLE IF NOT EXISTS ads.ads_mem_repurchase_day_i(trade_date STRING COMMENT '統計時間',store_no STRING COMMENT '店鋪編碼',store_name STRING COMMENT '店鋪名稱',store_sale_type BIGINT COMMENT '店鋪銷售類型',store_type_code BIGINT COMMENT '分店類型',city_id BIGINT COMMENT '城市ID',city_name STRING COMMENT '城市名稱',region_code STRING COMMENT '區域編碼',region_name STRING COMMENT '區域名稱',is_day_clear BIGINT COMMENT '是否日清:0否,1是',member_count BIGINT COMMENT '統計日期用戶量',next_member_count_1 BIGINT COMMENT '一日后用戶量',next_member_count_2 BIGINT COMMENT '二日后用戶量',next_member_count_3 BIGINT COMMENT '三日后用戶量',next_member_count_4 BIGINT COMMENT '四日后用戶量',next_member_count_5 BIGINT COMMENT '五日后用戶量',next_member_count_6 BIGINT COMMENT '六日后用戶量'
)
comment '會員復購統計天表'
partitioned by (dt STRING COMMENT '消費日期')
row format delimited fields terminated by ','
stored as orc
tblproperties ('orc.compress'='SNAPPY');
數據導入:
復購是一個非常重要的指標,用來衡量客戶的粘性。這個需求需要統計當天下單的用戶,一日、二日到六日的復購情況,為了方便后續使用,這里不直接統計復購率,而是統計人數。所以,這個需求需要統計的指標為統計日期用戶量、一日后用戶量、二日后用戶量、三日后用戶量、四日后用戶量、五日后用戶量、六日后用戶量。本需求類似于計算留存,也就是統計當天的用戶,在1日、2日、3日。。。之后是否再次購買。
使用dwm_mem_member_behavior_day_i表進行計算。因為需求中最多需要計算六日后的用戶量,所以當天中的這些消費用戶,需要6天之后,才能拿到所有的數據(1日后,2日后...6日后)。換個角度看,只有6天前的數據才會穩定,6天內的分區對應的數據每天都要進行更新,所以,每天要更新6個分區的數據。如果計算n天后的復購人數,其實就是用第一天的會員與第n天的會員進行關聯,這里使用左關聯,關聯條件為會員id以及日期 能關聯上的,即是復購的用戶,然后再count()則可得到相應數值。
代碼實現:
-- 六天前消費用戶和往后每一天的復購情況--會員主題: ADS層 會員復購天表
-- 需求: 計算某一天及相對于第一天往后六天每天的復購的人數
-- 思路: 首先知道6天前的那一天的所有的消費用戶 基于這個結果 left join 往后1天的所有消費用戶 left join 往后2天的所有消費用戶 ...往后6天的所有消費用戶
selectdate_format(date_add('day',-6,date '2023-09-20'),'%Y-%m-%d') as trade_date,s.store_no,s.store_name,s.store_sale_type,s.store_type_code,s.city_id,s.city_name,s.region_code,s.region_name,s.is_day_clear,count(day0.zt_id) as member_count,count(day1.zt_id) as next_member_count_1,count(day2.zt_id) as next_member_count_2,count(day3.zt_id) as next_member_count_3,count(day4.zt_id) as next_member_count_4,count(day5.zt_id) as next_member_count_5,count(day6.zt_id) as next_member_count_6,date_format(date_add('day',-6,date '2023-09-20'),'%Y-%m-%d') as dt
from (selectt.zt_id,t.bind_md,t.dt as after,tt.days_after1,tt.days_after2,tt.days_after3,tt.days_after4,tt.days_after5,tt.days_after6from hive.dwm.dwm_mem_member_behavior_day_i t left join hive.dim.dwd_dim_date_f tt on t.trade_date = tt.trade_datewhere dt = date_format(date_add('day',-6,date '2023-09-20'),'%Y-%m-%d') and consume_times>0
) as day0
left join (selectzt_id,bind_md,dt as after1from hive.dwm.dwm_mem_member_behavior_day_iwhere dt = date_format(date_add('day',-5,date '2023-09-20'),'%Y-%m-%d') and consume_times>0
) day1 on day0.days_after1 = day1.after1 and day0.zt_id = day1.zt_id
left join (selectzt_id,bind_md,dt as after2from hive.dwm.dwm_mem_member_behavior_day_iwhere dt = date_format(date_add('day',-4,date '2023-09-20'),'%Y-%m-%d') and consume_times>0
) day2 on day0.days_after2 = day2.after2 and day0.zt_id = day2.zt_id
left join (selectzt_id,bind_md,dt as after3from hive.dwm.dwm_mem_member_behavior_day_iwhere dt = date_format(date_add('day',-3,date '2023-09-20'),'%Y-%m-%d') and consume_times>0
) day3 on day0.days_after3 = day3.after3 and day0.zt_id = day3.zt_id
left join (selectzt_id,bind_md,dt as after4from hive.dwm.dwm_mem_member_behavior_day_iwhere dt = date_format(date_add('day',-2,date '2023-09-20'),'%Y-%m-%d') and consume_times>0
) day4 on day0.days_after4 = day4.after4 and day0.zt_id = day4.zt_id
left join (selectzt_id,bind_md,dt as after5from hive.dwm.dwm_mem_member_behavior_day_iwhere dt = date_format(date_add('day',-1,date '2023-09-20'),'%Y-%m-%d') and consume_times>0
) day5 on day0.days_after5 = day5.after5 and day0.zt_id = day5.zt_id
left join (selectzt_id,bind_md,dt as after6from hive.dwm.dwm_mem_member_behavior_day_iwhere dt = '2023-09-20' and consume_times>0
) day6 on day0.days_after6 = day6.after6 and day0.zt_id = day6.zt_id
join hive.dim.dwd_dim_store_i s on day0.bind_md = s.store_no and s.dt = '2023-09-24'
group bys.store_no,s.store_name,s.store_sale_type,s.store_type_code,s.city_id,s.city_name,s.region_code,s.region_name,s.is_day_clear;另一種寫法: 直接計算出 6天 及其每一天和后面六天的數據
--會員主題: ADS層 會員復購天表
-- 需求: 計算某一天及相對于第一天往后六天每天的復購的人數
-- 思路: 首先知道6天前的那一天的所有的消費用戶 基于這個結果 left join 往后1天的所有消費用戶 left join 往后2天的所有消費用戶 ...往后6天的所有消費用戶
selectday0.after as trade_date,s.store_no,s.store_name,s.store_sale_type,s.store_type_code,s.city_id,s.city_name,s.region_code,s.region_name,s.is_day_clear,count(day0.zt_id) as member_count,count(day1.zt_id) as next_member_count_1,count(day2.zt_id) as next_member_count_2,count(day3.zt_id) as next_member_count_3,count(day4.zt_id) as next_member_count_4,count(day5.zt_id) as next_member_count_5,count(day6.zt_id) as next_member_count_6,date_format(date_add('day',-6,date '2023-09-20'),'%Y-%m-%d') as dt
from (-- 獲取 統計日期前6天的所有的消費數據selectt.zt_id,t.bind_md,t.dt as after,tt.days_after1,tt.days_after2,tt.days_after3,tt.days_after4,tt.days_after5,tt.days_after6from hive.dwm.dwm_mem_member_behavior_day_i t left join hive.dim.dwd_dim_date_f tt on t.trade_date = tt.trade_datewhere dt >= date_format(date_add('day',-6,date '2023-09-20'),'%Y-%m-%d')and dt <= '2023-09-20'and consume_times>0
) as day0
left join (-- 獲取 統計日期前5天和 后1天的的所有的消費數據selectzt_id,bind_md,dt as after1from hive.dwm.dwm_mem_member_behavior_day_iwhere dt >= date_format(date_add('day',-5,date '2023-09-20'),'%Y-%m-%d')and dt <= date_format(date_add('day',1,date '2023-09-20'),'%Y-%m-%d')and consume_times>0
) day1 on day0.days_after1 = day1.after1 and day0.zt_id = day1.zt_id
left join (-- 獲取 統計日期前4天和 后2天的的所有的消費數據selectzt_id,bind_md,dt as after2from hive.dwm.dwm_mem_member_behavior_day_iwhere dt >= date_format(date_add('day',-4,date '2023-09-20'),'%Y-%m-%d')and dt <= date_format(date_add('day',2,date '2023-09-20'),'%Y-%m-%d')and consume_times>0
) day2 on day0.days_after2 = day2.after2 and day0.zt_id = day2.zt_id
left join (-- 獲取 統計日期前3天和 后3天的的所有的消費數據selectzt_id,bind_md,dt as after3from hive.dwm.dwm_mem_member_behavior_day_iwhere dt >= date_format(date_add('day',-3,date '2023-09-20'),'%Y-%m-%d')and dt <= date_format(date_add('day',3,date '2023-09-20'),'%Y-%m-%d')and consume_times>0
) day3 on day0.days_after3 = day3.after3 and day0.zt_id = day3.zt_id
left join (-- 獲取 統計日期前2天和 后4天的的所有的消費數據selectzt_id,bind_md,dt as after4from hive.dwm.dwm_mem_member_behavior_day_iwhere dt >= date_format(date_add('day',-2,date '2023-09-20'),'%Y-%m-%d')and dt <= date_format(date_add('day',4,date '2023-09-20'),'%Y-%m-%d')and consume_times>0
) day4 on day0.days_after4 = day4.after4 and day0.zt_id = day4.zt_id
left join (-- 獲取 統計日期前1天和 后5天的的所有的消費數據selectzt_id,bind_md,dt as after5from hive.dwm.dwm_mem_member_behavior_day_iwhere dt >= date_format(date_add('day',-1,date '2023-09-20'),'%Y-%m-%d')and dt <= date_format(date_add('day',5,date '2023-09-20'),'%Y-%m-%d')and consume_times>0
) day5 on day0.days_after5 = day5.after5 and day0.zt_id = day5.zt_id
left join (-- 獲取 統計日期后6天的的所有的消費數據selectzt_id,bind_md,dt as after6from hive.dwm.dwm_mem_member_behavior_day_iwhere dt >= '2023-09-20'and dt <= date_format(date_add('day',6,date '2023-09-20'),'%Y-%m-%d')and consume_times>0
) day6 on day0.days_after6 = day6.after6 and day0.zt_id = day6.zt_id
join hive.dim.dwd_dim_store_i s on day0.bind_md = s.store_no
group byday0.after,s.store_no,s.store_name,s.store_sale_type,s.store_type_code,s.city_id,s.city_name,s.region_code,s.region_name,s.is_day_clear;