進入
./beeline -u jdbc:hive2://node2:10000 -n root -p
查詢
SHOW TABLES;
刪除
DROP TABLE IF EXISTS tidanic;
上傳數據
hdfs dfs -put train.csv /user/hive/warehouse/mytrain.db/tidanic
《泰坦尼克號幸存者數據分析》
1、原始數據介紹
泰坦尼克號是當時世界上體積最龐大、內部設施最豪華的客運輪船,有“永不沉沒”的美譽。然而不幸的是,在它的處女航中,泰坦尼克號便遭厄運——它從英國南安普敦出發駛向美國紐約。
(1)列名介紹
PassengerID->乘客ID
Survived->是否生還
Pclass->船艙級別
Name->姓名
Sex->性別
Age->年齡
SibSp->兄弟姐妹與配偶的總數
Parch->父母和孩子的總數
Ticket->船票ID
Fare->票價
Cabin->艙室
Embarked->出發港口
(2)經過數據清洗后字段之間分隔符為‘\t’, 集合之間分隔符為‘,’ 數據數目:891 條 創建原始表 tidanic
2、 創建數據庫并進入數據庫
create database if not exists mytrain;use mytrain;
3、創建源表
create table tidanic(
passengerid int,
survived int,
pclass int,
name string,
sex string,
age int,
sibsp int,
parch int,
ticket string,
fare double,
cabin String,
embarked String)row format delimited fields terminated by ',';
(1)通過HDFS命令導入數據到指定路徑。
hdfs dfs -put train.csv /user/hive/warehouse/mytrain.db/tidanic
(2)查看前5行,檢查是否導入成功。
select * from tidanic limit 5;
4、靜態分區表
(1)創建靜態分區表tidanic_part,字段為passengerid,survived,pclass,name,
分區字段為gender,按照性別字段sex分區。
create table tidanic_part(passengerid int,survived int,pclass int,name string)partitioned by(gender string)row format delimited fields terminated by ',';
(2)導入數據到靜態分區表tidanic_part
insert overwrite table tidanic_part partition(gender='female')select passengerid,survived,pclass,name from tidanic where sex='female';insert overwrite table tidanic_part partition(gender='male')select passengerid,survived,pclass,name from tidanic where sex='male';
5、動態分區表
(1)創建動態分區表tidanic_dynamic_part,字段為passengerid,survived,name,
分區字段為passengerclass,按照pclass值進行分區。
create table tidanic_dynamic_part(passengerid int,survived int,name string)partitioned by(passengerclass string)row format delimited fields terminated by ',';
(2)設置動態分區配置
set hive.exec.dynamic.partition=true;set hive.exec.dynamic.partition.mode=nostrict;
(3)往動態分區表中插入數據
insert overwrite table tidanic_dynamic_part partition(passengerclass)select passengerid,survived,name,pclass from tidanic;
6、分桶表
(1)創建桶表,按年齡將數據分到4個桶,抽取兩個桶的數據創建一個新表tidannic_sample。
create table tidanic_bucket(passengerid int,name string,age int)clustered by (age) into 4 bucketsrow format delimited fields terminated by ',';
(2)修改桶表配置
set hive.enforce.bucketing=true;
(3)往桶表中插入數據
insert overwrite table tidanic_bucket select passengerid,name,age from tidanic;
(4)抽取桶1開始兩個桶的數據到抽樣表tidanic_sample中,
create table tidanic_sample as select * from tidanic_bucket tablesample(bucket 1 out of 2 on age);
7、數據導出
將分區標數據導出到文件夾‘/export_dir2’
export table tidanic_dynamic_part to '/user/hive/export_dir2';
8、外部表
(1)創建外部表,位置位于’/user/hive/warehouse/titanic_external’,字符之間’,'隔開
CREATE EXTERNAL TABLE titanic_external (
passengerid int,
survived int,
pclass int,
name string,
sex string,
age int,
sibsp int,
parch int,
ticket string,
fare double,
cabin String,
embarked String
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
LOCATION '/user/hive/warehouse/titanic_external';
(2)載入數據
LOAD DATA INPATH '/train.csv' INTO TABLE titanic_external;
(3)查看表的結構
DESCRIBE FORMATTED titanic_external;
9、DDL操作
①顯示表名t的數據表
②將數據庫中titanic_external表的名字改為titanic_ex;
③刪除數據表titanic_ex;
DROP TABLE titanic_ex;
10、查詢插入
所有年齡大于等于 20 歲的乘客數據插入到另一個表 titanic_cc中
INSERT INTO TABLE titanic_cc
SELECT *
FROM titanic_external
WHERE Age >= 20;
11、分組過濾排序查詢
(1)過濾查詢(WHERE):查詢所有幸存下來的男性乘客。
SELECT * FROM tidanic WHERE sex = 'male' AND survived = 1;
(2)分組查詢(GROUP BY):按船票等級(pclass)統計乘客數。
SELECT pclass, COUNT(*) AS num_passengers FROM tidanic GROUP BY pclass;
(3)排序查詢(ORDER BY):按船票費用(fare)從高到低排序乘客。
SELECT * FROM tidanic ORDER BY fare DESC;
(4)組合過濾、分組和排序:查詢所有幸存下來的女性乘客,并按年齡(age)從低到高排序。
SELECT * FROM tidanic WHERE sex = 'female' AND survived = 1 ORDER BY age ASC;
(5)內置函數 - 數學函數:計算乘客年齡的標準差。
SELECT STDDEV(Age) AS age_stddev
FROM tidanic;
(6)內置函數 - 條件函數: 使用CASE語句將乘客分為成年人和未成年人,并計算各自的數量。
SELECTSUM(CASE WHEN Age >= 18 THEN 1 ELSE 0 END) AS adult_count,SUM(CASE WHEN Age < 18 THEN 1 ELSE 0 END) AS minor_count
FROM tidanic;
12、抽樣查詢
從tidanic中隨機選擇大約10%的行
SELECT * FROM tidanic TABLESAMPLE(BUCKET 1 OUT OF 10 ON RAND()) s;
13、事務表
開啟事務
set hive.support.concurrency = true;
set hive.enforce.bucketing = true;
set hive.exec.dynamic.partition.mode = nonstrict;
set hive.txn.manager = org.apache.hadoop.hive.ql.lockmgr.DbTxnManager;
set hive.compactor.initiator.on = true;
set hive.compactor.worker.threads = 1;
創建表
CREATE TABLE titanic_transactional ( passengerid int,
survived int,
pclass int,
name string,
sex string,
age int,
sibsp int,
parch int,
ticket string,
fare double,
cabin String,
embarked String) STORED AS ORC TBLPROPERTIES ('transactional'='true');
從原表把數據插入事務表
INSERT INTO TABLE titanic_transactional SELECT * FROM tidanic;
更新所有年齡大于60的乘客的survived字段為0(表示未幸存)
UPDATE titanic_transactional SET survived = 0 WHERE age > 60;
SELECT * FROM titanic_transactional WHERE age < 20;
刪除所有年齡小于20的乘客記錄
DELETE FROM titanic_transactional WHERE age < 20;
SELECT * FROM titanic_transactional WHERE age < 20;
分析與總結:
使用python把篩選出的數據進行數據分析可得到如下
- 女性幸存率約為75%,遠高于男性的20%左右。這表明在緊急情況下,女性更容易得到救援。
- 頭等艙乘客的幸存率最高,達到了63%,而三等艙乘客的幸存率最低,僅為24%。這表明社會地位和經濟條件對幸存率有顯著影響。
- 在各個船艙等級中,女性的幸存率均高于男性。然而,頭等艙男性的幸存率仍然高于三等艙女性的幸存率,這進一步強調了社會地位對幸存率的重要性。
- 與家人同行的乘客往往更容易幸存,因為他們可以相互幫助和照顧。