原本這是篇給公司內同事寫的培訓文章,對于初學awk的人還蠻有幫助,貼到這里與大家共享一下。
〇、前言
??? 意見反饋,請mailto:datouwang@gmail.com。
?
一、AWK簡介
         AWK的名字取自三位創造者Aho、Weinberger和Kernighan姓氏的首字母。
???????? AWK擅長處理文本數據。
?
二、AWK的調用方式
???????? awk [-Ffs] [-v var=value] [program | -f progfile ...] [file ...]
?
??? 1、命令行方式
???????? 例如:
????????
# Print the first field of every line in file.
awk '{print $1}' file
# Print the second column of the ps -ef lines that contain "program".
ps -ef | grep program | awk '{print $2}'
?
?
???????? 2、文件方式
???????? 例如:
# Run the awk program stored in progfile against file.
awk -f progfile file
?
????????
???????? 3、文件解釋器方式
???????? AWK腳本文件開頭需要注明調用方式,典型寫法為:
???????? #!/bin/awk -f
??? 注意-f后面有空格。
???????? 腳本文件需要有執行權限,如果沒有需要使用chmod +x progfile賦權。
???????? 例如:
# Run the executable awk script directly; use ./progfile if its directory is not in PATH.
progfile file
?
三、AWK參數
         -F  指定域分隔符,例如:-F "|",即以|作為域分隔符。默認分隔符為單個空格,此時以一個或多個連續的空格或TAB分隔各域,并忽略行首和行尾的空白。
         -v  定義變量,從shell給awk傳遞變量,如-v DATE="$DATE",即將shell中$DATE變量值傳遞給awk變量DATE(加上引號可避免變量值含空格時被shell拆開)。
         -f  指定腳本文件,例如-f progfile。
?
四、AWK內置變量
         FS          域分隔符
         NF          當前行的域個數
         NR          已讀取的總行數(跨所有輸入文件累計)
         FNR         當前文件中已讀取的行數,每開始讀取一個新文件時重新從1計數
         FILENAME    處理的文件名,當輸入為管道時,FILENAME為空。
         RS          行分隔符
         OFS         輸出域分隔符
         ORS         輸出行分隔符
         OFMT        數字輸出格式
         CONVFMT     數字內部轉換格式
         SUBSEP      多維數組索引分隔符
         ARGC        輸入參數個數
         ARGV        輸入參數數組
         ENVIRON     環境變量數組
         RSTART      match()函數正則匹配到字符串開始位置
         RLENGTH     match()函數正則匹配到字符串的長度
?
五、AWK內置函數
         blength[([s])]          計算字符串長度(byte為單位;非POSIX標準,僅部分awk實現提供)
         length[([s])]           計算字符串長度(character為單位)
         rand()                  生成隨機數
         srand([expr])           設置rand() seed
         int(x)                  截斷取整,舍去小數部分
         substr(s, m [, n])      取子字符串
         index(s, t)             在字符串s中定位t字符串首次出現的位置
         match(s, ere)           在字符串s中匹配正則ere,match修改RSTART、RLENGTH變量。
         split(s, a[, fs])       將字符串分割到數組中
         sub(ere, repl [, in])   字符串替換,只替換第一處匹配(in默認為$0)
         gsub(ere, repl [, in])  同上,但替換所有匹配
         sprintf(fmt, expr, ...) 拼字符串
         system(cmd)             在shell中執行cmd。
         toupper(s)              字符串轉換為大寫
         tolower(s)              字符串轉換為小寫
?
?
六、AWK流程控制
         if(expression) statement [ else statement ]
         while(expression) statement
         for(expression;expression;expression) statement
         for(var in array) statement
         do statement while(expression)
         break
         continue
         { [statement ...] }
         expression              # commonly var = expression
         print [expression-list] [ > expression]
         printf format [, expression-list] [ > expression]
         return [expression]
         next                    # skip remaining patterns on this input line.
         delete array[expression]    # delete an array element.
         exit [expression]       # exit immediately; status is expression.
?
七、AWK簡單應用范例
???????? AWK腳本分為三部分BEGIN段,處理段,END段。其中BEGIN段在第一行讀取之前執行,END段在最后一行處理后執行。
?
1、內容過濾,同"grep tag file"。
# The first two commands match with a regular expression; the third does a plain substring search.
awk '/tag/ {print}' file
awk '{if($0 ~ /tag/) print}' file
awk '{if(index($0, "tag") > 0) print}' file
?
2、取特定列,類似"cut -f1,3,5 file"(cut默認以TAB作為分隔符,awk默認以空白作為分隔符)。
# Print columns 1, 3 and 5 of file.
awk '{print $1, $3, $5}' file
????????
3、對文件內容進行剔重,類似"sort -u file",但未排序。
# Print a line only the first time it is seen; rec is keyed by the full line text.
awk '{if(!($0 in rec)) {rec[$0]=1; print $0;}}' file
         AWK中的數組都是關聯數組(HASH數組),下標既可以是數字,也可以是任意字符串;此處以整行內容作為下標。
????????
?
4、僅輸出數據
# Print 100 lines of generated data; the program consists of a BEGIN rule only.
awk 'BEGIN {for(i = 0; i < 100; i++) printf("this is %d\n", i);}'
可見,如果腳本中只有BEGIN段,可以沒有輸入。
?
5、統計數據
# Sum column 1 and column 2 over the whole file and print both totals on one line.
# The totals are printed with %d, so any fractional part is dropped.
awk '{a+=$1; b+=$2} END {printf("a=%d,b=%d\n", a, b);}' file
?
?
八、AWK高級應用范例
?
1、 分組功能,類似Group by功能
# Group by column 1 and sum column 2, i.e.
#   select col1, sum(col2) from file group by col1
awk '{tot[$1] += $2} END {for(i in tot) printf("%s %d\n", i, tot[i]);}' file

# Same as above, with a HAVING-style filter on the aggregate.
awk '{tot[$1] += $2} END {for(i in tot) if(tot[i] > 10) printf("%s %d\n", i, tot[i]);}' file

# Group by column 1 and sum columns 2 and 3.
awk '{tot1[$1] += $2; tot2[$1] += $3;} END {for(i in tot1) printf("%s %d %d\n", i, tot1[i], tot2[i]);}' file

# Group by columns 1 and 2. awk emulates multi-dimensional arrays with a
# one-dimensional array whose key joins the subscripts with SUBSEP (\034 by
# default), so the key is split back into its parts before printing.
awk '{tot1[$1, $2] += $3; tot2[$1, $2] += $4;} END {for(i in tot1) {split(i, k, SUBSEP); printf("%s %s %d %d\n", k[1], k[2], tot1[i], tot2[i]);}}' file
?
2、 文件操作
# Example 1: join two files on column 1 of filename1 and column 2 of filename2.
BEGIN {
        # Load filename1, keeping the join key and the value in column 2.
        # getline returns 1 for a record, 0 at end of file and -1 on error,
        # so "> 0" ends the loop on end of file and on an unreadable file.
        while((getline < "filename1") > 0)
        {
                rel[$1] = 1;
                rec1[$1] = $2;
        }
        close("filename1");

        # Load filename2, keeping the join key (column 2) and the value in column 3.
        while((getline < "filename2") > 0)
        {
                rel[$2] = 1;
                rec2[$2] = $3;
        }
        close("filename2");

        # One output line per key seen in either file; a missing side prints as empty.
        for(i in rel)
        {
                printf("%s %s %s\n", i, rec1[i], rec2[i]);
        }
}

# Example 2 (a separate script): split the input into files under split/,
# named after the first 7 characters of column 1.
{
        # The file name is parenthesized because an unparenthesized
        # concatenation after ">>" is ambiguous.
        print $0 >> ("split/" substr($1,1,7));
}
?
3、 從SHELL向AWK傳遞變量
# Pass the shell variable DATE to awk as AWK_DATE; the quotes keep a value
# that contains spaces as a single argument.
awk -v AWK_DATE="$DATE" 'BEGIN {print AWK_DATE}'
?
?
4、 在AWK內部讀取shell命令輸出
# Example 1: read the output of ls and print it from inside awk.
BEGIN {
        # getline returns -1 on error, which is true in a bare condition;
        # "> 0" makes the loop stop on both end of output and failure.
        while(("ls" | getline) > 0)
        {
                print $0;
        }
        close("ls");
}

# Example 2: read the output of date.
BEGIN {
        # Without a variable, getline stores the line in $0.
        "date" | getline;
        print $0;
        close("date");

        # With a variable, the line is stored in that variable instead.
        "date +%Y" | getline v_year;
        print v_year;
        close("date +%Y");
}
?
?
5、 將AWK輸出通過管道傳遞給SHELL命令
# Send the printed lines to sort through a pipe.
BEGIN {
        for(i = 0; i < 100; i++)
        {
                printf("%03d\n", 100 - i) | "sort";
        }

        # Closing the pipe waits for sort to finish and emit its output.
        close("sort");
}
?
?
6、 正則表達式簡單例子
# Demonstrates match() with an extended regular expression.

# Print the first e-mail-like substring of str, or report that there is none.
function show_email(str) {
        # match() returns the start position (0 when nothing matched) and sets
        # RSTART and RLENGTH. The dot before the last label is escaped so that
        # it matches only a literal "." and not any character.
        if(match(str, /[a-zA-Z][a-zA-Z0-9.]*@[a-zA-Z0-9][a-zA-Z0-9.]*\.[a-zA-Z]*[a-zA-Z]/) > 0)
                printf("%s\n", substr(str, RSTART, RLENGTH));
        else
                printf("[%s] not match\n", str);
}

BEGIN {
        show_email("abc.123@gmail.com");
        show_email("123123abcd@gmail.com");
        show_email("&^%76123@gmail.com");
        show_email("zxcvb@sohu.com.cn.1231231");
}
?
?
7、自定義函數
# Return the sum of a and b.
function my_plus(a, b) {
        return a + b;
}

BEGIN {
        printf("%d\n", my_plus(123, 321));
}
?
?
九、一些應用范例
???????? 1、驗證話單正確性的一個腳本
# Check the fee fields of fixed-width call records; only lines that start
# with "vc" are examined. A suspicious record is printed followed by a tag.
/^vc/ {
        # Extract the fields of the record by position.
        call_type = substr($0,3,2);
        call_duration = int(substr($0,95,6));
        roam_type = substr($0,210,1);
        fee_type = substr($0,211,1);
        dial_type = substr($0,212,3);
        chat_type = substr($0,215,3);

        cfee = int(substr($0,218,9));
        lfee = int(substr($0,236,9));

        # International roaming records are not analysed; skip them.
        if(roam_type > 4)
        {
                next;
        }

        if(call_type == "01")
        {
                # dial_type not starting with "0": lfee must be zero.
                if(substr(dial_type,1,1) != "0")
                {
                        if(lfee > 0)
                        {
                                # Tag renamed from LFEE_01 to match the other *_ERRnn tags.
                                printf("%s:LFEE_ERR01\n", $0);
                        }

                        next;
                }

                if(roam_type != "0")
                {
                        if(fee_type == "0" || fee_type == "2" || fee_type == "3")
                        {
                                if(lfee > 0)
                                {
                                        printf("%s:LFEE_ERR02\n", $0);
                                }
                        }
                        else
                        {
                                if(cfee > 0)
                                {
                                        printf("%s:CFEE_ERR01\n", $0);
                                }
                        }
                }
                else
                {
                        if(fee_type != "0")
                        {
                                if(cfee > 0)
                                {
                                        printf("%s:CFEE_ERR02\n", $0);
                                }
                        }
                }
        }

        if(call_type == "02")
        {
                if(lfee > 0)
                {
                        printf("%s:LFEE_ERR03\n", $0);
                }
        }
}
?
???????? 2、一個模擬求取批價標批費率計劃的例子
# Return 1 when str matches pat, otherwise 0.
#   str  the value to test
#   pat  "*" (matches everything) or a comma-separated list of patterns in
#        which "?" stands for one letter or digit
# n, arr and z are local variables, declared as extra parameters so that the
# function does not overwrite globals of the same name; callers pass only
# str and pat.
# NOTE(review): each pattern is used as an unanchored regular expression, so
# it also matches inside a longer str - confirm this is intended.
function my_match(str, pat,    n, arr, z) {
        if(pat == "*")
                return 1;

        n = split(pat, arr, ",");

        for(z = 1; z <= n; z++)
        {
                # A regex literal is used because "\?" is not a defined
                # escape sequence inside an awk string.
                gsub(/\?/, "[a-zA-Z0-9]", arr[z]);

                if(match(str, arr[z]) > 0)
                {
                        return 1;
                }
        }

        return 0;
}

# For every (dial, chat) pair print all matching lines of plan.lst, or NULL
# when no line matches.
BEGIN {
        # getline returns 1 for a record, 0 at end of file and -1 on error,
        # so "> 0" ends each loop on end of file and on an unreadable file.
        dial_cnt = 0;
        while((getline < "dial.lst") > 0)
        {
                dial[dial_cnt] = $1;
                dial_cnt++;
        }
        close("dial.lst");

        chat_cnt = 0;
        while((getline < "chat.lst") > 0)
        {
                chat[chat_cnt] = $1;
                chat_cnt++;
        }
        close("chat.lst");

        cfg_cnt = 0;
        while((getline < "plan.lst") > 0)
        {
                cfg_dial[cfg_cnt] = $1;
                cfg_chat[cfg_cnt] = $2;
                cfg_item[cfg_cnt] = $3;
                cfg_plan[cfg_cnt] = $4;
                cfg_cnt++;
        }
        close("plan.lst");

        for(d = 0; d < dial_cnt; d++)
        {
                for(c = 0; c < chat_cnt; c++)
                {
                        # The prefix for the first match is printed up front;
                        # later matches repeat it on their own line.
                        printf("%s %s|", dial[d], chat[c]);

                        out_cnt = 0;

                        for(i = 0; i < cfg_cnt; i++)
                        {
                                if(my_match(dial[d], cfg_dial[i]) == 1 && my_match(chat[c], cfg_chat[i]) == 1)
                                {
                                        if(out_cnt == 0)
                                        {
                                                printf("%s %s %s %s\n", cfg_item[i], cfg_plan[i], cfg_dial[i], cfg_chat[i]);
                                        }
                                        else
                                        {
                                                printf("%s %s|%s %s %s %s\n", dial[d], chat[c], cfg_item[i], cfg_plan[i], cfg_dial[i], cfg_chat[i]);
                                        }

                                        out_cnt++;
                                }
                        }

                        if(out_cnt == 0)
                        {
                                printf("NULL\n");
                        }
                }
        }
}
?
?
?
?
?