運用import過程進行SAS數據導入完全實用教程
1 單個規范格式文件導入。
對單個文件進行導入是我們遇到最多的情況,主要有以下幾種:
1.1 對指定分隔符(’|’,’
’,’!’,’ab’等)數據的導入,這里以’!’為例delimiter='!'進行說明:
data _null_;
file 'c:\temp\pipefile.txt';
put"X1!X2!X3!X4";
put "11!22!.! ";
put "111!.!333!apple";
run;
導入程序:
proc import
datafile='c:\temp\pipefile.txt'
out=work.test
dbms=dlm
replace;
delimiter='!';
GUESSINGROWS=2000;
DATAROW=2;
getnames=yes;
run;
注意GUESSINGROWS的值閾為1 到 3276
1.2 對CSV格式的數據進行導入:
data _null_;
file 'c:\temp\csvfile.csv';
put
"Fruit1,Fruit2,Fruit3,Fruit4";
put
"apple,banana,coconut,date";
put
"apricot,berry,crabapple,dewberry";
run;
導入程序:
proc import
datafile='c:\temp\csvfile.csv'
out=work.fruit
dbms=csv
replace;
run;
1.3 對tab分隔數據的導入:
data _null_;
file 'c:\temp\tabfile.txt';
put "cereal" "09"x "eggs" "09"x
"bacon";
put "muffin" "09"x "berries"
"09"x "toast";
run;
proc import
datafile='c:\temp\tabfile.txt'
out=work.breakfast
dbms=tab
replace;
getnames=no;
run;
1.4 對dbf數據庫數據進行導入:
proc import
datafile="/myfiles/mydata.dbf"
out=sasuser.mydata
dbms=dbf
replace;
run;
1.5對excel數據進行導入:
PROC IMPORT OUT= hospital1
DATAFILE= " C:\My Documents\Excel Files\Hospital1.xls "
DBMS=EXCEL REPLACE;
SHEET="Sheet1$";
GETNAMES=YES;
MIXED=NO;
SCANTEXT=YES;
USEDATE=YES;
SCANTIME=YES;
RUN;
1.6對access數據進行導入:
PROC IMPORT DBMS=ACCESS
TABLE="customers" OUT=sasuser.cust;
DATABASE="c:\demo\customers.mdb";
UID="bob";
PWD="cat";?WGDB="c:\winnt\system32\system.mdb";
RUN;
proc print data=sasuser.cust;
run;
1.7 import過程步中,dbms選項匯總:
Identifier
Input Data
Source
Extension
Host Availability
ACCESS
Microsoft Access 2000
or 2002 table
.mdb
Microsoft Windows *
ACCESS97
Microsoft Access 97 table
.mdb
Microsoft Windows *
ACCESS2000
Microsoft Access 2000 table
.mdb
Microsoft Windows *
ACCESS2002
Microsoft Access 2002 table
.mdb
Microsoft Windows *
ACCESSCS
Microsoft Access table
.mdb
UNIX
CSV
delimited file
(comma-separated values)
.csv
OpenVMS Alpha, UNIX, Microsoft Windows
DBF
dBASE 5.0, IV, III+,
and III files
.dbf
UNIX, Microsoft Windows
DLM
delimited file (default
delimiter is a blank)
.*
OpenVMS Alpha, UNIX, Microsoft Windows
EXCEL
Excel 2000 or 2002
spreadsheet
.xls
Microsoft Windows *
EXCEL4
Excel 4.0
spreadsheet
.xls
Microsoft Windows
EXCEL5
Excel 5.0 or 7.0 (95)
spreadsheet
.xls
Microsoft Windows
EXCEL97
Excel 97 or 7.0 (95)
spreadsheet
.xls
Microsoft Windows *
EXCEL2000
Excel 2000 spreadsheet
.xls
Microsoft Windows *
EXCELCS
Excel spreadsheet
.xls
UNIX
JMP
JMP table
.jmp
UNIX, Microsoft Windows
PCFS
Files on PC server
.*
UNIX
TAB
delimited file (tab-delimited values)
.txt
OpenVMS Alpha, UNIX, Microsoft Windows
WK1
Lotus 1-2-3 Release 2
spreadsheet
.wk1
Microsoft Windows
WK3
Lotus 1-2-3 Release 3
spreadsheet
.wk3
Microsoft Windows
WK4
Lotus 1-2-3 Release 4
or 5 spreadsheet
.wk4
Microsoft Windows
2 導入一個文件夾下的所有文件的數據。
2.1下面的代碼導入一個文件夾下的所有文件的數據,要使用本代碼需注意幾點:首先,這個文件夾下的數據文件必須是同一類型分隔的數據,比如例子中都是tab分隔的txt文件,當然也可以對本代碼進行改進,例如中間的proc
import的dbms改為excel,就可以導入excel文件了。其次,本代碼直接將文件名作為SAS數據集的名字,因此文件名必須是英文,且滿足SAS命名規則。
%macro directory(dir=);
%let
rs=%sysfunc(filename(filref,&dir));
%let
did=%sysfunc(dopen(&filref));
%let nobs=%sysfunc(dnum(&did));
%do i=1 %to &nobs.;
%let
name=%qscan(%qsysfunc(dread(&did,&i)),1,.);
%let
ext=%qscan(%qsysfunc(dread(&did,&i)),-1,.);
proc import
out=&name.
datafile="&dir.\&name..&ext"
dbms=tab replace;
getnames=no;
datarow=1;
run;
%end;
%let rc=%sysfunc(dclose(&did));
%mend;
%directory(dir=C:\PRIVATE);
如果要將數據集進行匯總到一張表,或者則可以直接將proc import
out=&name中的&name改為a&i,然后對所有的a:數據集進行set操作。除此之外,我們還可以對&ext進行設置來達到讀取指定文件格式的數據。
2.2 這里運用pipe讀取到文件名稱,再讀取數據。首先建立三個數據集:
data _null_;
file 'c:\junk\extfile1.txt';
put "05JAN2001 6 W12301 1.59
9.54";
put "12JAN2001 3 P01219 2.99
8.97";
run;
data _null_;
file 'c:\junk\extfile2.txt';
put "02FEB2001 1 P01219 2.99
2.99";
put "05FEB2001 3 A00901 1.99
5.97";
put "07FEB2001 2 C21135 3.00
6.00";
run;
data _null_;
file 'c:\junk\extfile3.txt';
put "06MAR2001 4 A00101 3.59
14.36";
put "12MAR2001 2 P01219 2.99
5.98";
run;
filename blah pipe 'dir C:\Junk /b';
data _null_;
infile blah truncover
end=last;
length fname $20;
input fname;
i+1;
call
symput('fname'||trim(left(put(i,8.))),scan(trim(fname),1,'.'));
call
symput('pext'||trim(left(put(i,8.))),trim(fname));
if last then call
symput('total',trim(left(put(i,8.))));
run;
%macro test;
%do i=1 %to
&total;
proc import
datafile="c:\Junk\&&pext&i"
out=work.&&fname&i
dbms=dlm
replace;
delimiter=' ';
getnames=no ;
run;
proc print
data=work.&&fname&i;;
title
&&fname&i;
run;
%end;
%mend;
%test;
這里,如果要導入指定文件類型的數據,例如txt,則只需要將filename blah pipe
'dir C:\Junk /b';改為filename blah pipe 'dir C:\Junk.*.txt
/b';即可。
除了用filename blah pipe 'dir C:\Junk.*.txt
/b';得到指定類型的文件名,我們還可以%sysexec dir *.xls /b/o:n
>
flist.txt;來將xls文件輸出到指定的文件中,供讀取操作用。這個將在下面的內容作介紹。
3 導入excel表中的所有sheet的數據,并將其匯總到一個數據表中。
3.1 Excel表是sas導入導出最多的數據表之一,本例中,我們將導入一個excel中的不同的數據
%let
dir=C:\ExcelFiles;
%macro ReadXls (inf);
libname excellib excel
"&dir.\&inf";
proc sql noprint;
create table
sheetname as
select
tranwrd(memname, "''", "'") as sheetname
from
sashelp.vstabvw
where
libname="EXCELLIB";
select
count(DISTINCT sheetname) into :cnt_sht
from
sheetname;
select
DISTINCT sheetname into :sheet1 -
:sheet%left(&cnt_sht)
from
sheetname;
quit;
libname excellib clear;
%do i=1 %to &cnt_sht;
proc import
datafile="&dir.\&inf"
out=sheet&i replace;
sheet="&&sheet&i";
getnames=yes;
mixed=yes;
run;
proc append base=master
data=sheet&i force;
run;
%end;
%mend ReadXls;
%ReadXls(all1.xls);
這樣,我們可以通過%ReadXls(all2.xls);
%ReadXls(all3.xls);等來得到多個excel文件的所有數據集。
3.2 我們可以結合3.1和2.1或2.2的方法來讀取多個文件中的多個表。這里再介紹一種新的讀取多個文件的方法:
options noxwait;
%macro ReadXls (dir=);
%sysexec cd &dir; %sysexec dir
*.xls /b/o:n > flist.txt;
data _indexfile;
length filen
$200;
infile
"&dir./flist.txt";
input filen
$;
run;
proc sql noprint;
select
count(filen) into :cntfile from _indexfile;
%if
&cntfile>=1 %then %do;
select filen
into
:filen1-:filen%left(&cntfile)
from
_indexfile;
%end;
quit;
%do i=1 %to &cntfile;
libname excellib excel
"&dir.\&&filen&i";
proc sql noprint;
create table
sheetname as
select
tranwrd(memname, "''", "'") as sheetname
from
sashelp.vstabvw
where
libname="EXCELLIB";
select
count(DISTINCT sheetname) into :cnt_sht
from
sheetname;
select
DISTINCT sheetname into :sheet1 -
:sheet%left(&cnt_sht)
from
sheetname;
quit;
%do j=1 %to &cnt_sht;
proc import
datafile="&dir.\&&filen&i"
out=sheet&j replace;
sheet="&&sheet&j";
getnames=yes;
mixed=yes;
run;
data sheet&j;
length
_excelfilename $100 _sheetname $32;
set
sheet&j;
_excelfilename="&&filen&z";
_sheetname="&&sheet&j";
run;
proc append base=master
data=sheet&j force;
run;
%end;
libname excellib clear;
%end;
%mend ReadXls;
%readxls (dir=C:\ExcelFiles);
4 從多個文件夾下讀取多個數據。
直接給源代碼吧。
%macro etl(ds, ds2,path);
data &ds &ds2;
LENGTH DateTime 8
UserName $ 20
Submit $ 10
SentNumber $ 11
IP $ 15
MessageID $ 15
SendingMode $ 6
Contents $ 160 ;
%let filrf=mydir;
%let
rc=%sysfunc(filename(filrf,"&path"));
%let
did=%sysfunc(dopen(&filrf));
%let
memcount=%sysfunc(dnum(&did));
%do i=1 %to &memcount;
AccountNum+1;
%let counter = AccountNum;
%let
username&i=%sysfunc(dread(&did,&i));
%let filref=mydir2;
%let
file=%sysfunc(filename(filref,"&path\&&username&i"));
%let
op=%sysfunc(dopen(&filref));
%let
flcount=%sysfunc(dnum(&op));
filename FT77F001
"D:\SMSGatewayData2\USERS\&&username&i\*.log";
%do j=1 %to &flcount;
%let
trans&j=%sysfunc(dread(&op,&j));
%put
'&&username&i = '
&&username&i
'&&trans&j= '
&&trans&j
'&flcount = ' &flcount
'&filref = ' &filref
'&filrf = ' &filrf;
infile FT77F001 filename=filename eov=eov end =
done length=L DSD;
INPUT DateTime : ANYDTDTM19.
UserName $
Submit $
SentNumber $
IP $
MessageID $
SendingMode $
Contents $;
output;
%end;
%end;
run;
%mend;
%etl(sms2,
sms,D:\SMSGatewayData2\USERS)
這里就不做測試了,原理其實跟3.2差不多,就是將目錄也參數化。
5 參考文獻:
A Case Study of Importing Multiple Worksheet Files
http://www2.sas.com/proceedings/sugi31/034-31.pdf
Reading multiple files with PROC IMPORT
http://support.sas.com/kb/24/707.html
How to Invoke PROC IMPORT to read a CSV, TAB or Delimited
File
http://ftp.sas.com/techsup/download/sample/datastep/import.html
Use PROC IMPORT to read a CSV, TAB or delimited file.sas
Reading multiple files with PROC IMPORT.sas
directory_import_dbms.sas
proc_import_dbf.html
http://ftp.sas.com/