網頁自動登錄和提交POST信息的核心就是分析網頁的源代碼(HTML)。在C#中,可以用來提取網頁HTML的組件比較多,常用的有WebBrowser、WebClient、HttpWebRequest這三個。
以下就分別用這三種方法來實現:
1、WebBrowser是個"迷你"瀏覽器,其特點是Post時不用關心Cookie、內置JS等問題
WebBrowser是VS2005新提供的組件(其實就是封裝了IE接口),實現POST功能一般在webBrowser的DocumentCompleted中分析HtmlDocument 來實現,代碼如下:
// Body of webBrowser1's DocumentCompleted handler: once the login page has
// loaded, fill in the user name and password inputs and click the submit button.
HtmlElement ClickBtn = null;
// IndexOf returns 0 when the URL starts with the target text, so the check has
// to be ">= 0"; the original "> 0" never matched the login page itself.
if (e.Url.ToString().IndexOf("http://sandou.cnblogs.com/", StringComparison.OrdinalIgnoreCase) >= 0)   // login page
{
    HtmlDocument doc = webBrowser1.Document;
    foreach (HtmlElement element in doc.All)
    {
        if (!string.Equals(element.TagName, "INPUT", StringComparison.OrdinalIgnoreCase))
        {
            continue;
        }

        switch (element.Name)
        {
            case "userCtl":
                element.InnerText = "user01";
                break;
            // NOTE(review): "passCt1" ends with the digit 1 while "userCtl" ends with
            // the letter l - confirm against the real page which spelling is right.
            case "passCt1":
                element.InnerText = "mypass";
                break;
            case "B1":
                ClickBtn = element;   // submit button
                break;
        }
    }

    // Only click when the submit button was actually found; otherwise the
    // original code threw a NullReferenceException.
    if (ClickBtn != null)
    {
        ClickBtn.InvokeMember("Click");   // trigger the button click
    }
}
2、WebClient封裝了HTTP的一些類,操作簡單。相較于WebBrowser,特點是可以自設代理,缺點是對Cookie的控制較弱,需要自行處理。
WebClient的運行全在后臺,并且提供了異步操作的能力,這樣很方便并發多個任務,然后等待結果的返回,再逐個處理。多任務異步調用的代碼如下:
/// <summary>
/// Creates <paramref name="ProxyNum"/> WebClient instances and starts an
/// asynchronous read of the target page on each of them through a proxy.
/// Results arrive in Pic_OpenReadCompleted2 / Pic_UploadDataCompleted2.
/// </summary>
/// <param name="ProxyNum">Number of WebClient instances to create and start.</param>
private void StartLoop(int ProxyNum)
{
    WebClient[] wcArray = new WebClient[ProxyNum];   // one client per task
    for (int idArray = 0; idArray < ProxyNum; idArray++)
    {
        WebClient client = new WebClient();
        wcArray[idArray] = client;
        client.OpenReadCompleted += new OpenReadCompletedEventHandler(Pic_OpenReadCompleted2);
        client.UploadDataCompleted += new UploadDataCompletedEventHandler(Pic_UploadDataCompleted2);
        try
        {
            // "proxy" and "port" are declared outside this snippet.
            client.Proxy = new WebProxy(proxy[1], port);
            client.OpenReadAsync(new Uri("http://sandou.cnblogs.com/"));   // open the page
            // NOTE(review): clearing "proxy" here makes proxy[1] fail on the next
            // iteration unless code elided from the article reassigns it - confirm.
            proxy = null;
        }
        catch (Exception ex)
        {
            // Still best-effort (one failing client must not stop the others),
            // but report the failure instead of silently swallowing it.
            System.Diagnostics.Trace.WriteLine("StartLoop: client " + idArray + " failed to start: " + ex.Message);
        }
    }
}
/// <summary>
/// Completion handler for OpenReadAsync: reads the downloaded page, copies the
/// response's Set-Cookie value into the request headers and POSTs the form data
/// back to the site with the same WebClient.
/// </summary>
/// <param name="sender">The WebClient that raised the event.</param>
/// <param name="e">Result of the asynchronous read.</param>
private void Pic_OpenReadCompleted2(object sender, OpenReadCompletedEventArgs e)
{
    // e.Result throws when the operation failed or was cancelled, so bail out first.
    if (e.Cancelled || e.Error != null)
    {
        return;
    }

    WebClient client = (WebClient)sender;

    string textData;
    using (StreamReader reader = new StreamReader(e.Result, Encoding.Default))
    {
        textData = reader.ReadToEnd();   // returned page content
    }
    // (The article elides the code that analyses textData at this point.)

    // The Set-Cookie value is forwarded verbatim as the Cookie request header.
    string cookie = client.ResponseHeaders["Set-Cookie"];
    client.Headers.Add("Content-Type", "application/x-www-form-urlencoded");
    client.Headers.Add("Accept-Language", "zh-cn");
    if (!string.IsNullOrEmpty(cookie))
    {
        // Skip the header entirely when the server did not set a cookie.
        client.Headers.Add("Cookie", cookie);
    }

    string postData = "";   // form fields to submit; left empty in the article
    byte[] byteArray = Encoding.UTF8.GetBytes(postData);   // convert to a byte array
    client.UploadDataAsync(new Uri("http://sandou.cnblogs.com/"), "POST", byteArray);
}
/// <summary>
/// Completion handler for UploadDataAsync: decodes the server's reply to the POST.
/// </summary>
/// <param name="sender">The WebClient that raised the event.</param>
/// <param name="e">Result of the asynchronous upload.</param>
private void Pic_UploadDataCompleted2(object sender, UploadDataCompletedEventArgs e)
{
    // e.Result throws when the operation failed or was cancelled, so bail out first.
    if (e.Cancelled || e.Error != null)
    {
        return;
    }

    string returnMessage = Encoding.Default.GetString(e.Result);
    // Process returnMessage here (the article leaves this part out).
}
?
3、HttpWebRequest較為低層,能實現的功能較多,Cookie操作也很簡單:
?
/// <summary>
/// POSTs the user name and password to the site with HttpWebRequest, using a
/// CookieContainer so that cookies set by the response are collected.
/// </summary>
/// <returns>
/// true when the server answered with HTTP 200; false for any other status or
/// when the request failed with a WebException.
/// </returns>
private bool PostWebRequest()
{
    CookieContainer cc = new CookieContainer();
    // "strUser" and "strPsd" are declared outside this snippet. Values are
    // URL-encoded so characters such as '&' or '=' cannot corrupt the form data.
    // NOTE(review): assumes the values are not already URL-encoded - confirm.
    string postData = "user=" + Uri.EscapeDataString(strUser ?? string.Empty)
                    + "&pass=" + Uri.EscapeDataString(strPsd ?? string.Empty);
    byte[] byteArray = Encoding.UTF8.GetBytes(postData);   // convert to bytes

    HttpWebRequest webRequest2 = (HttpWebRequest)WebRequest.Create(new Uri("http://sandou.cnblogs.com/"));
    webRequest2.CookieContainer = cc;
    webRequest2.Method = "POST";
    webRequest2.ContentType = "application/x-www-form-urlencoded";
    webRequest2.ContentLength = byteArray.Length;

    try
    {
        // Send the data.
        using (Stream newStream = webRequest2.GetRequestStream())
        {
            newStream.Write(byteArray, 0, byteArray.Length);   // write the parameters
        }

        using (HttpWebResponse response2 = (HttpWebResponse)webRequest2.GetResponse())
        using (StreamReader sr2 = new StreamReader(response2.GetResponseStream(), Encoding.Default))
        {
            string text2 = sr2.ReadToEnd();   // returned page content, available for inspection
            return response2.StatusCode == HttpStatusCode.OK;
        }
    }
    catch (WebException)
    {
        // Network failure or an HTTP error status: report it through the return value.
        return false;
    }
}
HttpWebRequest 實現, 這個是從網上COPY 的!我以前用相關的代碼登錄到WWW.ASP.NET上,并且成功post,可惜代碼不知道放什么地方了。
HttpWebRequest自動登錄網站并獲取網站內容(不包含驗證碼的網站)
可以使用 Visual Sniffer(百度搜索) 來捕捉提交的數據信息:
1. 訪問你需要站外提交的頁面,比如 CSDN 登陸頁 http://www.csdn.net/member/UserLogin.aspx
2. 填寫好需要的資料,比如用戶名和密碼,
3. 打開 Visual Sniffer, 點“開始攔截”
4. 在訪問的頁面中提交。
5. 等提交成功之后,在 Visual Sniffer 中“停止攔截”
6. 在 Visual Sniffer 的左側欄的加號中依次點開,右邊是它攔截到的內容:
?
POST http://www.csdn.net/member/UserLogin.aspx HTTP/1.0
Accept: image/gif, image/x-xbitmap, image/jpeg, image/pjpeg, application/vnd.ms-excel, application/vnd.ms-powerpoint, application/msword, application/x-shockwave-flash, */*
Referer: http://www.csdn.net/member/UserLogin.aspx
Accept-Language: zh-cn
Content-Type: application/x-www-form-urlencoded
UA-CPU: x86
Pragma: no-cache
User-Agent: Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; SV1; .NET CLR 1.1.4322; InfoPath.1)
Host: www.csdn.net
Content-Length: 355
Proxy-Connection: Keep-Alive
Cookie: ASPSESSIONIDAAAATBQC=FMEGGCKDBKHAMMCGKPFDMBFG; ASP.NET_SessionId=lusprmnom05lr445tmteaf55; userid=699879
__EVENTTARGET=&__EVENTARGUMENT=&__VIEWSTATE=dDwtMTcwMzgxNjQ2Mjs7bDxDU0ROVXNlckxvZ2luOmNiX1NhdmVTdGF0ZTtDU0ROVXNlckxvZ2luOkltYWdlX0xvZ2luOz4%2Btu1q2wmRZoAJTi9L73w1zBleylY%3D&CSDNUserLogin%3Atb_UserName=testusername&CSDNUserLogin%3Atb_Password=testpassword&CSDNUserLogin%3Atb_ExPwd=9232&from=&CSDNUserLogin%3AImage_Login.x=36&CSDNUserLogin%3AImage_Login.y=6
GET http://www.csdn.net/mycustompage.htm?aspxerrorpath=/member/UserLogin.aspx HTTP/1.0
Accept: image/gif, image/x-xbitmap, image/jpeg, image/pjpeg, application/vnd.ms-excel, application/vnd.ms-powerpoint, application/msword, application/x-shockwave-flash, */*
Referer: http://www.csdn.net/member/UserLogin.aspx
Accept-Language: zh-cn
UA-CPU: x86
Pragma: no-cache
User-Agent: Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; SV1; .NET CLR 1.1.4322; InfoPath.1)
Host: www.csdn.net
Proxy-Connection: Keep-Alive
Cookie: ASPSESSIONIDAAAATBQC=FMEGGCKDBKHAMMCGKPFDMBFG; ASP.NET_SessionId=lusprmnom05lr445tmteaf55; userid=699879
以上為攔截內容,其中提交數據的參數部分(程序中的:strArgs)如:
__EVENTTARGET=&__EVENTARGUMENT=&__VIEWSTATE=dDwtMTcwMzgxNjQ2Mjs7bDxDU
0ROVXNlckxvZ2luOmNiX1NhdmVTdGF0ZTtDU0ROVXNlckxvZ2luOkltYWdlX0xvZ2luOz4%2Btu
1q2wmRZoAJTi9L73w1zBleylY%3D&CSDNUserLogin%3Atb_UserName=testusername&CSDN
UserLogin%3Atb_Password=testpassword&CSDNUserLogin%3Atb_ExPwd=9232
?
/// <summary>
/// Cookie header string shared by the static request helpers:
/// PostLogin stores it after logging in and GetPage sends it with later requests.
/// </summary>
protected static string cookieHeader;
/// <summary>
/// Page load handler: logs in to the remote site and then fetches a page that
/// requires the login cookie.
/// </summary>
/// <param name="sender">Event source.</param>
/// <param name="e">Event data (unused).</param>
private void Page_Load(object sender, System.EventArgs e)
{
    string strReContent = string.Empty;

    // Log in. Arguments: (1) the page the form is submitted to, (2) the submitted
    // parameters, (3) the referer address. The original literals had these
    // descriptions embedded in them, which would have been sent to the server.
    // The credentials are sample values - do not hard-code real ones.
    strReContent = PostLogin("http://www.mystand.com.cn/login/submit.jsp", "userid=hgj0000&password=06045369", "http://www.mystand.com.cn/");
    // For an ASP.NET login page, pay attention to the parameters that must be passed (e.g. __VIEWSTATE):
    //strReContent = PostLogin("http://www.mystand.com.cn/login.aspx","__VIEWSTATE=dDwtNjkzMjUyNDczO3Q8O2w8aTwzPjs%2BO2w8dDxwPHA8bDxUZXh0Oz47bDxcZTs%2BPjs%2BOzs%2BOz4%2BOz6aX2dtqkJTK%2BKbNPsjd7Op%2Fl26Iw%3D%3D&txtUserName=hxf&txtPassword=hxf0000&btnEnter=%E7%99%BB%E5%BD%95","http://www.mystand.com.cn/login.aspx");

    // Fetch a page. Arguments: (1) page address, (2) referer address.
    strReContent = GetPage("http://www.mystand.com.cn/company/getdata.jsp?code=", "http://www.mystand.com.cn/");
    //strReContent = GetPage("http://www.mystand.com.cn/Modules/index.aspx","http://www.mystand.com.cn/login.aspx");

    // strReContent now holds the page content and can be processed further.
}
/// <summary>
/// Simulates a login page: POSTs the login data and records the cookie header
/// produced by the response, both in the static cookieHeader field and, when an
/// ASP.NET request is active, in Application["cookieHeader"].
/// </summary>
/// <param name="strURL">Address the login data is submitted to.</param>
/// <param name="strArgs">Login form data; written to the request body as ASCII, so it should already be URL-encoded.</param>
/// <param name="strReferer">Value sent as the Referer header.</param>
/// <returns>The response body decoded as gb2312; callers may ignore it.</returns>
public static string PostLogin(string strURL,string strArgs,string strReferer)
{
    HttpWebRequest myHttpWebRequest = (HttpWebRequest)WebRequest.Create(strURL);
    myHttpWebRequest.AllowAutoRedirect = true;
    myHttpWebRequest.KeepAlive = true;
    myHttpWebRequest.Accept = "image/gif, image/x-xbitmap, image/jpeg, image/pjpeg, application/vnd.ms-excel, application/msword, application/x-shockwave-flash, */*";
    myHttpWebRequest.Referer = strReferer;
    myHttpWebRequest.UserAgent = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; Maxthon; .NET CLR 2.0.50727)";
    myHttpWebRequest.ContentType = "application/x-www-form-urlencoded";
    myHttpWebRequest.Method = "POST";
    // The container collects the cookies set by the login response.
    myHttpWebRequest.CookieContainer = new CookieContainer();

    // Write the form data into the request stream; "using" closes both objects
    // even when the write fails.
    using (Stream requestStream = myHttpWebRequest.GetRequestStream())
    using (StreamWriter requestWriter = new StreamWriter(requestStream, Encoding.ASCII))
    {
        requestWriter.Write(strArgs);
    }

    using (HttpWebResponse response = (HttpWebResponse)myHttpWebRequest.GetResponse())
    {
        cookieHeader = myHttpWebRequest.CookieContainer.GetCookieHeader(new Uri(strURL));

        // HttpContext.Current is null outside an ASP.NET request; the static
        // field above is still set in that case.
        HttpContext context = HttpContext.Current;
        if (context != null)
        {
            context.Application.Lock();
            try
            {
                context.Application["cookieHeader"] = cookieHeader;
            }
            finally
            {
                context.Application.UnLock();
            }
        }

        // Switch the encoding to utf-8 if the target site serves UTF-8 pages.
        using (StreamReader sr = new StreamReader(response.GetResponseStream(), Encoding.GetEncoding("gb2312")))
        {
            return sr.ReadToEnd();
        }
    }
}
/// <summary>
/// Fetches another page of the site with a GET request, sending the cookie
/// header that PostLogin stored in the static cookieHeader field.
/// </summary>
/// <param name="strURL">Address of the page to fetch.</param>
/// <param name="strReferer">Value sent as the Referer header.</param>
/// <returns>The page content decoded as gb2312.</returns>
public static string GetPage(string strURL,string strReferer)
{
    HttpWebRequest myHttpWebRequest = (HttpWebRequest)WebRequest.Create(strURL);
    myHttpWebRequest.ContentType = "text/html";
    myHttpWebRequest.Method = "GET";
    myHttpWebRequest.Referer = strReferer;
    // Only send a Cookie header when a login has actually produced one.
    if (!string.IsNullOrEmpty(cookieHeader))
    {
        myHttpWebRequest.Headers.Add("Cookie", cookieHeader);
    }

    // "using" releases the connection and the reader even when reading fails.
    // Switch the encoding to utf-8 if the target site serves UTF-8 pages.
    using (HttpWebResponse response = (HttpWebResponse)myHttpWebRequest.GetResponse())
    using (StreamReader sr = new StreamReader(response.GetResponseStream(), Encoding.GetEncoding("gb2312")))
    {
        return sr.ReadToEnd();
    }
}
技術應用——網頁自動登錄(提交Post內容)的用途很多,如驗證身份、程序升級、網絡投票等,以下是用C#實現的方法.
未解決問題——目前最大的問題是無法繞過驗證碼——我曾經和同事討論圖片的算法,基本上很難識別。網上也有很多識別驗證碼的例子,對于簡單的噪聲還可以應付,可是對于復雜的就一點用都沒有了!到目前為止,我沒有測試成功過!如果你有測試成功過,請貼代碼,我們一起研究研究。