第一種方法:用正則表達式來分析
- 轉自網上的一個實例:所有的href都抽取出來:??
- using?System;??
- using?System.Net;??
- using?System.Text;??
- using?System.Text.RegularExpressions;??
- namespace?HttpGet??
- {??
- ????class?Class1??
- ????{??
- ????????[STAThread]??
- ????????static?void?Main(string[]?args)??
- ????????{??
- ????????????System.Net.WebClient?client?=?new?WebClient();??
- ????????????byte[]?page?=?client.DownloadData("http://www.google.com");??
- ????????????string?content?=?System.Text.Encoding.UTF8.GetString(page);??
- ????????????string?regex?=?"href=[\\\"\\\'](http:\\/\\/|\\.\\/|\\/)?\\w+(\\.\\w+)*(\\/\\w+(\\.\\w+)?)*(\\/|\\?\\w*=\\w*(&\\w*=\\w*)*)?[\\\"\\\']";??
- ????????????Regex?re?=?new?Regex(regex);??
- ????????????MatchCollection?matches?=?re.Matches(content);??
- ??
- ????????????System.Collections.IEnumerator?enu?=?matches.GetEnumerator();??
- ????????????while?(enu.MoveNext()?&&?enu.Current?!=?null)??
- ????????????{??
- ????????????????Match?match?=?(Match)(enu.Current);??
- ????????????????Console.Write(match.Value?+?"\r\n");??
- ????????????}??
- ????????}??
- ????}??
- }??
- ??
- 一些爬蟲的HTML解析中也是用的類似的方法。??
1.? 下載:
http://www.netomatix.com/Products/DocumentManagement/HTMLParserNet.aspx
2. 使用實例1
- using?System;??
- using?System.Collections.Generic;??
- using?System.ComponentModel;??
- using?System.Data;??
- using?System.Drawing;??
- using?System.Linq;??
- using?System.Text;??
- using?System.Windows.Forms;??
- using?Winista.Text.HtmlParser;??
- using?Winista.Text.HtmlParser.Lex;??
- using?Winista.Text.HtmlParser.Util;??
- using?Winista.Text.HtmlParser.Tags;??
- using?Winista.Text.HtmlParser.Filters;??
- ??
- ??
- namespace?HTMLParser??
- {??
- ????public?partial?class?Form1?:?Form??
- ????{??
- ????????public?Form1()??
- ????????{??
- ????????????InitializeComponent();??
- ????????????AddUrl();??
- ????????}??
- ??
- ????????private?void?btnParser_Click(object?sender,?EventArgs?e)??
- ????????{??
- ????????????#region?獲得網頁的html??
- ????????????try??
- ????????????{??
- ??
- ????????????????txtHtmlWhole.Text?=?"";??
- ????????????????string?url?=?CBUrl.SelectedItem.ToString().Trim();??
- ????????????????System.Net.WebClient?aWebClient?=?new?System.Net.WebClient();??
- ????????????????aWebClient.Encoding?=?System.Text.Encoding.Default;??
- ????????????????string?html?=?aWebClient.DownloadString(url);??
- ????????????????txtHtmlWhole.Text?=?html;??
- ????????????}??
- ????????????catch?(Exception?ex)??
- ????????????{??
- ????????????????MessageBox.Show(ex.Message);??
- ????????????}??
- ????????????#endregion??
- ?
- ????????????#region?分析網頁html節點??
- ????????????Lexer?lexer?=?new?Lexer(this.txtHtmlWhole.Text);??
- ????????????Parser?parser?=?new?Parser(lexer);??
- ????????????NodeList?htmlNodes?=?parser.Parse(null);??
- ????????????this.treeView1.Nodes.Clear();??
- ????????????this.treeView1.Nodes.Add("root");??
- ????????????TreeNode?treeRoot?=?this.treeView1.Nodes[0];??
- ????????????for?(int?i?=?0;?i?<?htmlNodes.Count;?i++)??
- ????????????{??
- ????????????????this.RecursionHtmlNode(treeRoot,?htmlNodes[i],?false);??
- ????????????}??
- ?
- ????????????#endregion??
- ??
- ????????}??
- ??
- ????????private?void?RecursionHtmlNode(TreeNode?treeNode,?INode?htmlNode,?bool?siblingRequired)??
- ????????{??
- ????????????if?(htmlNode?==?null?||?treeNode?==?null)?return;??
- ??
- ????????????TreeNode?current?=?treeNode;??
- ????????????TreeNode?content?;??
- ????????????//current?node??
- ????????????if?(htmlNode?is?ITag)??
- ????????????{??
- ????????????????ITag?tag?=?(htmlNode?as?ITag);??
- ????????????????if?(!tag.IsEndTag())??
- ????????????????{??
- ????????????????????string?nodeString?=?tag.TagName;??
- ????????????????????if?(tag.Attributes?!=?null?&&?tag.Attributes.Count?>?0)??
- ????????????????????{??
- ????????????????????????if?(tag.Attributes["ID"]?!=?null)??
- ????????????????????????{??
- ????????????????????????????nodeString?=?nodeString?+?"?{?id=\""?+?tag.Attributes["ID"].ToString()?+?"\"?}";??
- ????????????????????????}??
- ????????????????????????if?(tag.Attributes["HREF"]?!=?null)??
- ????????????????????????{??
- ????????????????????????????nodeString?=?nodeString?+?"?{?href=\""?+?tag.Attributes["HREF"].ToString()?+?"\"?}";??
- ????????????????????????}??
- ????????????????????}??
- ??????????????????????
- ????????????????????current?=?new?TreeNode(nodeString);??
- ????????????????????treeNode.Nodes.Add(current);??
- ????????????????}??
- ????????????}??
- ??
- ????????????//獲取節點間的內容??
- ????????????if?(htmlNode.Children?!=?null?&&?htmlNode.Children.Count?>?0)??
- ????????????{??
- ????????????????this.RecursionHtmlNode(current,?htmlNode.FirstChild,?true);??
- ????????????????content?=?new?TreeNode(htmlNode.FirstChild.GetText());??
- ????????????????treeNode.Nodes.Add(content);??
- ????????????}??
- ??
- ????????????//the?sibling?nodes??
- ????????????if?(siblingRequired)??
- ????????????{??
- ????????????????INode?sibling?=?htmlNode.NextSibling;??
- ????????????????while?(sibling?!=?null)??
- ????????????????{??
- ????????????????????this.RecursionHtmlNode(treeNode,?sibling,?false);??
- ????????????????????sibling?=?sibling.NextSibling;??
- ????????????????}??
- ????????????}??
- ????????}??
- ????????private?void?AddUrl()??
- ????????{??
- ????????????CBUrl.Items.Add("http://www.hao123.com");??
- ????????????CBUrl.Items.Add("http://www.sina.com");??
- ????????????CBUrl.Items.Add("http://www.heuet.edu.cn");??
- ????????}??
- ??
- ??????????
- ??
- ????}??
- }??
運行效果:
實現取來很容易,結合Winista.Htmlparser源碼很快就可以實現想要的效果。
3. 使用實例2
- using?Winista.Text.HtmlParser;??
- ??
- using?Winista.Text.HtmlParser.Tags;??
- ??
- using?Winista.Text.HtmlParser.Filters;??
- ??
- using?Winista.Text.HtmlParser.Util;??
- ??
- ????????????string?str?=?"<table><tr><td>姓名</td><td>林肯</td></tr><tr><td>年齡</td><td>28</td></tr><tr><td>性別</td><td>男</td></tr><tr><td>姓名</td><td>克林頓</td></tr><tr><td>年齡</td><td>38</td></tr><tr><td>性別</td><td>男</td></tr></table>";??
- ??
- ????????????Parser?parser?=?Parser.CreateParser(?str?,?null?);??
- ??
- ????????????NodeList?nodes?=?parser.ExtractAllNodesThatMatch(?new?TagNameFilter("td")?);??
- ??
- ????????????this.Page.Response.Write("<b>原來的html:</b>"?+?str??);??
- ??
- ????????????this.Page.Response.Write("<b>轉換后的html:</b><br>"?);??
- ??
- ????????????for(?int?i?=?5?;?i?>=?0?;?i--?)??
- ??
- ????????????{??
- ??
- ????????????????this.Page.Response.Write(?nodes[i*2].FirstChild.ToHtml()?+?nodes[i*2+1].FirstChild.GetText()?+?"<br>"?);??
- ??
- ????????????}??
運行結果如下:
原來的html:
姓名 | 林肯 |
年齡 | 28 |
性別 | 男 |
姓名 | 克林頓 |
年齡 | 38 |
性別 | 男 |
性別男
年齡38
姓名克林頓
性別男
年齡28
姓名林肯
0
0