使用C#抓取网页内容并分析获取数据

使用C#抓取网页内容分析获取数据
/// <summary>
/// Click handler: downloads the demand list page, extracts the detail-page links found
/// between the list marker comments, then downloads every detail page and shows the
/// captured product-card fragment in a message box.
/// </summary>
/// <param name="sender">Control that raised the event (not used).</param>
/// <param name="e">Event data (not used).</param>
private void button5_Click(object sender, EventArgs e)
{
    WebHeaderCollection header = new WebHeaderCollection();
    header.Set("Pragma", "no-cache");
    string html = getHtml("www.biomart/info/infoDemand.htm?pge=1", header);

    // Keep only the markup between the two list marker comments. If the markers are
    // missing, the group value is an empty string and no links are found below.
    Regex listRegex = new Regex("<!-- 列表 -->(?<1>.*)<!-- /列表 -->");
    html = listRegex.Match(html).Groups[1].Value;

    Regex linkRegex = new Regex("href=\"(?<1>www\\.biomart\\/infodemand/\\w+\\.htm)\"");
    MatchCollection ms = linkRegex.Matches(html);

    // The cookie must be a single header value; a regular string literal cannot span
    // source lines, so it is assembled by concatenation instead.
    // NOTE(review): these are hard-coded cookie values, presumably captured from a
    // browser session - they are likely stale and should be confirmed or replaced.
    header.Set(HttpRequestHeader.Cookie,
        "__utma=124945049.1686326021.1305093063.1305164868.1305187067.3; " +
        "__utmz=124945049.1305093063.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); " +
        "JSESSIONID=9D7F7F4B5D73F453DA54B40A53D5E7C8; " +
        "__utmc=124945049; " +
        "__utmb=124945049.2.10.1305187067");

    // The pattern is loop-invariant, so build it once instead of once per link.
    Regex cardRegex = new Regex("<div class=\"product_card\">(?<1>.*)\\s+</p>\\s+</div>");

    foreach (Match m in ms)
    {
        string detailUrl = m.Groups[1].Value;
        MessageBox.Show(detailUrl);

        string content = getHtml(detailUrl, header);
        MessageBox.Show(cardRegex.Match(content).Groups[1].Value);
    }
}
/// <summary>
/// Downloads the page at <paramref name="url"/> with an HTTP request that carries the
/// supplied headers, and returns the body with all tab, carriage-return and line-feed
/// characters removed.
/// </summary>
/// <param name="url">
/// Address to fetch. When it contains no scheme separator ("://"), "http://" is prepended,
/// because <see cref="WebRequest.Create(string)"/> rejects scheme-less addresses.
/// </param>
/// <param name="header">Request headers to send; may be null to use the defaults.</param>
/// <returns>The response body decoded as UTF-8, without tabs or line breaks.</returns>
/// <exception cref="ArgumentNullException"><paramref name="url"/> is null.</exception>
/// <exception cref="WebException">The request failed or timed out.</exception>
private String getHtml(String url, WebHeaderCollection header)
{
    if (url == null)
    {
        throw new ArgumentNullException("url");
    }

    // Callers pass addresses such as "www.host/path"; give them a scheme so the
    // request can be created at all.
    if (url.IndexOf("://", StringComparison.Ordinal) < 0)
    {
        url = "http://" + url;
    }

    // Request the address that was asked for, not a fixed one.
    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
    request.Timeout = 30000;
    if (header != null)
    {
        request.Headers = header;
    }

    String content;
    // Dispose the response, stream and reader so the underlying connection is released.
    using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
    using (Stream stream = response.GetResponseStream())
    using (StreamReader reader = new StreamReader(stream, Encoding.UTF8))
    {
        content = reader.ReadToEnd();
    }

    // Collapse the document onto one line so that '.' in the callers' patterns can
    // match across what were originally separate lines.
    return Regex.Replace(content, "\\t|\\r|\\n", "");
}

本文发布于:2024-09-22 14:30:16,感谢您对本站的认可!

本文链接:https://www.17tex.com/tex/4/380033.html

版权声明:本站内容均来自互联网,仅供演示用,请勿用于商业和其他非法用途。如果侵犯了您的权益请与我们联系,我们将在24小时内删除。

标签:内容   抓取   分析
留言与评论(共有 0 条评论)
   
验证码:
Copyright ©2019-2024 Comsenz Inc.Powered by © 易纺专利技术学习网 豫ICP备2022007602号 豫公网安备41160202000603 站长QQ:729038198 关于我们 投诉建议