首页 > 专利查询

使用C#抓取网页内容并分析获取数据

使用C#抓取网页内容并分析获取数据

/// <summary>
/// Click handler for button5. Downloads a listing page, extracts the detail-page
/// links from it, then downloads each detail page and shows the captured
/// "product_card" fragment in a message box.
/// </summary>
/// <param name="sender">The control that raised the event (unused).</param>
/// <param name="e">Event data (unused).</param>
private void button5_Click(object sender, EventArgs e)
{
    WebHeaderCollection header = new WebHeaderCollection();
    header.Set("Pragma", "no-cache");

    // NOTE(review): this URL has no scheme ("http://") and the host has no TLD;
    // it looks truncated by whatever extracted this snippet. Confirm the real
    // address before running.
    string html = getHtml("www.biomart/info/infoDemand.htm?pge=1", header);

    // NOTE(review): as written this pattern captures the whole document into
    // group 1, so the assignment below is effectively a no-op. The original
    // pattern presumably had surrounding HTML tags that were lost - verify.
    Regex listRegex = new Regex("(?<1>.*)");
    html = listRegex.Match(html).Groups[1].Value;

    // Collect every detail-page link; group 1 holds the link target itself.
    Regex linkRegex = new Regex("href=\"(?<1>www\\.biomart\\/infodemand/\\w+\\.htm)\"");
    MatchCollection ms = linkRegex.Matches(html);

    // The detail pages are requested with a session cookie. The value is kept on
    // one logical line: a regular string literal cannot contain a raw line break.
    // NOTE(review): a hard-coded session cookie will expire and should not live
    // in source; consider loading it from configuration.
    header.Set(
        HttpRequestHeader.Cookie,
        "__utma=124945049.1686326021.1305093063.1305164868.1305187067.3; "
        + "__utmz=124945049.1305093063.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); JSESSIONID=9D7F7F4B5D73F453DA54B40A53D5E7C8; __utmc=124945049; __utmb=124945049.2.10.1305187067");

    // Built once outside the loop; the pattern does not depend on the current link.
    Regex cardRegex = new Regex("<div class=\"product_card\">(?<1>.*)\\s+</p>\\s+</div>");

    foreach (Match m in ms)
    {
        string link = m.Groups[1].Value;
        MessageBox.Show(link);

        string content = getHtml(link, header);

        // Shows an empty string when the page has no matching product_card block.
        MessageBox.Show(cardRegex.Match(content).Groups[1].Value);
    }
}

/// <summary>
/// Downloads the document at <paramref name="url"/> with an HTTP GET and returns
/// its body as a single line (all tab, carriage-return and line-feed characters
/// are removed so that later regular expressions can match across line breaks).
/// </summary>
/// <param name="url">
/// Address to fetch. When it contains no "://" scheme separator, "http://" is
/// prepended, because <c>WebRequest.Create</c> rejects scheme-less addresses.
/// </param>
/// <param name="header">
/// Request headers to send. May be null, in which case the request's default
/// headers are left untouched.
/// </param>
/// <returns>The response body decoded as UTF-8, with \t, \r and \n stripped.</returns>
/// <exception cref="ArgumentException"><paramref name="url"/> is null or empty.</exception>
private String getHtml(String url, WebHeaderCollection header)
{
    if (String.IsNullOrEmpty(url))
    {
        throw new ArgumentException("A URL is required.", "url");
    }

    // Callers pass scheme-less addresses; give them a default scheme instead of
    // letting WebRequest.Create fail with a UriFormatException.
    String address = url.IndexOf("://", StringComparison.Ordinal) >= 0
        ? url
        : "http://" + url;

    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(address);
    request.Timeout = 30000; // milliseconds

    // Use the caller's headers as given (they carry the Pragma/Cookie values).
    if (header != null)
    {
        request.Headers = header;
    }

    String content;

    // Dispose response, stream and reader deterministically so the underlying
    // connection is released even when reading throws.
    using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
    using (Stream stream = response.GetResponseStream())
    using (StreamReader reader = new StreamReader(stream, Encoding.UTF8))
    {
        // The body is always decoded as UTF-8, regardless of the charset the
        // server declares.
        content = reader.ReadToEnd();
    }

    return Regex.Replace(content, "\\t|\\r|\\n", "");
}

本文发布于:2024-09-22 14:30:16，感谢您对本站的认可！

本文链接：https://www.17tex.com/tex/4/380033.html

上一篇：如何查看 Android apk 的包名？

下一篇：木门窗制作的要求

标签：内容抓取分析

留言与评论（共有 0 条评论）