3-11
发起Http请求
WebClient进行Http请求并获取内容
//WebClient wraps the lower-level HttpWebRequest in a much easier-to-call API
//and is good enough for most situations.
//For this particular site, though, the output was garbled no matter which
//character set we tried. The cause: the server GZip-compresses the body and
//WebClient does not decompress it, so the bytes never decode correctly.
//NOTE(review): WebClient is IDisposable; wrap it in a using block in real code.
WebClient client = new WebClient();
client.Encoding = Encoding.GetEncoding("utf-8");
string str= client.DownloadString("http://www.xbiquge.la/21/21549/10832214.html");
Console.WriteLine(str);
一般网站,确认网站的编码格式,一般在Html文件头部中有描述编码格式,如果Utf-8,不用特地设置,Gb2312要修改client的Encoding属性。
如果发现网站修改编码格式还是乱码,那可能就是网站开启内容传输压缩了。
这时使用WebClient就没有太大意义了,我们换成更底层的HttpWebRequest
HttpWebRequest
重点是对于有GZip压缩的网站,需要配置AutomaticDecompression属性,设置自动解压。
//HttpWebRequest is lower level than WebClient and far more configurable —
//including automatic decompression of the response body.
//Create the request. The explicit cast to HttpWebRequest is required because
//AutomaticDecompression is declared on HttpWebRequest, not on WebRequest.
HttpWebRequest xhr = (HttpWebRequest)HttpWebRequest.Create("http://www.xbiquge.la/21/21549/10832214.html");
//[Key point] enable GZip/Deflate decompression. Not always necessary, but here
//it fixes the garbled text caused by the server GZip-compressing the html.
xhr.AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate;
//Reading the html content takes five steps:
//1. get the server's response object
WebResponse res = xhr.GetResponse();
//2. get the raw binary response stream
Stream stream = res.GetResponseStream();
//3. wrap the stream in a reader that decodes it as UTF-8
StreamReader sr = new StreamReader(stream, Encoding.UTF8);
//4. call ReadToEnd() to pull the whole document into a string
string html = sr.ReadToEnd();
//5. dispose the reader, otherwise the connection stays occupied
//   NOTE(review): the WebResponse should be disposed as well (using block).
sr.Dispose();
Console.WriteLine(html);
由于我们需要频繁地调用获取页面内容的方法,我们将获取html内容的方法封装出来
先将url提成变量,然后,选取url后面一行开始到结尾内容,封装成新方法:
//Same download logic with the url pulled out into a variable; everything after
//the url line is about to be extracted into a reusable method.
string url = "http://www.xbiquge.la/21/21549/10832214.html";
HttpWebRequest xhr = (HttpWebRequest)HttpWebRequest.Create(url);
//Enable automatic GZip/Deflate decompression of the response body.
xhr.AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate;
WebResponse res = xhr.GetResponse();
Stream stream = res.GetResponseStream();
StreamReader sr = new StreamReader(stream, Encoding.UTF8);
string html = sr.ReadToEnd();
Console.WriteLine(html);
//Keep the console window open until a key is pressed.
Console.ReadKey();
//Entry point: download one chapter page via the extracted helper and print it.
static void Main()
{
string url = "http://www.xbiquge.la/21/21549/10832214.html";
string html = GetHtml(url);
Console.WriteLine(html);
//Keep the console window open until a key is pressed.
Console.ReadKey();
}
/// <summary>
/// Downloads the html content of a page, transparently decompressing
/// GZip/Deflate responses (compressed pages otherwise decode as garbage).
/// </summary>
/// <param name="url">Absolute url of the page to fetch.</param>
/// <returns>The page content decoded as UTF-8.</returns>
private static string GetHtml(string url)
{
HttpWebRequest xhr = (HttpWebRequest)HttpWebRequest.Create(url);
xhr.AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate;
// BUG FIX: the response, stream and reader were never disposed, so each call
// kept a connection occupied. using guarantees release even if reading fails.
using (WebResponse res = xhr.GetResponse())
using (Stream stream = res.GetResponseStream())
using (StreamReader sr = new StreamReader(stream, Encoding.UTF8))
{
return sr.ReadToEnd();
}
}
以后要调用,使用GetHtml(地址)就可以了。
完整示例
using System;
using System.Collections;
using System.Collections.Generic;
using System.IO;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading;
namespace Test4
{
    /// <summary>
    /// Crawls a novel site's chapter index and saves each chapter as a .txt file.
    /// </summary>
    class Program
    {
        static void Main()
        {
            // Download the chapter index page and narrow it to the list <div>.
            string html = GetHtml("http://www.xbiquge.la/21/21549/");
            html = GetArticleIndexHtml(html);
            // Each chapter link is an <a> tag; keep the whole tag so we can
            // read both its href and its text afterwards.
            var list = GetTagContent(html, "a", true);
            foreach (var item in list)
            {
                try
                {
                    var href = GetTagAttr(item, "a", "href")[0];
                    var title = GetTagContent(item, "a", false)[0];
                    string ahtml = GetHtml("http://www.xbiquge.la" + href);
                    string txt = GetArticleText(ahtml);
                    File.WriteAllText("E:\\C1\\" + title + ".txt", txt);
                    // Be polite to the server: pause between requests.
                    Thread.Sleep(2000);
                }
                catch (Exception ex)
                {
                    // BUG FIX: the catch block used to swallow every error
                    // silently; at least report the failure before moving on.
                    Console.WriteLine("Failed to save a chapter: " + ex.Message);
                }
            }
        }

        /// <summary>
        /// Extracts the chapter-list fragment (&lt;div id="list"&gt; ...) from the index page.
        /// </summary>
        /// <param name="html">Full html of the index page.</param>
        /// <returns>Html from the list div up to its first closing &lt;/div&gt;.</returns>
        private static string GetArticleIndexHtml(string html)
        {
            var startIndex = html.IndexOf("<div id=\"list\">");
            if (startIndex < 0)
            {
                // Fail with a clear message instead of an opaque
                // ArgumentOutOfRangeException from Substring(-1).
                throw new InvalidOperationException("Index page does not contain <div id=\"list\">.");
            }
            html = html.Substring(startIndex);
            var endIndex = html.IndexOf("</div>");
            if (endIndex < 0)
            {
                throw new InvalidOperationException("Unterminated <div id=\"list\"> in index page.");
            }
            return html.Substring(0, endIndex);
        }

        /// <summary>
        /// Extracts a chapter's plain-text body from its page html.
        /// </summary>
        /// <param name="html">Html of a chapter page.</param>
        /// <returns>Plain text of the &lt;div id="content"&gt; section.</returns>
        private static string GetArticleText(string html)
        {
            var startIndex = html.IndexOf("<div id=\"content\">");
            if (startIndex < 0)
            {
                throw new InvalidOperationException("Chapter page does not contain <div id=\"content\">.");
            }
            html = html.Substring(startIndex);
            var endIndex = html.IndexOf("</div>");
            if (endIndex < 0)
            {
                throw new InvalidOperationException("Unterminated <div id=\"content\"> in chapter page.");
            }
            return GetSimpleText(html.Substring(0, endIndex));
        }

        /// <summary>
        /// Downloads the html of a page, transparently decompressing
        /// GZip/Deflate responses (otherwise compressed pages come back garbled).
        /// </summary>
        /// <param name="url">Absolute url of the page.</param>
        /// <returns>The page content decoded as UTF-8.</returns>
        private static string GetHtml(string url)
        {
            HttpWebRequest xhr = (HttpWebRequest)HttpWebRequest.Create(url);
            xhr.AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate;
            try
            {
                // BUG FIX: response and stream were never disposed; using
                // releases the connection even if reading fails part-way.
                using (WebResponse res = xhr.GetResponse())
                using (Stream stream = res.GetResponseStream())
                using (StreamReader sr = new StreamReader(stream, Encoding.UTF8))
                {
                    return sr.ReadToEnd();
                }
            }
            catch (Exception ex)
            {
                // The original kept the url in an unused local purely for
                // breakpoint inspection; report it instead, then rethrow
                // with `throw;` so the stack trace is preserved.
                Console.Error.WriteLine("Download failed for " + url + ": " + ex.Message);
                throw;
            }
        }

        /// <summary>
        /// *Strips html down to plain text: &lt;br&gt; becomes a newline, all other
        /// tags are removed, and &amp;nbsp; entities become spaces.
        /// </summary>
        /// <param name="html">Html fragment to flatten.</param>
        /// <returns>Plain text with \r\n line breaks.</returns>
        public static string GetSimpleText(string html)
        {
            html = Regex.Replace(html, @"<br\s?/?>", "\r\n", RegexOptions.IgnoreCase);
            html = Regex.Replace(html, @"<\/*[^<>]*>", "", RegexOptions.IgnoreCase);
            // BUG FIX: the "&nbsp;" entity text survives tag stripping; the
            // original line replaced a space with a space (a no-op, almost
            // certainly a mangled Replace("&nbsp;", " ")).
            html = html.Replace("&nbsp;", " ");
            return html;
        }

        /// <summary>
        /// *Collects the distinct values of a given attribute on a given tag.
        /// </summary>
        /// <param name="str">Html to search.</param>
        /// <param name="tagName">Tag name, e.g. "a".</param>
        /// <param name="attrib">Attribute name, e.g. "href".</param>
        /// <returns>Distinct, non-empty attribute values in document order.</returns>
        public static List<string> GetTagAttr(string str, string tagName, string attrib)
        {
            // Matches <tag ... attrib='value' ...>; the quote (if any) is
            // captured so \1 requires the matching closing quote.
            string tmpStr = string.Format("<{0}[^>]*?{1}=(['\"\"]?)(?<url>[^'\"\"\\s>]+)\\1[^>]*>", tagName, attrib);
            MatchCollection titleMatch = Regex.Matches(str, tmpStr, RegexOptions.IgnoreCase);
            List<string> list = new List<string>();
            foreach (Match m in titleMatch)
            {
                string result = m.Groups["url"].Value;
                if (string.IsNullOrEmpty(result) || list.Contains(result)) continue;
                list.Add(result);
            }
            return list;
        }

        /// <summary>
        /// *Collects the distinct occurrences of a given tag in a string.
        /// </summary>
        /// <param name="str">Html to search.</param>
        /// <param name="tagName">Tag name, e.g. "a".</param>
        /// <param name="containTag">True to return the whole tag, false for just its inner text.</param>
        /// <returns>Distinct, non-empty matches in document order.</returns>
        public static List<string> GetTagContent(string str, string tagName, bool containTag)
        {
            // Inner text is limited to [^<]*, so nested tags are not matched.
            string tmpStr = string.Format("(?<Tag><{0}[^>]*?>(?<Text>[^<]*)</{1}>)", tagName, tagName);
            MatchCollection titleMatch = Regex.Matches(str, tmpStr, RegexOptions.IgnoreCase);
            List<string> list = new List<string>();
            foreach (Match m in titleMatch)
            {
                string result = containTag ? m.Groups["Tag"].Value : m.Groups["Text"].Value;
                if (string.IsNullOrEmpty(result) || list.Contains(result)) continue;
                list.Add(result);
            }
            return list;
        }
    }
}
下载图片
// Three ways to download an image file.
static void Main()
{
    var url = "https://pic.cnblogs.com/avatar/227460/20160408163205.png";
    // File name = everything after the last '/'.
    var filename = url.Substring(url.LastIndexOf("/") + 1);

    // 1) Simplest: WebClient downloads straight to a file.
    using (WebClient client = new WebClient())
    {
        client.DownloadFile("https://pic.cnblogs.com/avatar/227460/20160408163205.png", "E:\\C1\\" + filename);
    }

    // 2) HttpWebRequest: read the response stream and save it via Image.
    HttpWebRequest xhr = (HttpWebRequest)HttpWebRequest.Create("https://pic.cnblogs.com/avatar/227460/20160408163205.png");
    using (WebResponse res = xhr.GetResponse())
    using (Stream stream = res.GetResponseStream())
    {
        System.Drawing.Image.FromStream(stream).Save("E:\\C1\\" + filename);
    }

    // 3) Detect the real image format from the bytes and name the file accordingly.
    HttpWebRequest xhr2 = (HttpWebRequest)HttpWebRequest.Create("https://pic.cnblogs.com/avatar/227460/20160408163205.png");
    // BUG FIX: the original called xhr.GetResponse()/res.GetResponseStream()
    // here (copy-paste), re-reading the first, already-consumed response
    // instead of using the second request (xhr2/res2).
    using (WebResponse res2 = xhr2.GetResponse())
    using (Stream stream2 = res2.GetResponseStream())
    using (var img = System.Drawing.Image.FromStream(stream2))
    {
        img.Save("E:\\C1\\" + DateTime.Now.ToString("yyyyMMddHHmmssfff") + GetImageExt(img));
    }
}
/// <summary>
/// Maps an image's actual raw format to a file extension.
/// </summary>
/// <param name="_img">The decoded image to inspect.</param>
/// <returns>".png", ".jpg", ".gif" or ".bmp"; null for any other format.</returns>
private static string GetImageExt(Image _img)
{
    // Hoist RawFormat once instead of re-reading the property per comparison.
    var format = _img.RawFormat;
    if (format.Equals(System.Drawing.Imaging.ImageFormat.Png)) return ".png";
    if (format.Equals(System.Drawing.Imaging.ImageFormat.Jpeg)) return ".jpg";
    if (format.Equals(System.Drawing.Imaging.ImageFormat.Gif)) return ".gif";
    if (format.Equals(System.Drawing.Imaging.ImageFormat.Bmp)) return ".bmp";
    // Unknown format: caller must handle the missing extension.
    return null;
}
练习
找一个小说站,找个小说,去下载小说的所有章节到本地磁盘上。
上面的加*号的方法,不需要自己写,直接复制使用即可。