3-11

Profile Picture
- Published on Mar 11, 2020🌏 Public

发起Http请求

WebClient进行Http请求并获取内容

            // WebClient wraps the lower-level HttpWebRequest in an easier-to-call API
            // and is adequate for most sites.
            // For this particular site the output stayed garbled no matter which
            // character set was configured. The cause turned out to be that the site
            // GZip-compresses its content and this WebClient call does not decompress it.
            // WebClient is IDisposable, so it is scoped with `using` to release it promptly.
            using (WebClient client = new WebClient())
            {
                client.Encoding = Encoding.GetEncoding("utf-8");
                string str = client.DownloadString("http://www.xbiquge.la/21/21549/10832214.html");

                Console.WriteLine(str);
            }

对于一般网站,先确认网站的编码格式(通常在 Html 文件头部有描述):如果是 Utf-8,不用特地设置;如果是 Gb2312,则要修改 client 的 Encoding 属性。

如果发现网站修改编码格式还是乱码,那可能就是网站开启内容传输压缩了。

这时使用WebClient就没有太大意义了,我们换成更底层的HttpWebRequest

HttpWebRequest

重点是对于有GZip压缩的网站,需要配置AutomaticDecompression属性,设置自动解压。

            // HttpWebRequest is the lower-level API and exposes more options, including
            // automatic decompression of the page content.
            // Create the request instance; the explicit cast to HttpWebRequest is what makes
            // the AutomaticDecompression property available.
            HttpWebRequest xhr = (HttpWebRequest)HttpWebRequest.Create("http://www.xbiquge.la/21/21549/10832214.html");
            // [Key point] Enable GZip/Deflate decompression. This is not required for every site;
            // it is set here because the downloaded HTML is GZip-compressed and reads as garbage otherwise.
            xhr.AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate;

            string html;
            // Reading the HTML takes four steps. The response, stream and reader are wrapped in
            // `using` so they are released even when a step throws; otherwise the connection
            // would stay occupied.
            // 1. Get the server's response object.
            using (WebResponse res = xhr.GetResponse())
            // 2. Get the stream that carries the response body.
            using (Stream stream = res.GetResponseStream())
            // 3. Create a reader that decodes the stream as UTF-8 text.
            using (StreamReader sr = new StreamReader(stream, Encoding.UTF8))
            {
                // 4. Call ReadToEnd() to read the whole body as a string.
                html = sr.ReadToEnd();
            }

            Console.WriteLine(html);

由于我们需要频繁地调用获取页面内容的方法,我们将获取html内容的方法封装出来

先将url提成变量,然后,选取url后面一行开始到结尾内容,封装成新方法:

            // The address is held in a variable so that everything below it can later be
            // extracted into a method that takes the url as its parameter.
            string url = "http://www.xbiquge.la/21/21549/10832214.html";

            HttpWebRequest xhr = (HttpWebRequest)HttpWebRequest.Create(url);

            // Let the request decompress GZip/Deflate bodies automatically.
            xhr.AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate;

            string html;
            // `using` releases the response, stream and reader even if reading fails.
            using (WebResponse res = xhr.GetResponse())
            using (Stream stream = res.GetResponseStream())
            using (StreamReader sr = new StreamReader(stream, Encoding.UTF8))
            {
                html = sr.ReadToEnd();
            }

            Console.WriteLine(html);

            // Keep the console window open until a key is pressed.
            Console.ReadKey();
        /// <summary>
        /// Entry point: downloads one page through GetHtml, prints it, then waits for a key press.
        /// </summary>
        static void Main()
        {
            const string pageUrl = "http://www.xbiquge.la/21/21549/10832214.html";

            // Fetch the page and echo it straight to the console.
            Console.WriteLine(GetHtml(pageUrl));

            // Keep the console window open until a key is pressed.
            Console.ReadKey();
        }

        /// <summary>
        /// Downloads the page at <paramref name="url"/> and returns its body decoded as UTF-8 text.
        /// </summary>
        /// <param name="url">Address of the page; it must yield an HttpWebRequest (http/https).</param>
        /// <returns>The full response body as a string.</returns>
        private static string GetHtml(string url)
        {
            HttpWebRequest xhr = (HttpWebRequest)HttpWebRequest.Create(url);

            // Let the request decompress GZip/Deflate bodies automatically.
            xhr.AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate;

            // `using` releases the response, stream and reader on every path, including
            // when reading throws, so the connection is not left occupied.
            using (WebResponse res = xhr.GetResponse())
            using (Stream stream = res.GetResponseStream())
            using (StreamReader sr = new StreamReader(stream, Encoding.UTF8))
            {
                return sr.ReadToEnd();
            }
        }

以后要调用,使用GetHtml(地址)就可以了。

完整示例

using System;
using System.Collections;
using System.Collections.Generic;
using System.IO;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading;


namespace Test4
{
    /// <summary>
    /// Console sample that downloads every chapter linked from a novel's index page
    /// and saves each chapter as a text file.
    /// </summary>
    class Program
    {
        // Index page that lists the chapters; chapter links are resolved against it.
        private const string IndexUrl = "http://www.xbiquge.la/21/21549/";

        // Folder the chapter files are written to.
        private const string OutputDirectory = "E:\\C1\\";

        // Pause after each successfully saved chapter, in milliseconds.
        private const int DelayBetweenChaptersMs = 2000;

        /// <summary>
        /// Downloads the index page, extracts the chapter links and saves every chapter
        /// as "&lt;title&gt;.txt". A failing chapter is reported on stderr and skipped.
        /// </summary>
        static void Main()
        {
            string html = GetHtml(IndexUrl);
            html = GetArticleIndexHtml(html);
            var list = GetTagContent(html, "a", true);
            if (list.Count == 0)
            {
                Console.Error.WriteLine("No chapter links found on " + IndexUrl);
                return;
            }

            var indexUri = new Uri(IndexUrl);
            foreach (var item in list)
            {
                try
                {
                    var hrefs = GetTagAttr(item, "a", "href");
                    var titles = GetTagContent(item, "a", false);
                    if (hrefs.Count == 0 || titles.Count == 0)
                    {
                        Console.Error.WriteLine("Skipped link without href or title: " + item);
                        continue;
                    }

                    // Resolve the link against the index page so that both root-relative
                    // ("/21/21549/1.html") and page-relative ("1.html") hrefs work.
                    string chapterUrl = new Uri(indexUri, hrefs[0]).AbsoluteUri;
                    string ahtml = GetHtml(chapterUrl);
                    string txt = GetArticleText(ahtml);
                    if (txt.Length == 0)
                    {
                        Console.Error.WriteLine("No chapter content found at " + chapterUrl);
                        continue;
                    }

                    // No-op when the folder already exists.
                    Directory.CreateDirectory(OutputDirectory);
                    string fileName = ToSafeFileName(titles[0]) + ".txt";
                    File.WriteAllText(Path.Combine(OutputDirectory, fileName), txt);
                    Thread.Sleep(DelayBetweenChaptersMs);
                }
                catch (Exception ex)
                {
                    // Best effort: report the failing chapter and carry on with the next one.
                    Console.Error.WriteLine("Failed to download chapter " + item + ": " + ex.Message);
                }
            }
        }

        /// <summary>
        /// Returns the part of the index page that starts at the chapter-list div.
        /// </summary>
        /// <param name="html">HTML of the index page.</param>
        /// <returns>
        /// The text from <c>&lt;div id="list"&gt;</c> up to (not including) the next <c>&lt;/div&gt;</c>,
        /// or an empty string when the opening marker is absent.
        /// </returns>
        private static string GetArticleIndexHtml(string html)
        {
            return ExtractFromMarkerToDivEnd(html, "<div id=\"list\">");
        }

        /// <summary>
        /// Extracts the chapter text from a chapter page.
        /// </summary>
        /// <param name="html">HTML of the chapter page.</param>
        /// <returns>
        /// The plain text of the <c>&lt;div id="content"&gt;</c> section, or an empty string
        /// when the page has no such section.
        /// </returns>
        private static string GetArticleText(string html)
        {
            return GetSimpleText(ExtractFromMarkerToDivEnd(html, "<div id=\"content\">"));
        }

        /// <summary>
        /// Returns the slice of <paramref name="html"/> that begins at <paramref name="startMarker"/>
        /// and ends just before the first <c>&lt;/div&gt;</c> that follows it.
        /// </summary>
        /// <param name="html">Text to search; may be null or empty.</param>
        /// <param name="startMarker">Literal text that opens the wanted section.</param>
        /// <returns>
        /// The slice including the marker; an empty string when the marker is missing;
        /// everything from the marker onwards when no closing <c>&lt;/div&gt;</c> follows.
        /// </returns>
        private static string ExtractFromMarkerToDivEnd(string html, string startMarker)
        {
            if (string.IsNullOrEmpty(html))
            {
                return string.Empty;
            }

            int startIndex = html.IndexOf(startMarker, StringComparison.Ordinal);
            if (startIndex < 0)
            {
                // Substring(-1) would throw; a missing section is reported as "nothing found".
                return string.Empty;
            }

            int endIndex = html.IndexOf("</div>", startIndex, StringComparison.Ordinal);
            return endIndex < 0
                ? html.Substring(startIndex)
                : html.Substring(startIndex, endIndex - startIndex);
        }

        /// <summary>
        /// Replaces every character that is not allowed in a file name with an underscore.
        /// </summary>
        /// <param name="name">Proposed file name without directory.</param>
        /// <returns>A name that can be passed to the file APIs.</returns>
        private static string ToSafeFileName(string name)
        {
            foreach (char invalid in Path.GetInvalidFileNameChars())
            {
                name = name.Replace(invalid, '_');
            }
            return name;
        }

        /// <summary>
        /// Downloads the page at <paramref name="url"/> and returns its body decoded as UTF-8 text.
        /// </summary>
        /// <param name="url">Address of the page; it must yield an HttpWebRequest (http/https).</param>
        /// <returns>The full response body as a string.</returns>
        private static string GetHtml(string url)
        {
            HttpWebRequest xhr = (HttpWebRequest)HttpWebRequest.Create(url);

            // Let the request decompress GZip/Deflate bodies automatically.
            xhr.AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate;

            // `using` releases the response, stream and reader on every path; exceptions
            // propagate to the caller unchanged.
            using (WebResponse res = xhr.GetResponse())
            using (Stream stream = res.GetResponseStream())
            using (StreamReader sr = new StreamReader(stream, Encoding.UTF8))
            {
                return sr.ReadToEnd();
            }
        }

        /// <summary>
        /// * Reduces an HTML fragment to plain text.
        /// </summary>
        /// <param name="html">HTML fragment; null or empty yields an empty string.</param>
        /// <returns>
        /// The text with each <c>&lt;br&gt;</c> variant turned into "\r\n", all other tags removed
        /// and <c>&amp;nbsp;</c> replaced by a space.
        /// </returns>
        public static string GetSimpleText(string html)
        {
            if (string.IsNullOrEmpty(html))
            {
                return string.Empty;
            }

            // Line breaks must be converted before the generic tag stripping removes them.
            html = Regex.Replace(html, @"<br\s*/?>", "\r\n", RegexOptions.IgnoreCase);
            html = Regex.Replace(html, @"<\/*[^<>]*>", "", RegexOptions.IgnoreCase);
            html = html.Replace("&nbsp;", " ");
            return html;
        }

        /// <summary>
        /// * Collects the values of one attribute from every occurrence of a tag.
        /// </summary>
        /// <param name="str">Text to search; null or empty yields an empty list.</param>
        /// <param name="tagName">Tag name; inserted into the regular expression as-is.</param>
        /// <param name="attrib">Attribute name; inserted into the regular expression as-is.</param>
        /// <returns>Distinct, non-empty attribute values in order of first appearance.</returns>
        public static List<string> GetTagAttr(string str, string tagName, string attrib)
        {
            List<string> list = new List<string>();
            if (string.IsNullOrEmpty(str))
            {
                return list;
            }

            // \b keeps "a" from matching "<abbr"; \s keeps "href" from matching "data-href".
            // The value may be single-quoted, double-quoted or unquoted; \1 demands the same closing quote.
            string tmpStr = string.Format(@"<{0}\b[^>]*?\s{1}=(['""]?)(?<url>[^'""\s>]+)\1[^>]*>", tagName, attrib);

            MatchCollection titleMatch = Regex.Matches(str, tmpStr, RegexOptions.IgnoreCase);

            // The set makes the duplicate check O(1); the list keeps first-seen order.
            HashSet<string> seen = new HashSet<string>();
            foreach (Match m in titleMatch)
            {
                string result = m.Groups["url"].Value;
                if (string.IsNullOrEmpty(result) || !seen.Add(result))
                {
                    continue;
                }
                list.Add(result);
            }
            return list;
        }

        /// <summary>
        /// * Collects the content of every occurrence of a tag whose body contains no nested tags.
        /// </summary>
        /// <param name="str">Text to search; null or empty yields an empty list.</param>
        /// <param name="tagName">Tag name; inserted into the regular expression as-is.</param>
        /// <param name="containTag">
        /// true to return the whole element including its tags, false to return only the inner text.
        /// </param>
        /// <returns>Distinct, non-empty results in order of first appearance.</returns>
        public static List<string> GetTagContent(string str, string tagName, bool containTag)
        {
            List<string> list = new List<string>();
            if (string.IsNullOrEmpty(str))
            {
                return list;
            }

            // \b keeps "a" from matching "<abbr"; [^<]* limits matches to elements without child tags.
            string tmpStr = string.Format(@"(?<Tag><{0}\b[^>]*?>(?<Text>[^<]*)</{0}>)", tagName);

            MatchCollection titleMatch = Regex.Matches(str, tmpStr, RegexOptions.IgnoreCase);

            // The set makes the duplicate check O(1); the list keeps first-seen order.
            HashSet<string> seen = new HashSet<string>();
            foreach (Match m in titleMatch)
            {
                string result = containTag ? m.Groups["Tag"].Value : m.Groups["Text"].Value;
                if (string.IsNullOrEmpty(result) || !seen.Add(result))
                {
                    continue;
                }
                list.Add(result);
            }
            return list;
        }
    }
}

下载图片

        /// <summary>
        /// Demonstrates three ways of downloading the same picture into E:\C1\:
        /// WebClient.DownloadFile, HttpWebRequest with the name taken from the url, and
        /// HttpWebRequest with a timestamp name whose extension comes from the image format.
        /// </summary>
        static void Main()
        {

            var url = "https://pic.cnblogs.com/avatar/227460/20160408163205.png";
            // Everything after the last '/' is used as the local file name.
            var filename = url.Substring(url.LastIndexOf('/') + 1);

            // Approach 1: download with WebClient.
            using (WebClient client = new WebClient())
            {
                client.DownloadFile(url, "E:\\C1\\" + filename);
            }


            // Approach 2: download with HttpWebRequest and save under the name taken from the url.
            HttpWebRequest xhr = (HttpWebRequest)HttpWebRequest.Create(url);
            using (WebResponse res = xhr.GetResponse())
            using (Stream stream = res.GetResponseStream())
            using (System.Drawing.Image downloaded = System.Drawing.Image.FromStream(stream))
            {
                // The image is disposed before the stream, so the stream stays open while it is in use.
                downloaded.Save("E:\\C1\\" + filename);
            }



            // Approach 3: detect the format from the downloaded image and save it under a
            // timestamp name with the matching extension.
            // This must read from its own request (xhr2): the response of the previous
            // request has already been consumed.
            HttpWebRequest xhr2 = (HttpWebRequest)HttpWebRequest.Create(url);
            using (WebResponse res2 = xhr2.GetResponse())
            using (Stream stream2 = res2.GetResponseStream())
            using (var img = System.Drawing.Image.FromStream(stream2))
            {
                img.Save("E:\\C1\\" + DateTime.Now.ToString("yyyyMMddHHmmssfff") + GetImageExt(img));
            }

        }

        /// <summary>
        /// Maps the raw format of an image to a file extension.
        /// </summary>
        /// <param name="_img">Image whose RawFormat is inspected.</param>
        /// <returns>
        /// ".png", ".jpg", ".gif" or ".bmp" for the matching format; null for any other format.
        /// </returns>
        private static string GetImageExt(Image _img)
        {
            System.Drawing.Imaging.ImageFormat raw = _img.RawFormat;

            // Checked in the order PNG, JPEG, GIF, BMP; the first match wins.
            return raw.Equals(System.Drawing.Imaging.ImageFormat.Png) ? ".png"
                 : raw.Equals(System.Drawing.Imaging.ImageFormat.Jpeg) ? ".jpg"
                 : raw.Equals(System.Drawing.Imaging.ImageFormat.Gif) ? ".gif"
                 : raw.Equals(System.Drawing.Imaging.ImageFormat.Bmp) ? ".bmp"
                 : null;
        }

练习

找一个小说站,找个小说,去下载小说的所有章节到本地磁盘上。

上面的加*号的方法,不需要自己写,直接复制使用即可。