微信公众号搜"智元新知"关注
微信扫一扫可直接关注哦!

c# – 为什么下载的并发数量有限制?

我正在尝试制作自己的简单网络抓取工具。我想从URL下载具有特定扩展名的文件。我写了以下代码:
/// <summary>
/// Click handler: starts <c>bw</c> with the text of the <c>URL</c>, <c>SavePath</c> and
/// <c>Filter</c> fields packed into a string array. Does nothing while <c>bw</c> is busy.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
private void button1_Click(object sender, RoutedEventArgs e)
    {
        if (bw.IsBusy)
        {
            return;
        }

        // Detach before attaching so repeated clicks never leave more than one
        // subscription. Subscribing unconditionally on every click would make each
        // later RunWorkerAsync invoke bw_DoWork one additional time.
        // Removing a handler that is not subscribed is a no-op.
        bw.DoWork -= bw_DoWork;
        bw.DoWork += bw_DoWork;

        // Argument order is consumed by bw_DoWork: [0] URL, [1] save path, [2] filter.
        bw.RunWorkerAsync(new string[] { URL.Text, SavePath.Text, Filter.Text });
    }
    //--------------------------------------------------------------------------------------------
    /// <summary>
    /// DoWork handler: downloads the page at the given URL, extracts the href of every
    /// anchor tag, and passes each link whose href ends with one of the requested
    /// extensions to <c>Download</c>. Completion and failures are both reported through
    /// <c>ReportException</c>.
    /// </summary>
    /// <param name="sender">Event source (unused).</param>
    /// <param name="e">
    /// <c>e.Argument</c> must be a <c>string[]</c>: [0] page URL, [1] output directory,
    /// [2] extension list separated by ';', ',' or spaces.
    /// </param>
    void bw_DoWork(object sender, DoWorkEventArgs e)
    {
        try
        {
            // NOTE(review): this changes the process-wide thread pool limits, not just
            // this crawl - confirm that is intended.
            ThreadPool.SetMaxThreads(4, 4);

            // Direct cast: a wrong argument type surfaces as InvalidCastException here
            // instead of a NullReferenceException on first use.
            string[] strs = (string[])e.Argument;
            string domainS = strs[0];
            string OutDir = strs[1];
            string Extensions = strs[2];
            var domain = new Uri(domainS);
            string[] Filters = Extensions.Split(new char[] { ';', ',', ' ' }, StringSplitOptions.RemoveEmptyEntries);

            // Group 2 captures the quoted href value of each <a ...> tag.
            Regex reg = new Regex("<a(\\s*[^>]*?){0,1}\\s*href\\s*\\=\\s*\\\"([^>]*?)\\\"\\s*[^>]*>(.*?)</a>", RegexOptions.Compiled | RegexOptions.CultureInvariant | RegexOptions.IgnoreCase);

            string str;
            using (WebClient webClient = new WebClient())
            {
                str = webClient.DownloadString(domainS);
            }

            // Flatten line breaks so an anchor split across lines still matches.
            str = str.Replace("\r\n", " ").Replace('\n', ' ');
            MatchCollection mc = reg.Matches(str);

            // NOTE(review): concurrent connections to a single host are also capped by
            // ServicePointManager.DefaultConnectionLimit; raising MaxDegreeOfParallelism
            // alone does not raise that cap.
            Parallel.ForEach(mc.Cast<Match>(), new ParallelOptions { MaxDegreeOfParallelism = 2 }, mat =>
            {
                string val = mat.Groups[2].Value;
                // Resolve relative hrefs against the page URL.
                var link = new Uri(domain, val);
                foreach (string ext in Filters)
                {
                    // Ordinal comparison: extensions are identifiers, not linguistic text.
                    if (val.EndsWith("." + ext, StringComparison.Ordinal))
                    {
                        Download(new object[] { OutDir, link });
                        break;
                    }
                }
            });

            // Completion is signalled through the same reporting channel as errors,
            // but without throwing an exception purely for control flow.
            ReportException(new Exception("Finished !"));
        }
        catch (System.Exception ex)
        {
            ReportException(ex);
        }
    }
    //--------------------------------------------------------------------------------------------
    /// <summary>
    /// Downloads one link into the output directory unless a file with the same name
    /// already exists there. Any exception is passed to <c>ReportException</c>.
    /// </summary>
    /// <param name="o">
    /// An <c>object[]</c> holding [0] the output directory (<c>string</c>) and
    /// [1] the absolute link to download (<c>Uri</c>).
    /// </param>
    private static void Download(object o)
    {
        try
        {
            // Direct casts: a malformed argument fails here with InvalidCastException
            // (reported below) rather than a NullReferenceException on first use.
            object[] objs = (object[])o;
            string outDir = (string)objs[0];
            Uri link = (Uri)objs[1];

            // Name the file after the URI path only, so a query string or fragment
            // never ends up in the local file name.
            string fileName = System.IO.Path.GetFileName(link.LocalPath);
            string outPath = System.IO.Path.Combine(outDir, fileName);
            if (!File.Exists(outPath))
            {
                // WebClient.DownloadFile(link, outPath) was tried here first;
                // DownloadFile is the HttpWebRequest-based replacement.
                DownloadFile(link.ToString(), outPath);
            }
        }
        catch (System.Exception ex)
        {
            ReportException(ex);
        }
    }
    //--------------------------------------------------------------------------------------------
    /// <summary>
    /// Downloads <paramref name="url"/> with an <c>HttpWebRequest</c> and writes the
    /// response body to a new file at <paramref name="filePath"/>.
    /// </summary>
    /// <param name="url">Address of the resource to download.</param>
    /// <param name="filePath">
    /// Destination path. The file is opened with <c>FileMode.CreateNew</c>, so an
    /// existing file at this path makes the download fail.
    /// </param>
    /// <returns>
    /// <c>true</c> when the whole response was written; <c>false</c> when any exception
    /// occurred (the exception is passed to <c>ReportException</c>).
    /// </returns>
    private static bool DownloadFile(string url, string filePath)
    {
        try
        {
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
            request.UserAgent = "Web Crawler";
            request.Timeout = 40000; // milliseconds

            // Dispose the response and its stream even on failure so the underlying
            // connection is always released.
            using (WebResponse response = request.GetResponse())
            using (Stream stream = response.GetResponseStream())
            using (FileStream fs = new FileStream(filePath, FileMode.CreateNew))
            {
                stream.CopyTo(fs);
            }

            return true;
        }
        catch (System.Exception ex)
        {
            ReportException(ex);
            return false;
        }
    }

问题是虽然它适用于2个并行下载:

new ParallelOptions { MaxDegreeOfParallelism = 2,}

…它不适用于更大程度的并行性,如:

new ParallelOptions { MaxDegreeOfParallelism = 5,}

…我得到连接超时异常.

起初我以为是因为WebClient:

//WebClient webClient = new WebClient();
                //webClient.DownloadFile(link,outPath);

…但是当我用使用HttpWebRequest的函数DownloadFile替换它时,我仍然遇到错误.

我已在许多网页上测试过,结果都一样。我还用 Chrome 扩展“Download Master”确认过,这些Web服务器允许多个并行下载。
有没有人知道为什么我在尝试并行下载多个文件时会超时?

解决方法

您需要设置 ServicePointManager.DefaultConnectionLimit。同一主机的默认并发连接数是2。有关使用 web.config 中的 connectionManagement 进行配置的信息,请参阅相关的 SO 帖子。

原文地址:https://www.jb51.cc/csharp/99437.html

版权声明:本文内容由互联网用户自发贡献,该文观点与技术仅代表作者本人。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌侵权/违法违规的内容, 请发送邮件至 dio@foxmail.com 举报,一经查实,本站将立刻删除。

相关推荐