注意:并没有实现CSS中的图片采集,且图片的正则还有待完善。 
 复制代码 代码如下:   using System;  using System.Data;  using System.Configuration;  using System.Web;  using System.Web.Security;  using System.Web.UI;  using System.Web.UI.WebControls;  using System.Web.UI.WebControls.WebParts;  using System.Web.UI.HtmlControls; 
  //引入空间  using System.Net;  using System.IO;  using System.Text;  using System.Text.RegularExpressions;  using System.Collections; 
  /// <summary>
  /// Image collector: downloads a web page, finds the images referenced by
  /// <c>src</c> attributes in its markup and saves them to disk.
  /// Images referenced from CSS are not collected.
  /// </summary>
  public class caiji
  {
      /// <summary>
      /// Matches a quoted <c>src</c> attribute whose value ends in .gif, .jpg, .png or .bmp.
      /// Whitespace around '=' is tolerated, the closing quote must be the same
      /// character as the opening one, and the bare URL is exposed as group "url".
      /// </summary>
      private static readonly Regex ImageSrcPattern = new Regex(
          @"src\s*=\s*(?<quote>""|')(?<url>[^""'\s>]+\.(?:gif|jpg|png|bmp))\k<quote>",
          RegexOptions.IgnoreCase);

      /// <summary>
      /// Creates an instance. All functionality is exposed through static methods.
      /// </summary>
      public caiji()
      {
      }

      /// <summary>
      /// Downloads the page at <paramref name="url"/>, extracts the distinct image URLs
      /// found in src attributes and saves each image to disk.
      /// </summary>
      /// <param name="url">Absolute address of the page to collect images from.</param>
      /// <param name="chargest">Name of the character encoding used to decode the page (e.g. "utf-8").</param>
      /// <param name="path">
      /// Prefix prepended to every generated file name. It is concatenated as-is, so pass a
      /// directory path that ends with a directory separator to save into that directory.
      /// </param>
      /// <returns>
      /// An HTML fragment listing every image URL with the path it was saved to; for a failed
      /// download, the exception message is appended instead.
      /// </returns>
      public static string caijiByUrl(string url, string chargest, string path)
      {
          string html = GetSourceTextByUrl(url, chargest);

          // The page address is the base against which relative image paths are resolved.
          Uri pageUri = new Uri(url);

          ArrayList imageUrls = new ArrayList();
          foreach (Match match in ImageSrcPattern.Matches(html))
          {
              // Uri resolution handles absolute, root-relative ("/a.gif"), document-relative
              // ("img/a.gif") and protocol-relative ("//host/a.gif") references, and keeps the
              // original letter case and any non-default port.
              Uri imageUri;
              if (!Uri.TryCreate(pageUri, match.Groups["url"].Value, out imageUri))
              {
                  continue;
              }

              // Skip images that were already seen.
              string absoluteUrl = imageUri.AbsoluteUri;
              if (!imageUrls.Contains(absoluteUrl))
              {
                  imageUrls.Add(absoluteUrl);
              }
          }

          StringBuilder report = new StringBuilder();
          using (WebClient client = new WebClient())
          {
              for (int j = 0; j < imageUrls.Count; j++)
              {
                  string imageUrl = (string)imageUrls[j];

                  // The pattern guarantees the URL ends with a dot plus a three-letter extension.
                  string extension = imageUrl.Substring(imageUrl.Length - 4).ToLowerInvariant();

                  // File name: month, day, minute, second and the image index, then the extension.
                  DateTime now = DateTime.Now;
                  string savepath = path + now.Month + now.Day + now.Minute + now.Second + j + extension;

                  try
                  {
                      client.DownloadFile(new Uri(imageUrl), savepath);
                      report.Append(imageUrl + "<br /> 保存路径为:" + savepath + "<br /><br />");
                  }
                  catch (Exception e)
                  {
                      // Best effort: one failed image must not abort the remaining downloads.
                      report.Append(e.Message);
                  }
              }
          }

          return report.ToString();
      }

      /// <summary>
      /// Downloads the resource at <paramref name="url"/> and returns its body as text.
      /// </summary>
      /// <param name="url">Address of the resource to download.</param>
      /// <param name="chargest">Name of the character encoding used to decode the response.</param>
      /// <returns>The complete response body decoded with the given encoding.</returns>
      public static string GetSourceTextByUrl(string url, string chargest)
      {
          WebRequest request = WebRequest.Create(url);
          request.Timeout = 20000; // 20-second timeout

          // Dispose the response, stream and reader so the connection is released promptly.
          using (WebResponse response = request.GetResponse())
          using (Stream responseStream = response.GetResponseStream())
          using (StreamReader reader = new StreamReader(responseStream, Encoding.GetEncoding(chargest)))
          {
              return reader.ReadToEnd();
          }
      }
  }
    使用:比如我是保存到upload文件夹中的: 
 复制代码 代码如下:   string path = Server.MapPath("~/upload/");  Response.Write(caiji.caijiByUrl("http://www.jb51.net", "utf-8", path)); 
                          (编辑:焦作站长网) 
【声明】本站内容均来自网络,其相关言论仅代表作者个人观点,不代表本站立场。若无意侵犯到您的权利,请及时与联系站长删除相关内容! 
                     |