注意:并没有实现CSS中的图片采集,且图片的正则还有待完善。
复制代码 代码如下: using System; using System.Data; using System.Configuration; using System.Web; using System.Web.Security; using System.Web.UI; using System.Web.UI.WebControls; using System.Web.UI.WebControls.WebParts; using System.Web.UI.HtmlControls;
//引入空间 using System.Net; using System.IO; using System.Text; using System.Text.RegularExpressions; using System.Collections;
/// <summary>
/// Image collector: downloads a web page, finds the images referenced by its
/// <c>src</c> attributes and saves them into a local folder.
/// </summary>
/// <remarks>
/// Only <c>src="..."</c> / <c>src='...'</c> values ending in .gif, .jpg, .png or .bmp are
/// recognised. Images referenced from CSS are not collected.
/// </remarks>
public class caiji
{
    // Matches a quoted src attribute whose value ends in a supported image extension.
    // The value may not contain quotes, whitespace or '>' so a match never runs past
    // the attribute; the closing quote must be the same character as the opening one.
    private static readonly Regex ImageSrcRegex = new Regex(
        @"src\s*=\s*(?<quote>[""'])(?<url>[^""'\s>]+\.(?:gif|jpg|png|bmp))\k<quote>",
        RegexOptions.IgnoreCase);

    /// <summary>
    /// Creates a collector instance. All functionality is exposed through static methods,
    /// so the instance carries no state.
    /// </summary>
    public caiji()
    {
    }

    /// <summary>
    /// Downloads the page at <paramref name="url"/> and saves every image it references
    /// into <paramref name="path"/>.
    /// </summary>
    /// <param name="url">Absolute address of the page to scan.</param>
    /// <param name="chargest">Name of the text encoding used to decode the page, e.g. "utf-8".</param>
    /// <param name="path">Folder the images are written to.</param>
    /// <returns>
    /// An HTML fragment with one entry per image: the image URL followed by either the path
    /// it was saved to or the error message of the failed download.
    /// </returns>
    public static string caijiByUrl(string url, string chargest, string path)
    {
        string html = GetSourceTextByUrl(url, chargest);
        ArrayList imageUrls = ExtractImageUrls(html, url);

        StringBuilder report = new StringBuilder();

        using (WebClient client = new WebClient())
        {
            for (int j = 0; j < imageUrls.Count; j++)
            {
                string imageUrl = (string)imageUrls[j];

                // Read the clock once so all parts of the file name come from the same instant.
                DateTime now = DateTime.Now;

                // The regex guarantees the URL ends in a dot plus a three-letter extension.
                string extension = imageUrl.Substring(imageUrl.Length - 4, 4);
                string fileName = now.Month.ToString() + now.Day + now.Minute + now.Second + j + extension;

                // Path.Combine also works when the folder has no trailing separator.
                string savepath = Path.Combine(path, fileName);

                try
                {
                    client.DownloadFile(new Uri(imageUrl), savepath);
                    report.Append(imageUrl)
                          .Append("<br /> 保存路径为:")
                          .Append(savepath)
                          .Append("<br /><br />");
                }
                catch (Exception e)
                {
                    // Best effort: one failed image must not abort the remaining downloads.
                    // Report which URL failed and keep the entry separated from the next one.
                    report.Append(imageUrl)
                          .Append("<br /> ")
                          .Append(e.Message)
                          .Append("<br /><br />");
                }
            }
        }

        return report.ToString();
    }

    /// <summary>
    /// Extracts the distinct image URLs referenced by <c>src</c> attributes in
    /// <paramref name="html"/>, resolved against <paramref name="pageUrl"/>.
    /// </summary>
    /// <param name="html">Page markup to scan; null or empty yields an empty list.</param>
    /// <param name="pageUrl">Absolute address of the page, used to resolve relative references.</param>
    /// <returns>
    /// Absolute image URLs as strings, in order of first appearance and without duplicates.
    /// The original letter case of each URL is preserved.
    /// </returns>
    public static ArrayList ExtractImageUrls(string html, string pageUrl)
    {
        ArrayList result = new ArrayList();
        if (string.IsNullOrEmpty(html))
        {
            return result;
        }

        Uri pageUri = new Uri(pageUrl);

        foreach (Match match in ImageSrcRegex.Matches(html))
        {
            string reference = match.Groups["url"].Value;

            // Uri resolution handles absolute, root-relative ("/a.png") and
            // page-relative ("a.png") references uniformly.
            Uri resolved;
            if (!Uri.TryCreate(pageUri, reference, out resolved))
            {
                continue; // malformed reference: nothing that could be downloaded
            }

            string absolute = resolved.AbsoluteUri;
            if (!result.Contains(absolute))
            {
                result.Add(absolute);
            }
        }

        return result;
    }

    /// <summary>
    /// Downloads the resource at <paramref name="url"/> and returns its body as text.
    /// </summary>
    /// <param name="url">Address of the resource to fetch.</param>
    /// <param name="chargest">Name of the encoding used to decode the response, e.g. "utf-8".</param>
    /// <returns>The complete response body decoded with the requested encoding.</returns>
    public static string GetSourceTextByUrl(string url, string chargest)
    {
        // Resolve the encoding first so an unknown name fails before any network traffic.
        Encoding encoding = Encoding.GetEncoding(chargest);

        WebRequest request = WebRequest.Create(url);
        request.Timeout = 20000; // 20 second timeout

        // Dispose response, stream and reader so the connection is released promptly.
        using (WebResponse response = request.GetResponse())
        using (Stream resStream = response.GetResponseStream())
        using (StreamReader sr = new StreamReader(resStream, encoding))
        {
            return sr.ReadToEnd();
        }
    }
}
使用:比如我是保存到upload文件夹中的:
复制代码 代码如下:
// Resolve the physical upload folder, then save every image found on the page into it.
string path = Server.MapPath("~/upload/");
// The page address must be a quoted, absolute URL string.
Response.Write(caiji.caijiByUrl("http://www.jb51.net", "utf-8", path));
(编辑:焦作站长网)
【声明】本站内容均来自网络,其相关言论仅代表作者个人观点,不代表本站立场。若无意侵犯到您的权利,请及时联系站长删除相关内容!
|