聊城万拓网络科技-专业聊城网站建设、聊城网站制作、聊城网站优化、聊城做网站的品牌网站建设专家!

    您当前的位置是:首页 - 新闻动态 - 网站建设 » 一段C#抓取百度、谷歌、搜狗、360等搜索引擎结果的代码

    一段C#抓取百度、谷歌、搜狗、360等搜索引擎结果的代码
     发布时间:2014-03-26  点击次数: 次   作者:万拓网络  来源:lcbaituo.com  Tags:

    最近做了个项目,需要抓取百度、谷歌、搜狗、360等搜索引擎的搜索结果,把搜索到的标题和链接一一提取出来。其实页面本身是很好获取的,主要的问题是如何用正则表达式处理下载下来的页面。于是在论坛上请教了大家,在大家的帮助下,这个功能的核心代码已经完成,现在整理出来,供有需要的人参考。

    C# 代码:

    using System;
    using System.Collections.Generic;
    using System.ComponentModel;
    using System.Data;
    using System.Drawing;
    using System.Linq;
    using System.Text;
    using System.Windows.Forms;
    using System.Net;
    using System.IO;
    using httpState;
    using System.Text.RegularExpressions;
    using System.Collections;

    namespace test
    {
        public partial class DownLoadTest : Form
        {
           
            /// <summary>
            /// Creates the form and calls <c>InitializeComponent</c>, which is
            /// defined in another part of this partial class (not shown here).
            /// </summary>
            public DownLoadTest()
            {
                InitializeComponent();
            }
            /// <summary>
            /// 百度搜索
            /// </summary>
            /// <param name="sender"></param>
            /// <param name="e"></param>
            private void btnBaidu_Click(object sender, EventArgs e)
            {       
                int num = 20;//搜索条数
                string url = "http://www.baidu.com/s?wd=" + txtSearch.Text.Trim() + "&rn=" + num + "";
                string html=search(url,"gb2312");
                BaiduSearch baidu = new BaiduSearch();
                if (!string.IsNullOrEmpty(html))
                {
                    int count = baidu.GetSearchCount(html);//搜索条数
                    if (count > 0)
                    {
                        List<Keyword> keywords = baidu.GetKeywords(html, txtSearch.Text.Trim());
                        dataGridView1.DataSource = keywords;
                    }
                  
                }
            }
            /// <summary>
            /// 谷歌搜索
            /// </summary>
            /// <param name="sender"></param>
            /// <param name="e"></param>
            private void button2_Click(object sender, EventArgs e)
            {
                int num=100;
                string url = "http://www.google.com.hk/search?hl=zh-CN&source=hp&q=" + txtSearch.Text.Trim() + "&aq=f&aqi=&aql=&oq=&num="+num+"";
                string html=search(url,"utf-8");
                if (!string.IsNullOrEmpty(html))
                {

                    googleSearch google = new googleSearch();
                    List<Keyword> keywords = google.GetKeywords(html, txtSearch.Text.Trim());
                        dataGridView1.DataSource = keywords;
                   
                }
            }
            /// <summary>
            /// 搜索处理
            /// </summary>
            /// <param name="url">搜索网址</param>
            /// <param name="Chareset">编码</param>
            public string search(string url,string Chareset)
            {
                HttpState result = new HttpState();
                Uri uri = new Uri(url);
                HttpWebRequest myHttpWebRequest = (HttpWebRequest)WebRequest.Create(url);
                myHttpWebRequest.UseDefaultCredentials = true;
                myHttpWebRequest.ContentType = "text/html";
                myHttpWebRequest.UserAgent = "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.0; .NET CLR 1.1.4322; .NET CLR 2.0.50215;)";
                myHttpWebRequest.Method = "GET";
                myHttpWebRequest.CookieContainer = new CookieContainer();

                try
                {
                    HttpWebResponse response = (HttpWebResponse)myHttpWebRequest.GetResponse();
                    // 从 ResponseStream 中读取HTML源码并格式化 add by cqp
                    result.Html = readResponseStream(response, Chareset);
                    result.CookieContainer = myHttpWebRequest.CookieContainer;
                     return result.Html;                
                }
                catch (Exception ex)
                {
                    return ex.ToString();
                }
               
            }
            public string readResponseStream(HttpWebResponse response, string Chareset)
            {
                string result = "";
                using (StreamReader responseReader = new StreamReader(response.GetResponseStream(), Encoding.GetEncoding(Chareset)))
                {         
                    result = formatHTML(responseReader.ReadToEnd());
                }

                return result;
            }
            /// <summary>
            /// 描述:格式化网页源码
            ///
            /// </summary>
            /// <param name="htmlContent"></param>
            /// <returns></returns>
            public string formatHTML(string htmlContent)
            {
                string result = "";

                result = htmlContent.Replace("&raquo;", "").Replace("&nbsp;", "")
                        .Replace("&copy;", "").Replace("/r", "").Replace("/t", "")
                        .Replace("/n", "").Replace("&amp;", "&");

                return result;
            }

            class BaiduSearch
            {
                protected string uri = "http://www.baidu.com/s?wd=";
                //protected string uri = "http://www.baidu.com/s?wd=software&pn=10&usm=2"; // 第二页
                protected Encoding queryEncoding = Encoding.GetEncoding("gb2312");
                protected Encoding pageEncoding = Encoding.GetEncoding("gb2312");
                protected string resultPattern = @"(?<=找到相关结果[约]?)[0-9,]*?(?=个)";
                public int GetSearchCount(string html)
                {
                    int result = 0;
                    string searchcount = string.Empty;

                    Regex regex = new Regex(resultPattern);
                    Match match = regex.Match(html);

                    if (match.Success)
                    {
                        searchcount = match.Value;
                    }
                    else
                    {
                        searchcount = "0";
                    }

                    if (searchcount.IndexOf(",") > 0)
                    {
                        searchcount = searchcount.Replace(",", string.Empty);
                    }

                    int.TryParse(searchcount, out result);

                    return result;
                }

                public List<Keyword> GetKeywords(string html, string word)
                {
                    int i=1;
                    List<Keyword> keywords = new List<Keyword>();

                    Regex regTable = new Regex(@"(?is)<table[^>]*?id=(['""]?)(/d{1,2}|100)/1[^>]*>(?><table[^>]*>(?<o>)|</table>(?<-o>)|(?:(?!</?table/b).)*)*(?(o)(?!))</table>", RegexOptions.IgnoreCase);
                    //Regex regTable = new Regex(@"(?is)<table[^>]*?id=(['""]?)(/d{2})/1[^>]*>(?><table[^>]*>(?<o>)|</table>(?<-o>)|(?:(?!</?table/b).)*)*(?(o)(?!))</table>", RegexOptions.IgnoreCase);
                    Regex regA = new Regex(@"(?is)<a/b[^>]*?href=(['""]?)(?<link>[^'""/s>]+)/1[^>]*>(?<title>.*?)</a>", RegexOptions.IgnoreCase);

                    MatchCollection mcTable = regTable.Matches(html);
                    foreach (Match mTable in mcTable)
                    {
                        if (mTable.Success)
                        {
                            Match mA = regA.Match(mTable.Value);
                            if (mA.Success)
                            {

                                Keyword keyword = new Keyword();
                                keyword.ID=i++;
                                keyword.Link = mA.Groups["link"].Value;
                                keyword.Title = mA.Groups["title"].Value;
                                keywords.Add(keyword);
                            }
                        }
                    }

                    return keywords;
                }
            }
            class googleSearch
            {
                public List<Keyword> GetKeywords(string html, string word)
                {
                    int i = 1;
                    List<Keyword> keywords = new List<Keyword>();

                    Regex regTable = new Regex(@"(?is)<h3[^>]*?>(?><h3[^>]*>(?<o>)|</h3>(?<-o>)|(?:(?!</?h3/b).)*)*(?(o)(?!))</h3>", RegexOptions.IgnoreCase);
                    //Regex regTable = new Regex(@"(?is)<table[^>]*?id=(['""]?)(/d{2})/1[^>]*>(?><table[^>]*>(?<o>)|</table>(?<-o>)|(?:(?!</?table/b).)*)*(?(o)(?!))</table>", RegexOptions.IgnoreCase);
                    Regex regA = new Regex(@"(?is)<a/b[^>]*?href=(['""]?)(?<link>[^'""/s>]+)/1[^>]*>(?<title>.*?)</a>", RegexOptions.IgnoreCase);

                    MatchCollection mcTable = regTable.Matches(html);
                    foreach (Match mTable in mcTable)
                    {
                        if (mTable.Success)
                        {
                            Match mA = regA.Match(mTable.Value);
                            if (mA.Success)
                            {

                                Keyword keyword = new Keyword();
                                keyword.ID = i++;
                                keyword.Link = mA.Groups["link"].Value;
                                keyword.Title = mA.Groups["title"].Value;
                                keywords.Add(keyword);
                            }
                        }
                    }

                    return keywords;
                }
            }
            class Keyword
            {
                public int ID { get; set; }
                public string Title { get; set; }
                public string Link { get; set; }
                //private string title;
                //public string Title { get { return title; } set { title = value; } }
                //private string link;
                //public string Link { get { return link; } set { link = value; } }
            }
        }
    }
    HttpState:

    using System.Net;
    using System.Collections;

    namespace httpState
    {
        public class HttpState
        {

            // 获取与响应一起返回的状态说明。
            private string _statusDescription;

            public string StatusDescription
            {
                get { return _statusDescription; }
                set { _statusDescription = value; }
            }

            /// <summary>
            /// 回调 址址, 登陆测试中使用
            /// </summary>
            private string _callBackUrl;

            public string CallBackUrl
            {
                get { return _callBackUrl; }
                set { _callBackUrl = value; }
            }


            /// <summary>
            /// 网页网址 绝对路径格式
            /// </summary>
            private string _url;

            public string Url
            {
                get { return _url; }
                set { _url = value; }
            }

            /// <summary>
            /// 字符串的形式的Cookie信息
            /// </summary>
            private string _cookies;

            public string Cookies
            {
                get { return _cookies; }
                set { _cookies = value; }
            }

            /// <summary>
            /// Cookie信息
            /// </summary>
            private CookieContainer _cookieContainer = new CookieContainer();

            public CookieContainer CookieContainer
            {
                get { return _cookieContainer; }
                set { _cookieContainer = value; }
            }

            /// <summary>
            /// 网页源码
            /// </summary>
            private string _html;

            public string Html
            {
                get { return _html; }
                set { _html = value; }
            }

            /// <summary>
            /// 验证码临时文件(绝对路径)
            /// </summary>
            private string _tmpValCodePic;

            public string TmpValCodePic
            {
                get { return _tmpValCodePic; }
                set { _tmpValCodePic = value; }
            }

            /// <summary>
            /// 验证码临时文件名(相对路径)
            /// </summary>
            private string _tmpValCodeFileName = "emptyPic.gif";

            public string TmpValCodeFileName
            {
                get { return _tmpValCodeFileName; }
                set { _tmpValCodeFileName = value; }
            }

            /// <summary>
            /// 有验证码
            /// </summary>
            private bool _isValCode;

            public bool IsValCode
            {
                get { return _isValCode; }
                set { _isValCode = value; }
            }

            /// <summary>
            /// 验证码URL
            /// </summary>
            private string _valCodeURL;

            public string ValCodeURL
            {
                get { return _valCodeURL; }
                set { _valCodeURL = value; }
            }

            /// <summary>
            /// 验证码识别后的值
            /// </summary>
            private string _valCodeValue;

            public string ValCodeValue
            {
                get { return _valCodeValue; }
                set { _valCodeValue = value; }
            }

            /// <summary>
            /// 其它参数
            /// </summary>
            private Hashtable _otherParams = new Hashtable();

            public Hashtable OtherParams
            {
                get { return _otherParams; }
                set { _otherParams = value; }
            }

            // 重复添加处理 add by fengcj  09/11/19 PM
            public void addOtherParam(object key, object value)
            {
                if (!this.OtherParams.ContainsKey(key))
                    this.OtherParams.Add(key, value);
                else
                {
                    this.OtherParams[key] = value;
                }
            }

            public void removeOtherParam(object key)
            {
                this.OtherParams.Remove(key);
            }

            public object getOtherParam(object key)
            {
                return this.OtherParams[key];
            }
        }
    }

     界面很简单:一个输入框、两个搜索按钮和一个 DataGridView。



    分享到:
    上一篇:用HTML的方式实现IE浏览器的菜单命令集锦
    下一篇:如何在Windows 8上安装配置IIS8.0的环境
     

    本站业务:聊城网站建设-聊城网站制作-聊城做网站