Saturday, August 14, 2010

How to build a simple web spider using asp.net

Trying to run a web spider inside a web application is not a good idea, because the multi-threaded apartment model leads us to fire up a separate thread for each URL, which is not the most memory-efficient method. But for those who have such crazy ideas, here is an example.


It is not a very big deal to implement a simple web spider which finds dirty links. The following code lists all the links with their status (Fail or OK), but it takes a long 5-6 minutes to process an average site up to 10 links in depth. So we have to hand execution off to a separate thread, as the page execution will time out if we wait for indexing to finish.
Web spider code
/// <summary>
/// Crawls pages starting from a URL and records every link it finds together with the
/// HTTP status obtained when requesting it. Each page is processed on a thread-pool thread.
/// </summary>
public class WebSpider
{
    /// <summary>Deepest level (the start URL is level 0) whose links are still followed.</summary>
    const int LIMIT = 10;

    /// <summary>Extensions of resources that are neither requested nor listed.</summary>
    string[] invalidTypes = { ".zip", ".doc", ".css", ".pdf", ".xls", ".txt", ".js", ".ico" };

    /// <summary>Guards <see cref="Links"/> and <see cref="seen"/>, which several crawl threads touch.</summary>
    private readonly object syncRoot = new object();

    /// <summary>
    /// URLs already claimed by a crawl thread. Claiming happens before the request is sent,
    /// so two threads that discover the same URL at the same time do not both request it.
    /// </summary>
    private readonly HashSet<string> seen = new HashSet<string>();

    /// <summary>
    /// Links found so far. Worker threads append to this list under an internal lock;
    /// NOTE(review): code that enumerates it from another thread while a crawl is in
    /// progress is not protected by that lock and should work on a copy.
    /// </summary>
    public List<Link> Links;

    /// <summary>
    /// Set to true by <see cref="Execute"/>. Nothing in this class sets it back to false.
    /// </summary>
    public bool IsRunning { get; set; }

    public WebSpider()
    {
        this.Links = new List<Link>();
    }

    /// <summary>
    /// Discards earlier results, lists <paramref name="url"/> itself with status OK and
    /// queues the crawl of that page on the thread pool. Returns without waiting for it.
    /// </summary>
    /// <param name="url">Absolute URL of the page the crawl starts from.</param>
    public void Execute(string url)
    {
        lock (this.syncRoot)
        {
            this.Links.Clear();
            this.seen.Clear();
            this.seen.Add(url);
            this.Links.Add(new Link() { Status = HttpStatusCode.OK, NavigateUrl = url });
        }
        this.IsRunning = true;
        WaitCallback item = delegate(object state) { this.FindLinks((UrlState)state); };
        ThreadPool.QueueUserWorkItem(item, new UrlState() { Url = url, Level = 0 });
    }

    /// <summary>
    /// Downloads the page at <c>state.Url</c>, extracts its <c>href</c> values, requests each
    /// new HTTP(S) link once and records the outcome in <see cref="Links"/>. Links answered
    /// with status OK are queued for crawling while <c>state.Level</c> is below the depth limit.
    /// </summary>
    /// <param name="state">URL of the page to scan and its depth in the crawl.</param>
    public void FindLinks(UrlState state)
    {
        try
        {
            string html;
            using (WebClient client = new WebClient())
            {
                html = client.DownloadString(state.Url);
            }

            Uri baseUri = new Uri(state.Url);
            MatchCollection matches = Regex.Matches(html, "href[ ]*=[ ]*['|\"][^\"'\r\n]*['|\"]");
            foreach (Match match in matches)
            {
                // Strip the leading href=' / href=" and the closing quote.
                string value = match.Value;
                value = Regex.Replace(value, "(href[ ]*=[ ]*')|(href[ ]*=[ ]*\")", string.Empty);
                if (value.EndsWith("\"") || value.EndsWith("'"))
                    value = value.Remove(value.Length - 1, 1);

                // Values containing "(...)" are treated as script calls, not as links.
                if (Regex.Match(value, @"\((.*)\)").Success) continue;

                // TryCreate instead of the throwing constructor: a single malformed href
                // must not abandon the remaining links of the page.
                Uri absoluteUri;
                bool wellFormed = Uri.TryCreate(baseUri, value, out absoluteUri);
                if (wellFormed)
                {
                    // mailto:, ftp:, ... cannot be checked with an HTTP request, so they
                    // are skipped rather than reported as broken.
                    if (absoluteUri.Scheme != Uri.UriSchemeHttp && absoluteUri.Scheme != Uri.UriSchemeHttps)
                        continue;
                    if (!value.Contains("http:"))
                        value = absoluteUri.ToString();
                }

                if (!this.TryClaim(value)) continue;

                if (!wellFormed)
                {
                    this.AddLink(value, HttpStatusCode.ExpectationFailed);
                    continue;
                }

                if (this.IsExcludedType(value)) continue;

                try
                {
                    HttpStatusCode status;
                    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(value);
                    // Dispose the response so its connection goes back to the pool;
                    // undisposed responses keep connections occupied.
                    using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
                    {
                        status = response.StatusCode;
                    }

                    this.AddLink(value, status);
                    if (status == HttpStatusCode.OK && state.Level < LIMIT)
                    {
                        WaitCallback item = delegate(object s) { FindLinks((UrlState)s); };
                        ThreadPool.QueueUserWorkItem(
                            item, new UrlState() { Url = value, Level = state.Level + 1 });
                    }
                }
                catch (Exception)
                {
                    // Any failure while requesting the link (including the exception
                    // GetResponse throws for error statuses) marks the link as failed.
                    this.AddLink(value, HttpStatusCode.ExpectationFailed);
                }
            }
        }
        catch (Exception)
        {
            // Deliberately ignored: if the page itself cannot be downloaded (e.g. a
            // timeout) there is nothing to scan, and an exception escaping a thread-pool
            // work item must not take the application down.
        }
    }

    /// <summary>Returns true only for the first caller that presents <paramref name="url"/>.</summary>
    private bool TryClaim(string url)
    {
        lock (this.syncRoot)
        {
            return this.seen.Add(url);
        }
    }

    /// <summary>Appends a result to <see cref="Links"/> under the lock.</summary>
    private void AddLink(string url, HttpStatusCode status)
    {
        lock (this.syncRoot)
        {
            this.Links.Add(new Link() { Status = status, NavigateUrl = url });
        }
    }

    /// <summary>
    /// True when <paramref name="url"/> ends with one of the excluded extensions or
    /// contains one immediately followed by a query string.
    /// </summary>
    private bool IsExcludedType(string url)
    {
        // Invariant lower-casing keeps the comparison independent of the thread culture.
        string lowered = url.ToLowerInvariant();
        foreach (string invalidType in invalidTypes)
        {
            if (lowered.EndsWith(invalidType) || lowered.Contains(invalidType + "?"))
                return true;
        }
        return false;
    }
}
/// <summary>
/// A hyperlink paired with the HTTP status recorded for it by the spider.
/// </summary>
public class Link : HyperLink
{
    /// <summary>HTTP status stored for the link's <c>NavigateUrl</c>.</summary>
    public HttpStatusCode Status { get; set; }
}
/// <summary>
/// Work item handed to a crawl thread: the page to scan and how deep it is in the crawl.
/// </summary>
public class UrlState
{
    /// <summary>URL of the page to scan. Empty until assigned.</summary>
    public string Url { get; set; }

    /// <summary>Depth of the page; the start page is level 0.</summary>
    public int Level { get; set; }

    public UrlState()
    {
        this.Level = 0;
        this.Url = string.Empty;
    }
}
Page code
/// <summary>
/// Page that starts the crawl, lists the links found so far and, while the spider
/// reports that it is running, schedules a postback every five seconds to refresh the list.
/// </summary>
public partial class Test : Page
{
    /// <summary>Client script template; {0} is the id of the control that triggers the postback.</summary>
    private string script = @"setTimeout(""__doPostBack('{0}','')"", 5000);";

    /// <summary>The spider kept in session state, created on first access.</summary>
    private WebSpider WebSpider
    {
        get { return (WebSpider)(Session["webSpider"] ?? (Session["webSpider"] = new WebSpider())); }
    }

    protected override void OnLoad(EventArgs e)
    {
        base.OnLoad(e);
        if (!this.WebSpider.IsRunning)
        {
            string url = "http://www.charith.gunasekara.web-sphere.co.uk";
            if (IsReachable(url))
                this.WebSpider.Execute(url);
            else
                this.lblStatus.Text = "Unable to reach " + url;
        }
        if (this.WebSpider.IsRunning)
        {
            this.lblStatus.Text = "Processing...";
            ScriptManager.RegisterStartupScript(this, this.GetType(),
                this.GetType().Name, string.Format(script, this.btnUpdateResults.ClientID), true);
        }
        this.grvLinks.DataSource = this.WebSpider.Links;
        this.grvLinks.DataBind();
    }

    /// <summary>
    /// Requests <paramref name="url"/> once and reports whether it answered with status OK.
    /// </summary>
    private static bool IsReachable(string url)
    {
        try
        {
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
            // Dispose the response so the connection is released.
            using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
            {
                return response.StatusCode == HttpStatusCode.OK;
            }
        }
        catch (WebException)
        {
            // GetResponse throws for error statuses and network failures; treat both
            // as "not reachable" instead of failing the whole page request.
            return false;
        }
    }
}
Markup
<%-- The ScriptManager enables partial postbacks for the UpdatePanel below. --%>
<asp:ScriptManager runat="server" ID="PageScriptManager" />
<p>Listing all the links for site: <strong>http://www.charith.gunasekara.web-sphere.co.uk</strong></p>
<p>With the maximum depth of <strong>10</strong></p>
<hr />
<asp:Label runat="server" ID="lblStatus" />
<%-- Grid of crawled links; btnUpdateResults is the control the page's timer script posts back. --%>
<asp:UpdatePanel runat="server" ID="pnlLinks">
    <ContentTemplate>
        <asp:GridView runat="server" ID="grvLinks" AutoGenerateColumns="false">
            <Columns>
                <asp:BoundField DataField="NavigateUrl" HeaderText="Url" />
                <asp:ButtonField DataTextField="Status" HeaderText="Status" />
            </Columns>
        </asp:GridView>
        <asp:Button runat="server" ID="btnUpdateResults" Text="Update Results" />
    </ContentTemplate>
</asp:UpdatePanel>

No comments:

Azure Storage Account Types

Different Types of Blobs Block blobs store text and binary data. Block blobs are made up of blocks of data that can be managed individually...