It is not a very big deal to implement a simple web spider which finds dirty (broken) links. The following code lists all the links with their status (Fail or OK), but it takes a long time — 5-6 minutes — to process an average site up to 10 links in depth. So we have to move the execution onto a separate thread, as the page request will time out if we wait for the indexing to finish.
Web spider code
/// <summary>
/// Background link checker. Starting from one URL it downloads each page on
/// thread-pool threads, extracts the href targets, requests every HTTP(S)
/// target once and records the status code that came back.
/// </summary>
/// <remarks>
/// <see cref="Links"/> is written by worker threads while a crawl is running.
/// Readers on other threads should call <see cref="GetLinksSnapshot"/> rather
/// than enumerating the field directly.
/// </remarks>
public class WebSpider
{
    /// <summary>Maximum depth (number of hops from the start URL) that is followed.</summary>
    const int LIMIT = 10;

    // Captures the quoted value of an href attribute in group 1.
    static readonly Regex HrefPattern = new Regex("href[ ]*=[ ]*[\"']([^\"'\r\n]*)[\"']");

    // Anything containing "( ... )" is treated as a script call (e.g. javascript:open(...)) and skipped.
    static readonly Regex CallPattern = new Regex(@"\((.*)\)");

    // Extensions of resources that are neither requested nor listed.
    string[] invalidTypes = { ".zip", ".doc", ".css", ".pdf", ".xls", ".txt", ".js", ".ico" };

    // Guards Links, seen, pending and the running flag.
    readonly object sync = new object();

    // Every URL that has been claimed for checking, so each one is requested only once.
    readonly HashSet<string> seen = new HashSet<string>();

    // Number of queued or executing work items; the crawl is finished when it drops to zero.
    int pending;

    volatile bool isRunning;

    /// <summary>Links checked so far. Prefer <see cref="GetLinksSnapshot"/> for reading.</summary>
    public List<Link> Links;

    /// <summary>
    /// True from the moment a crawl is started until its last work item has
    /// finished; false before the first crawl and after completion.
    /// </summary>
    public bool IsRunning
    {
        get { return this.isRunning; }
        set { this.isRunning = value; }
    }

    public WebSpider()
    {
        this.Links = new List<Link>();
    }

    /// <summary>
    /// Discards earlier results and starts crawling <paramref name="url"/> in the
    /// background. Returns immediately; poll <see cref="IsRunning"/> for completion.
    /// </summary>
    /// <param name="url">Absolute URL of the start page. It is listed as OK without being re-checked.</param>
    public void Execute(string url)
    {
        lock (this.sync)
        {
            this.Links.Clear();
            this.seen.Clear();
            this.seen.Add(url);
            this.Links.Add(new Link() { Status = HttpStatusCode.OK, NavigateUrl = url });
        }

        this.Enqueue(new UrlState() { Url = url, Level = 0 });
    }

    /// <summary>Returns a copy of the current results that is safe to enumerate or data-bind.</summary>
    public List<Link> GetLinksSnapshot()
    {
        lock (this.sync)
        {
            return new List<Link>(this.Links);
        }
    }

    /// <summary>
    /// Downloads the page at <c>state.Url</c>, checks every new HTTP(S) link found
    /// on it and queues pages that answered 200 OK for crawling one level deeper,
    /// as long as <c>state.Level</c> is below the depth limit.
    /// </summary>
    /// <param name="state">Page to scan and its depth.</param>
    /// <remarks>Best effort: this method does not throw; a page that cannot be processed is skipped.</remarks>
    public void FindLinks(UrlState state)
    {
        try
        {
            string html;
            using (WebClient client = new WebClient())
            {
                html = client.DownloadString(state.Url);
            }

            Uri baseUri = new Uri(state.Url);

            foreach (Match match in HrefPattern.Matches(html))
            {
                string target = match.Groups[1].Value;
                if (CallPattern.IsMatch(target))
                {
                    continue;
                }

                // Resolve relative targets against the page; skip a malformed href
                // instead of letting it abort the remaining links on this page.
                Uri absolute;
                if (!Uri.TryCreate(baseUri, target, out absolute))
                {
                    continue;
                }

                // mailto:, ftp:, file: and similar targets cannot be checked with an HTTP request.
                if (absolute.Scheme != Uri.UriSchemeHttp && absolute.Scheme != Uri.UriSchemeHttps)
                {
                    continue;
                }

                string value = absolute.AbsoluteUri;
                if (this.IsIgnoredType(value) || !this.TryReserve(value))
                {
                    continue;
                }

                HttpStatusCode status = GetStatus(value);
                lock (this.sync)
                {
                    this.Links.Add(new Link() { Status = status, NavigateUrl = value });
                }

                if (status == HttpStatusCode.OK && state.Level < LIMIT)
                {
                    this.Enqueue(new UrlState() { Url = value, Level = state.Level + 1 });
                }
            }
        }
        catch (Exception)
        {
            // Deliberately ignored: if the download times out or the page cannot be
            // processed, skip it. Nothing may escape here because this runs as a
            // thread-pool callback, where the exception would be unhandled.
        }
    }

    /// <summary>Requests <paramref name="url"/> and returns the resulting status code.</summary>
    /// <returns>
    /// The server's status code when it answered (including error codes such as 404);
    /// <see cref="HttpStatusCode.ExpectationFailed"/> when no HTTP response was obtained.
    /// </returns>
    private static HttpStatusCode GetStatus(string url)
    {
        try
        {
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);

            // Dispose the response so the underlying connection is released promptly.
            using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
            {
                return response.StatusCode;
            }
        }
        catch (WebException ex)
        {
            // GetResponse throws for 4xx/5xx answers; the real status travels on the exception.
            using (HttpWebResponse errorResponse = ex.Response as HttpWebResponse)
            {
                return errorResponse != null ? errorResponse.StatusCode : HttpStatusCode.ExpectationFailed;
            }
        }
        catch (Exception)
        {
            return HttpStatusCode.ExpectationFailed;
        }
    }

    /// <summary>True when the URL ends with an ignored extension or has one directly before a query string.</summary>
    private bool IsIgnoredType(string url)
    {
        foreach (string extension in this.invalidTypes)
        {
            if (url.EndsWith(extension, StringComparison.OrdinalIgnoreCase)
                || url.IndexOf(extension + "?", StringComparison.OrdinalIgnoreCase) >= 0)
            {
                return true;
            }
        }

        return false;
    }

    /// <summary>Atomically claims a URL; returns false when another worker already claimed it.</summary>
    private bool TryReserve(string url)
    {
        lock (this.sync)
        {
            return this.seen.Add(url);
        }
    }

    /// <summary>Counts a new work item and hands it to the thread pool.</summary>
    private void Enqueue(UrlState state)
    {
        lock (this.sync)
        {
            this.pending++;
            this.isRunning = true;
        }

        try
        {
            ThreadPool.QueueUserWorkItem(this.RunQueued, state);
        }
        catch
        {
            // The item never started, so take it back out of the count.
            this.Complete();
            throw;
        }
    }

    /// <summary>Thread-pool entry point: scans one page, then marks its work item as done.</summary>
    private void RunQueued(object state)
    {
        try
        {
            this.FindLinks((UrlState)state);
        }
        finally
        {
            this.Complete();
        }
    }

    /// <summary>Marks one work item as finished and clears the running flag after the last one.</summary>
    private void Complete()
    {
        lock (this.sync)
        {
            if (--this.pending == 0)
            {
                this.isRunning = false;
            }
        }
    }
}

/// <summary>A checked link: the URL (<c>NavigateUrl</c>) together with the status of the request made to it.</summary>
public class Link : HyperLink
{
    public HttpStatusCode Status { get; set; }
}

/// <summary>Work item for the spider: a page URL and its depth relative to the start page.</summary>
public class UrlState
{
    public string Url { get; set; }

    public int Level { get; set; }

    public UrlState()
    {
        this.Level = 0;
        this.Url = string.Empty;
    }
}
Page code
/// <summary>
/// Test page: starts one crawl per session, re-posts itself every five seconds
/// while the crawl is running and shows the links found so far.
/// </summary>
public partial class Test : Page
{
    private const string SiteUrl = "http://www.charith.gunasekara.web-sphere.co.uk";

    // {0} is the event target of the refresh button; the script posts back after 5 seconds.
    private string script = @"setTimeout(""__doPostBack('{0}','')"", 5000);";

    // One spider per session, created on first use.
    private WebSpider WebSpider
    {
        get { return (WebSpider)(Session["webSpider"] ?? (Session["webSpider"] = new WebSpider())); }
    }

    protected override void OnLoad(EventArgs e)
    {
        base.OnLoad(e);

        WebSpider spider = this.WebSpider;

        // Start a crawl only when none is running and none has produced results yet;
        // otherwise a finished crawl would be thrown away and restarted on the next request.
        if (!spider.IsRunning && spider.GetLinksSnapshot().Count == 0)
        {
            if (IsReachable(SiteUrl))
            {
                spider.Execute(SiteUrl);
            }
            else
            {
                this.lblStatus.Text = "Unable to reach " + SiteUrl;
            }
        }

        // Read the flag before taking the snapshot: once it is false every result has
        // already been added, so the final refresh cannot miss links.
        bool running = spider.IsRunning;
        List<Link> links = spider.GetLinksSnapshot();

        if (running)
        {
            this.lblStatus.Text = "Processing...";

            // __doPostBack expects the control's UniqueID as its event target.
            ScriptManager.RegisterStartupScript(this, this.GetType(), this.GetType().Name, string.Format(script, this.btnUpdateResults.UniqueID), true);
        }
        else if (links.Count > 0)
        {
            this.lblStatus.Text = "Finished.";
        }

        this.grvLinks.DataSource = links;
        this.grvLinks.DataBind();
    }

    /// <summary>True when <paramref name="url"/> answers with 200 OK; false on any other answer or a failed request.</summary>
    private static bool IsReachable(string url)
    {
        try
        {
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
            using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
            {
                return response.StatusCode == HttpStatusCode.OK;
            }
        }
        catch (WebException)
        {
            return false;
        }
    }
}
Markup
<asp:ScriptManager runat="server" ID="PageScriptManager" />
<p>Listing all the links for site: <strong>http://www.charith.gunasekara.web-sphere.co.uk</strong></p>
<p>With the maximum depth of <strong>10</strong></p>
<hr />
<asp:UpdatePanel runat="server" ID="pnlLinks">
    <ContentTemplate>
        <%-- The status label lives inside the panel so that it is re-rendered by the
             partial postbacks triggered through btnUpdateResults. --%>
        <asp:Label runat="server" ID="lblStatus" />
        <asp:GridView runat="server" ID="grvLinks" AutoGenerateColumns="false">
            <Columns>
                <asp:BoundField DataField="NavigateUrl" HeaderText="Url" />
                <asp:ButtonField DataTextField="Status" HeaderText="Status" />
            </Columns>
        </asp:GridView>
        <asp:Button runat="server" ID="btnUpdateResults" Text="Update Results" />
    </ContentTemplate>
</asp:UpdatePanel>
No comments:
Post a Comment