The website crawler sample is a basic C# implementation of a website crawler that starts at a given URL and follows links up to a configurable depth. It also shows how to process different kinds of tasks while crawling: in this case it sends a status email at regular intervals to report progress. The focus of this sample is to show how to schedule arbitrary tasks and process their results.
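The scheduler and the tasks exchange data through simple request and result classes. The following sketch is only inferred from the properties used in the code below; the base classes of the Crawler-Lib task requests and results, and any additional members, are assumptions and not part of the original sample.

using System.Collections.Generic;

// Sketch of the data classes, inferred from the properties used in the sample code
// (exact shapes and base classes are assumptions)
public class PageRequest
{
    public string Url { get; set; }
    public int Depth { get; set; }
}

public class PageData
{
    public string Url { get; set; }
    public int Depth { get; set; }
    public string Title { get; set; }
}

// CrawlPageTaskRequest / CrawlPageTaskResult are assumed to derive from the Crawler-Lib
// request/result base classes; the result base class is expected to provide the Success
// flag used by the scheduler.
public class CrawlPageTaskRequest
{
    public string Url { get; set; }
    public int Depth { get; set; }
}

public class CrawlPageTaskResult
{
    public string Url { get; set; }
    public int Depth { get; set; }
    public string Title { get; set; }
    public List<string> Links { get; set; }
}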
The sample is built around the CrawlerEngine and two task types (CrawlPageTask, SendMailTask). The Scheduler uses a worker thread to feed the CrawlerEngine with tasks and to process the task results.
private void WorkerThreadFunction()
{
    var lastStatusMail = DateTime.UtcNow;
    while (!this.shutdown)
    {
        var doneWork = false;

        // Retrieval of new pages
        if (this.pagesToCrawl.Count > 0 && this.engine.NeedsInput)
        {
            doneWork = true;
            var startedPages = new List<PageRequest>();
            foreach (var page in this.pagesToCrawl)
            {
                startedPages.Add(page);
                this.engine.AddTask(new CrawlPageTaskRequest { Url = page.Url, Depth = page.Depth });

                // Let the engine control how much input is needed. This reduces the memory footprint (a little)
                if (!this.engine.NeedsInput)
                {
                    break;
                }
            }

            foreach (var page in startedPages)
            {
                this.pagesToCrawl.Remove(page);
                this.scheduledPages.Add(page.Url);
            }
        }

        // Process finished tasks
        var taskResults = this.engine.GetFinishedTaskResults();
        if (taskResults.Length > 0)
        {
            doneWork = true;
            foreach (var task in taskResults)
            {
                if (!task.Success)
                {
                    Console.WriteLine("Task Failed");
                    // TODO: Implement error handling
                    continue;
                }

                // We have a crawl page result
                var crawlPageResult = task as CrawlPageTaskResult;
                if (crawlPageResult != null)
                {
                    this.results.Add(new PageData { Url = crawlPageResult.Url, Depth = crawlPageResult.Depth, Title = crawlPageResult.Title });

                    if (crawlPageResult.Depth < this.MaxDepth)
                    {
                        foreach (var link in crawlPageResult.Links)
                        {
                            if (this.baseUrl.IsBaseOf(new Uri(link)) && !this.scheduledPages.Contains(link))
                            {
                                // Only add pages until we reach the MaxPages limit
                                if (this.pagesToCrawl.Count + this.scheduledPages.Count < this.MaxPages)
                                {
                                    this.pagesToCrawl.Add(new PageRequest { Url = link, Depth = crawlPageResult.Depth + 1 });
                                }
                                else
                                {
                                    break;
                                }
                            }
                        }
                    }
                }

                // We have a send mail result
                var sendMailResult = task as SendMailTaskResult;
                if (sendMailResult != null)
                {
                    Console.WriteLine("Status mail was sent");
                }
            }
        }

        var statusMessage = string.Format("Pages: New({0}), Scheduled({1}), Results({2})", this.pagesToCrawl.Count, this.scheduledPages.Count, this.results.Count);
        Console.WriteLine(statusMessage);

        // Starting a status mail task
        if (!string.IsNullOrEmpty(this.SmtpHost) && lastStatusMail.AddSeconds(60) < DateTime.UtcNow)
        {
            var message = new MailMessage(this.MailFrom, this.MailTo, "Crawler-Lib Website Crawler Sample", statusMessage);
            this.engine.AddTask(new SendMailTaskRequest { Host = this.SmtpHost, Port = this.SmtpPort, User = this.SmtpUser, Password = this.SmtpPassword, Message = message });
            lastStatusMail = DateTime.UtcNow;
        }

        if (this.pagesToCrawl.Count == 0 && this.engine.IsIdle)
        {
            this.isIdle = true;
            this.idleEvent.Set();
        }

        if (!doneWork)
        {
            // Only if nothing is processed go to sleep, otherwise continue work to gain maximum throughput
            Thread.Sleep(500);
        }
    }
}
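For orientation, here is a minimal sketch of the scheduler state that WorkerThreadFunction operates on. The field names are taken from the code above; the constructor, the start/stop handling and the way the CrawlerEngine is created and configured are assumptions, not the sample's actual implementation.

using System;
using System.Collections.Generic;
using System.Threading;

// Sketch of the scheduler scaffolding assumed by WorkerThreadFunction (start/stop handling is an assumption)
public partial class Scheduler
{
    private CrawlerEngine engine;                                              // created and configured elsewhere in the sample
    private readonly List<PageRequest> pagesToCrawl = new List<PageRequest>(); // pages waiting to be handed to the engine
    private readonly HashSet<string> scheduledPages = new HashSet<string>();   // URLs that have already been scheduled
    private readonly List<PageData> results = new List<PageData>();            // collected crawl results
    private readonly ManualResetEvent idleEvent = new ManualResetEvent(false); // signaled when all pages are processed
    private Uri baseUrl;
    private Thread workerThread;
    private volatile bool shutdown;
    private volatile bool isIdle;

    public int MaxDepth { get; set; }
    public int MaxPages { get; set; }

    // SMTP settings used for the status mail (SmtpHost, SmtpPort, SmtpUser, SmtpPassword, MailFrom, MailTo) omitted here

    public void Start(string startUrl)
    {
        this.baseUrl = new Uri(startUrl);
        this.pagesToCrawl.Add(new PageRequest { Url = startUrl, Depth = 0 });
        this.workerThread = new Thread(this.WorkerThreadFunction);
        this.workerThread.Start();
    }

    public void Stop()
    {
        this.shutdown = true;
        this.workerThread.Join();
    }
}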
The CrawlPageTask implements a limiter-controlled HttpRequest workflow. The Limit workflow element encapsulates the HttpRequest and makes it possible to control the number of parallel requests and the throughput.
public override async void StartWork()
{
    // You must always assign TaskResult at the top of the StartWork() method,
    // because without a TaskResult no exception can be returned in case of an error
    base.TaskResult = new CrawlPageTaskResult();
    this.TaskResult.Url = this.TaskRequest.Url;
    this.TaskResult.Depth = this.TaskRequest.Depth;

    // This limit can prevent the server from being flooded with requests. The limit is controlled by a limiter
    await new Limit(
        "PageRequestLimiter",
        async limited =>
        {
            // The request to retrieve the URL.
            var request = await new HttpRequest(new Uri(this.TaskRequest.Url));

            this.TaskResult.Links = new List<string>();

            var titleNode = request.Html.DocumentNode.SelectSingleNode("/head/title");
            if (titleNode != null)
            {
                this.TaskResult.Title = titleNode.InnerText;
            }

            var nodes = request.Html.DocumentNode.SelectNodes("//a[@href]");
            if (nodes != null)
            {
                foreach (var href in nodes.Select(node => node.Attributes["href"].Value))
                {
                    Uri newUri;
                    if (href != null && Uri.TryCreate(request.Url, href, out newUri))
                    {
                        this.TaskResult.Links.Add(newUri.AbsoluteUri);
                    }
                }
            }
        });
}
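The link extraction above relies on Uri.TryCreate to resolve relative hrefs against the page URL before they are scheduled. A small standalone illustration of that resolution step (the URLs are made-up example values):

using System;

class LinkResolutionDemo
{
    static void Main()
    {
        // URL of the crawled page (made-up example value)
        var pageUrl = new Uri("http://example.com/docs/index.html");

        // A relative href as it might appear in an <a> tag
        Uri resolved;
        if (Uri.TryCreate(pageUrl, "../about.html", out resolved))
        {
            // The crawler stores the absolute form, here "http://example.com/about.html"
            Console.WriteLine(resolved.AbsoluteUri);
        }
    }
}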
The workflow of the SendMailTask shows how arbitrary work can be integrated into a task workflow. Many libraries return a TPL task as the result of an asynchronous operation. The Work workflow element integrates such a task into the task workflow and lets the workflow continue after the asynchronous work is done. The workflow can be controlled with all the workflow elements like Retry, Group, Delay, and so on. This allows a direct integration of all async patterns into the task processing.
public override async void StartWork()
{
    // You must always assign TaskResult at the top of the StartWork() method,
    // because without a TaskResult no exception can be returned in case of an error
    base.TaskResult = new SendMailTaskResult();
    this.TaskResult.Message = this.TaskRequest.Message;

    var client = new SmtpClient(this.TaskRequest.Host, this.TaskRequest.Port);
    client.Credentials = new NetworkCredential(this.TaskRequest.User, this.TaskRequest.Password);

    // The 'Work' workflow element integrates arbitrary work into the workflow.
    // You can't await the TPL task directly, because the end of the task couldn't be synchronized with the workflow again.
    await new Work(client.SendMailAsync(this.TaskRequest.Message));
}
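The same pattern works for any API that returns a TPL task. The following sketch is not part of the sample; the request and result classes and their properties are hypothetical, and it only illustrates how another task-returning call could be wrapped in the Work element.

public override async void StartWork()
{
    // Hypothetical task that downloads the content of a URL; FetchPageTaskResult and the
    // Url/Content properties are illustrative only and not part of the original sample.
    base.TaskResult = new FetchPageTaskResult();

    using (var client = new System.Net.Http.HttpClient())
    {
        // HttpClient.GetStringAsync returns a TPL task; the Work element lets the
        // workflow continue only after that task has completed.
        var download = client.GetStringAsync(this.TaskRequest.Url);
        await new Work(download);

        // The TPL task has finished at this point, so reading Result does not block.
        this.TaskResult.Content = download.Result;
    }
}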