Saturday, April 18, 2009

A simple web crawler

It is becoming difficult to write more about what I have been reading and researching in Information Retrieval, but IR continues to be my area of interest. There is a ton of literature out there, and each research paper I read brings more insight into IR. That made me want to test the various theories and algorithms mentioned in those papers, and to test them, the first thing I needed was data (web documents, to be precise). Now, crawling is in itself an important area of research. One could opt to write a very efficient crawler, but I am a full-time student who works part-time to pay his bills, and on top of that I have to start working on my dissertation as well. So there was no way I could allocate more time to writing a crawler just to test those theories.

Instead, I chose to write a very simple crawler, one that just makes use of the link structure of the web. Since I am not the only one who thinks that or does that, it seemed like a good place to start. What I really needed was to extract the <a> tags from a given web page, and the following regular expression allowed me to do that:

Regex extractTags = new Regex(@"<" + tag + @"[^>]*?HREF\s*=\s*[""']?([^'"" >]+?)[ '""]?>", RegexOptions.IgnoreCase | RegexOptions.Compiled);
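
To see what the expression actually captures, here is a quick standalone test with the tag hard-coded to "a" (the URLs are made up for illustration). One thing to keep in mind: because of the trailing [ '""]?> part, it only matches tags where href is the last attribute before the closing >.

using System;
using System.Text.RegularExpressions;

class RegexTest
{
    static void Main()
    {
        string html = "<p>Read <a href=\"http://example.com/one\">this</a> and "
                    + "<a href='http://example.com/two'>that</a>.</p>";

        //same pattern as above, with the tag hard-coded to "a"
        Regex extractTags = new Regex(
            @"<a[^>]*?HREF\s*=\s*[""']?([^'"" >]+?)[ '""]?>",
            RegexOptions.IgnoreCase | RegexOptions.Compiled);

        foreach (Match m in extractTags.Matches(html))
        {
            //Groups[1] is the value captured by ([^'"" >]+?), i.e. the href itself
            Console.WriteLine(m.Groups[1].Value);
        }
        //Output:
        //http://example.com/one
        //http://example.com/two
    }
}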


The main loop of the crawler goes through the list of predefined pages, calling the addtoindex function on each one. That function can be used to store links and their text, but for the sake of this post it just prints the URL. The loop then uses the regular expression mentioned above to get all the links on each page and adds their URLs to a list called newpages. At the end of each iteration, newpages becomes pages, and the process repeats up to the given depth.

Here is the complete code in C#:


using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using System.Collections;
using System.Net;
using System.IO;

namespace Crawler
{
    class Program
    {
        private static String strText;
        static MatchCollection tagCollection;
        public static HttpWebRequest req;
        public static HttpWebResponse res;
        static Stream resStream;
        public static string baseUrl;

        static void Main(string[] args)
        {
            //add the specific site that you want to crawl
            baseUrl = "http://www.techcrunch.com/";

            ArrayList pages = new ArrayList();
            pages.Add(baseUrl);

            //start crawling, going at most 20 levels deep
            crawl(pages, 20);

            Console.WriteLine("\nIndexing Complete!!");
            Console.ReadLine();
        }

        public static void crawl(ArrayList pages, int depth)
        {
            MatchCollection mc;
            ArrayList links = new ArrayList();

            //Breadth-first search: index the current set of pages, collect their links,
            //and use those links as the set of pages for the next iteration
            for (int i = 0; i < depth; i++)
            {
                ArrayList newpages = new ArrayList();

                foreach (String page in pages)
                {
                    try
                    {
                        //skip the page if the request did not succeed
                        if (!isValidUrl(page))
                        {
                            continue;
                        }
                        urlOpen();
                    }
                    catch (Exception ex)
                    {
                        Console.WriteLine("Could not open {0} because {1}", page, ex.ToString());
                        continue;
                    }

                    string pagecontent = read();

                    //adding the page to the index
                    addtoindex(page, pagecontent);

                    //all the <a> tags on this page
                    mc = tagList(pagecontent, "a");

                    //the href values of those tags, resolved against the base URL
                    links = getAttributeValue(mc, "href", baseUrl);

                    foreach (string link in links)
                    {
                        String url, linktext;
                        url = linktext = null;

                        //drop the fragment (#...) part of the link, if any
                        if (link.Contains("#"))
                        {
                            try
                            {
                                url = link.Substring(0, link.IndexOf("#"));
                            }
                            catch (Exception ex)
                            {
                                Console.WriteLine("Error in Crawl " + ex.Message + " - " + link);
                            }
                        }
                        else
                        {
                            url = link;
                        }

                        try
                        {
                            //queue the link for the next round if it is an absolute
                            //http URL that has not been indexed yet
                            if ((url.Substring(0, 4) == "http") && (isindexed(url) == false))
                            {
                                newpages.Add(url);
                            }
                        }
                        catch (Exception ex)
                        {
                            Console.WriteLine("Could not add new page " + url + " b/c {0}", ex.ToString());
                        }

                        //plain text of the page; this is what would be stored alongside the link
                        linktext = gettextonly(pagecontent);
                    }
                }
                pages = newpages;
            }
        }

        //Returns false for now, but can be modified to query a database to check
        //whether a page has already been indexed
        public static bool isindexed(string url)
        {
            return false;
        }

        //Add the page to the index; this is where a database or file system can be used
        public static void addtoindex(string url, string pagecontent)
        {
            Console.WriteLine("Indexing : " + url);
        }

        //Get the collection of <a> tags in a page
        public static MatchCollection tagList(String HTMLcontent, String tag)
        {
            Regex extractTags = new Regex(@"<" + tag + @"[^>]*?HREF\s*=\s*[""']?([^'"" >]+?)[ '""]?>", RegexOptions.IgnoreCase | RegexOptions.Compiled);
            try
            {
                tagCollection = extractTags.Matches(HTMLcontent);
                return tagCollection;
            }
            catch (Exception ex)
            {
                Console.WriteLine(ex.ToString());
            }
            return null;
        }

        //Gets the HREF value from each <a> tag and resolves relative links against the base URL
        public static ArrayList getAttributeValue(MatchCollection mc, String Attr, string url)
        {
            ArrayList links = new ArrayList();

            foreach (Match match in mc)
            {
                string temp = match.Value;

                try
                {
                    if (temp.Contains("http"))
                    {
                        //absolute http link: take everything between href=" and the closing >
                        links.Add(temp.Substring(temp.IndexOf("href") + 6, temp.LastIndexOf(">") - temp.IndexOf("http") - 1));
                    }
                    else if (temp.Contains("://"))
                    {
                        //absolute link with some other scheme
                        links.Add(temp.Substring(temp.IndexOf("href") + 6, temp.LastIndexOf(">") - (temp.IndexOf("href") + 7)));
                    }
                    else
                    {
                        //relative link: prepend the base URL
                        string strTemp = temp.Substring(temp.IndexOf("href") + 6, temp.LastIndexOf(">") - (temp.IndexOf("href") + 7));
                        url = url.Replace("\n\r", "");
                        if (strTemp[0] != '/' && url[url.Length - 1] != '/')
                        {
                            strTemp = url + "/" + strTemp;
                        }
                        else
                        {
                            strTemp = url + strTemp;
                        }
                        links.Add(strTemp);
                    }
                }
                catch (Exception ex)
                {
                    Console.WriteLine("Error in GetAttributes :" + ex.Message + " - " + url);
                }
            }
            return links;
        }

        //Strips the HTML tags from the page content, leaving only the text
        public static string gettextonly(string pagecontent)
        {
            string pattern = @"<(.|\n)*?>";
            return Regex.Replace(pagecontent, pattern, String.Empty);
        }

        //Reads the content of the response stream opened by urlOpen
        public static String read()
        {
            StreamReader sr = new StreamReader(resStream);
            strText = sr.ReadToEnd();
            return strText;
        }

        //Opens the response stream of the last successful request
        public static void urlOpen()
        {
            resStream = res.GetResponseStream();
        }

        //Requests the URL and returns true if the server responds with 200 OK
        public static bool isValidUrl(String url)
        {
            try
            {
                req = (HttpWebRequest)HttpWebRequest.Create(url);
                res = (HttpWebResponse)req.GetResponse();
                return (res.StatusCode == HttpStatusCode.OK);
            }
            catch (Exception ex)
            {
                Console.WriteLine("Error in ISValidURL " + ex.Message + " - " + url);
                return false;
            }
        }
    }
}
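
The isindexed and addtoindex stubs are where a real index would plug in. Just as a minimal sketch (not something the code above does), the two methods could be replaced inside the Program class with an in-memory HashSet<string> so that the crawler does not queue the same URL twice in a single run; a database or file store would take its place for anything larger. HashSet<string> only needs System.Collections.Generic, which is already among the usings.

        //A minimal sketch: keep an in-memory set of URLs that have already been indexed
        private static HashSet<string> index = new HashSet<string>();

        public static bool isindexed(string url)
        {
            //true if addtoindex has already seen this URL
            return index.Contains(url);
        }

        public static void addtoindex(string url, string pagecontent)
        {
            index.Add(url);
            Console.WriteLine("Indexing : " + url);
            //pagecontent could be written to a file or database here
        }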


Any suggestions are welcome.