Mining the Web: August 2009

Vector Space Model (VSM) is widely used in Information Retrieval systems. The model creates a space in which both documents and queries are represented by vectors. VSM treats the query as a document as well and then it determines similarity between query and other documents by calculating the deviation of angles between each document vector and the original query vector. It then uses the deviation to rank documents.

I implemented the model in C# using the example described in Mi Islita. The program will display elements of a dictionary object that hold the name of each document and its similarity value.

You can use any sorting algorithm to sort the dictionary by the similarity values. The program also assumes that there is a string array "docs" which contains the query's (at zero index) and the documents' text.

Here is the code:

using System;
using System.Collections.Generic;
using System.Text;
using System.Text.RegularExpressions;
using System.Collections;
using System.Linq;
namespace VectorSpaceModel
{
class Program
{
static Hashtable DTVector = new Hashtable(); //Hashtable to hold Document Term Vector
static List wordlist = new List(); //List of terms found in documents
static Dictionary sortedList = new Dictionary(); //Documents ranked by VSM with angle value
static string[] docs ={"gold silver truck", //Query
"shipment of gold damaged in a fire",//Doc 1
"delivery of silver arrived in a silver truck", //Doc 2
"shipment of gold arrived in a truck"}; //Doc 3

static void Main(string[] args)
{
createWordList();
createVector();
classify();

var dict = sortedList;
foreach (var x in dict.Reverse())
{
Console.WriteLine("{0} -> Doc{1}", x.Key, x.Value);
}

Console.ReadLine();
}

public static void createWordList()
{
foreach (string doc in docs)
{
wordlist = getWordList(wordlist, doc);
}
}

public static List getWordList(List wordlist, string query)
{
Regex exp = new Regex("\\w+", RegexOptions.IgnoreCase);
MatchCollection MCollection = exp.Matches(query);

foreach (Match match in MCollection)
{
if (!wordlist.Contains(match.Value))
{
wordlist.Add(match.Value);
}
}

return wordlist;
}

public static void createVector()
{
double[] queryvector;

for (int j = 0; j < docs.Length; j++)
{
queryvector = new double[wordlist.Count];

for (int i = 0; i < wordlist.Count; i++)
{

double tfIDF = getTF(docs[j], wordlist[i]) * getIDF(wordlist[i]);
queryvector[i] = tfIDF;
}

if (j == 0) //is it a query?
{
DTVector.Add("Query", queryvector);
}
else
{
DTVector.Add(j.ToString(), queryvector);
}
}
}

public static void classify()
{
double temp = 0.0;

IDictionaryEnumerator _enumerator = DTVector.GetEnumerator();

double[] queryvector = new double[wordlist.Count];

Array.Copy((double[])DTVector["Query"], queryvector, wordlist.Count);

while (_enumerator.MoveNext())
{
if (_enumerator.Key.ToString() != "Query")
{
temp = cosinetheta(queryvector, (double[])_enumerator.Value);

sortedList.Add(temp, _enumerator.Key.ToString());

}
}
}

public static double dotproduct(double[] v1, double[] v2)
{
double product = 0.0;
if (v1.Length == v2.Length)
{
for (int i = 0; i < v1.Length; i++)
{
product += v1[i] * v2[i];
}
}
return product;
}

public static double vectorlength(double[] vector)
{
double length = 0.0;
for (int i = 0; i < vector.Length; i++)
{
length += Math.Pow(vector[i], 2);
}

return Math.Sqrt(length);
}
private static double getTF(string document, string term)
{
string[] queryTerms = Regex.Split(document, "\\s");
double count = 0;

foreach (string t in queryTerms)
{
if (t == term)
{
count++;
}
}
return count;

}

private static double getIDF(string term)
{
double df = 0.0;
//get term frequency of all of the sentences except for the query
for (int i = 1; i < docs.Length; i++)
{
if (docs[i].Contains(term))
{
df++;
}
}

//Get sentence count
double D = docs.Length - 1; //excluding the query

double IDF = 0.0;

if (df > 0)
{
IDF = Math.Log(D / df);
}

return IDF;
}

public static double cosinetheta(double[] v1, double[] v2)
{
double lengthV1 = vectorlength(v1);
double lengthV2 = vectorlength(v2);

double dotprod = dotproduct(v1, v2);

return dotprod / (lengthV1 * lengthV2);

}
}
}

Mining the Web

Monday, August 17, 2009

Classic Vector Space Model in C#