"""Calculate cosine similarity between documents.

This script calculates the cosine similarity between several text
documents. At scale, this method can be used to identify similar
documents within a larger corpus.
"""

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords

# Standard English stop words, excluded from the TF-IDF vocabulary.
# NOTE(review): assumes the NLTK 'stopwords' corpus has already been
# downloaded (nltk.download('stopwords')) — confirm in the environment.
STOP_WORDS = stopwords.words('english')

# Transcript files to score against the query string.
DOCUMENT_PATHS = [
    '/Users/Daniel/Documents/Development/Transcript-Data/Test/admin/0102HD41.txt',
    '/Users/Daniel/Documents/Development/Transcript-Data/Test/admin/0107HD42.txt',
    '/Users/Daniel/Documents/Development/Transcript-Data/Test/admin/0107HD40.txt',
]

# Query string used to test the similarity scoring (kept verbatim).
TRAIN_STRING = 'By these proceedings for judicial review the Claimant seeks to challenge the decision of the Defendant dated the 23rd of May 2014 refusing the Claimant’s application of the 3rd of January 2012 for naturalisation as a British citizen'


def _read_document(path):
    """Return the full text of the file at *path*.

    Uses a context manager so each file handle is closed promptly;
    the original reused a single handle and never closed any of them.
    """
    with open(path) as f:
        return f.read()


def score_similarity(query, documents, stop_words=None):
    """Score *query* against each document with TF-IDF cosine similarity.

    Parameters
    ----------
    query : str
        The reference text.
    documents : sequence of str
        Texts to score against the query.
    stop_words : list of str, optional
        Words excluded from the TF-IDF vocabulary.

    Returns
    -------
    numpy.ndarray
        Shape (1, len(documents) + 1); entry 0 is the query scored
        against itself (always ~1.0), followed by one score per document.
    """
    vectorizer = TfidfVectorizer(stop_words=stop_words)
    # Fit on the query plus all documents so every text shares one vocabulary.
    matrix = vectorizer.fit_transform([query] + list(documents))
    # Compare row 0 (the query) against every row, itself included.
    return cosine_similarity(matrix[0:1], matrix)


def main():
    """Read the transcript files and print their similarity scores."""
    print("\nCalculating document similarity scores...")
    documents = [_read_document(path) for path in DOCUMENT_PATHS]
    print("\nSimilarity Score [*] ",
          score_similarity(TRAIN_STRING, documents, STOP_WORDS))


if __name__ == '__main__':
    main()