Machine Learning

Calculating cosine similarity between documents

This script calculates the cosine similarity between several text documents. At scale, this method can be used to identify similar documents within a larger corpus.

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords

# Bring in standard stopwords
# (NLTK's English list; the 'stopwords' corpus must already be downloaded.)
stopWords = stopwords.words('english')

print("\nCalculating document similarity scores...")


def _read_document(path):
    """Return the full text of the file at *path*.

    The file is opened in text mode with the platform default encoding and
    is closed as soon as its contents have been read.
    """
    with open(path) as handle:
        return handle.read()


# Open and read a bunch of files
# NOTE(review): in the published listing each of these assignments was cut
# off after `str(`; reading the whole file into a string is the presumed
# intent -- confirm against the original script.
doc1 = _read_document('/Users/Daniel/Documents/Development/Transcript-Data/Test/admin/0102HD41.txt')
doc2 = _read_document('/Users/Daniel/Documents/Development/Transcript-Data/Test/admin/0107HD42.txt')
doc3 = _read_document('/Users/Daniel/Documents/Development/Transcript-Data/Test/admin/0107HD40.txt')

# Create a string to use to test the similarity scoring

train_string = 'By these proceedings for judicial review the Claimant seeks to challenge the decision of the Defendant dated the 23rd of May 2014 refusing the Claimant’s application of the 3rd of January 2012 for naturalisation as a British citizen'

# Construct the training set as a list; index 0 is the reference string that
# every other entry is scored against below.
train_set = [train_string, doc1, doc2, doc3]

# Set up the vectoriser, passing in the stop words
tfidf_vectorizer = TfidfVectorizer(stop_words=stopWords)

# Apply the vectoriser to the training set (one tf-idf row per entry)
tfidf_matrix_train = tfidf_vectorizer.fit_transform(train_set)

# Print the score: row 0 (train_string) is compared with every row of the
# matrix, so the output holds one score per entry in train_set and the first
# score is train_string measured against itself.
print("\nSimilarity Score [*] ", cosine_similarity(tfidf_matrix_train[0:1], tfidf_matrix_train))

Naive Bayes Document Classifier with Scikit-Learn

The following code demonstrates a relatively simple example of a Naive Bayes classifier applied to a small batch of case law.

import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
import numpy as np
from sklearn import datasets
from pprint import pprint
from sklearn.model_selection import train_test_split
from sklearn import svm

# Declare the categories
categories = ['Crime', 'Family']

# Load the dataset
# Only the folders named in `categories` are loaded; shuffling is seeded with
# random_state=0 so the document order is reproducible between runs.
docs_to_train = sklearn.datasets.load_files(
    "/Users/danielhoadley/Documents/Development/Python/Test_Data",
    description=None, categories=categories, load_content=True,
    shuffle=True, encoding='utf-8', decode_error='strict', random_state=0)

# NOTE(review): the published listing was truncated wherever an argument of
# the form `docs_to_train.<attr>` appeared. The `.data` / `.target` arguments
# below are reconstructed to match the scikit-learn text tutorial this script
# follows -- confirm against the original script.

# Hold back 3 documents (an integer test_size is an absolute sample count).
train_X, test_X, train_y, test_y = train_test_split(docs_to_train.data,
                                                    docs_to_train.target,
                                                    test_size=3)
print(len(docs_to_train.data))

print(train_X)

# Vectorise the dataset

count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(docs_to_train.data)

# Fit the estimator and transform the vector to tf-idf

# Term-frequency-only variant (use_idf=False). X_train_tf is not referenced
# again in this listing; it is kept for illustration.
tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
X_train_tf = tf_transformer.transform(X_train_counts)

# Full tf-idf weighting; this is the matrix the classifier is trained on.
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

# Train the naive Bayes classifier
# NOTE(review): as reconstructed, the model is fitted on the whole dataset,
# so the train/test split above is only printed, never used for evaluation.

clf = MultinomialNB().fit(X_train_tfidf, docs_to_train.target)

docs_new = ['The defendant used a knife.', 'This court will protect vulnerable adults', 'The appellant was sentenced to seven years']
# Reuse the already-fitted vectoriser and transformer (transform, not
# fit_transform) so the new documents share the training vocabulary.
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

predicted = clf.predict(X_new_tfidf)

# Print the results

# `predicted` holds integer label indices; target_names maps them back to
# the category names.
for doc, category in zip(docs_new, predicted):
    print('%r => %s' % (doc, docs_to_train.target_names[category]))

This renders the following output:

'The defendant used a knife.' => Crime
'This court will protect vulnerable adults' => Family
'The appellant was sentenced to seven years' => Crime