This little sample demonstrates several basic text-processing steps on a corpus of text files stored in a local directory:
- First, we read the corpus of text files into a list
- Second, we remove unwanted material, such as non-alphabetic characters and words consisting of a single character
- Third, we combine the standard NLTK English stopwords with a list of custom stopwords to strip noise from the corpus
- Finally, we use NLTK to calculate the most common words in each file of the corpus
```python
from __future__ import division
import glob
import os
import re
from nltk.corpus import stopwords
from nltk import FreqDist

# Bring in the default English NLTK stop words
stoplist = stopwords.words('english')

# Define additional stopwords in a string
additional_stopwords = """case law lawful judge judgment court mr justice would evidence mr order
defendant act decision make two london appeal section lord one applicant mr. may could said also
application whether made time first r miss give appellant november give fact within point sentence
question upon matter leave part given must notice public state taken course cannot circumstances j
that, offence set behalf however plaintiff see set say secretary regard - v claim right appeared
second put e way material view relation effect take might particular however, present court) october
b reasons basis far referred trial found lord, land consider authority subject necessary considered
0171 see,s council think legal shall respect ground three case, crown without 2 relevant and, special
business told clear paragraph person account letter therefore jury th solicitor use years mrs mr
provision discretion matters respondent concerned cases defence reason issue well count argument
facts gave proceedings position period needs approved used power us limited even either exercise
counsel applicants submission although counsel submitted st need appellants plaintiffs policy thomas
making tribunal action entitled affadavit december strand daniel transcript smith purpose refused
offence offences general counts terms grounds conclusion number reasonable prosecution home hearing
seems defendants educational clarke solicitors criminal following accept place come already accepted
required words local l;ater january provided stage report street september day sought greenwood
rather service accounts page hobhouse courts march third wilcock mind result months came learned
appropriate date instructed form division notes july went bernal official review principle
consideration affidavit held lordship another dr different notes quite royal possible instructed
shorthand development amount has months wc respondents took clearly since find satisfied members
later fleet took interest parties name change information co sum ec done provisions party hd paid"""

# Split the additional stopwords string on each word and then add
# those words to the NLTK stopwords list
stoplist += additional_stopwords.split()

# Define the files that make up the corpus to be modelled
file_list = glob.glob(os.path.join('/Users/danielhoadley/PycharmProjects/topicvis', '*.txt'))

# Construct an empty list into which the content of each file will be stored as an item
corpus = []

# Read the files
for file_path in file_list:
    with open(file_path) as f_input:
        content = f_input.read()
        # Remove anything that isn't a 'word'
        only_words = re.sub("[^a-zA-Z]", " ", content)
        # Remove any words consisting of a single character
        no_single = re.sub(r'(?:^| )\w(?:$| )', ' ', only_words).strip()
        corpus.append(no_single)

# Remove stopwords
texts = [[word for word in document.lower().split() if word not in stoplist]
         for document in corpus]

# Get the two most common words in each text
for text in texts:
    fdist = FreqDist(text)
    print(fdist.most_common(2))
```
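To see what the two regular expressions actually do, here is a minimal sketch. The sample sentence and the printed output are hypothetical, but the two `re.sub` calls are exactly the ones used in the script above:

```python
import re

sample = "The defendant's appeal (No. 2) was heard on 3 March."

# Step one: every character that isn't a letter becomes a space,
# so the apostrophe leaves a stray single-character token "s" behind
only_words = re.sub("[^a-zA-Z]", " ", sample)

# Step two: any token consisting of a single character is collapsed away
no_single = re.sub(r'(?:^| )\w(?:$| )', ' ', only_words).strip()

print(no_single.lower().split())
# ['the', 'defendant', 'appeal', 'no', 'was', 'heard', 'on', 'march']
```

At the end of the main script, `fdist.most_common(2)` prints the two most frequent remaining words in each file as a list of `(word, count)` pairs, e.g. `[('landlord', 98), ('tenant', 74)]` for a hypothetical landlord-and-tenant judgment.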