Na-Rae Han (naraehan@pitt.edu), 2017-07-13, Pittsburgh NEH Institute “Make Your Edition”
Before you start, make sure the inaugural folder is on your desktop.
Jupyter Notebook basics: the + button creates a new cell, the ► button runs it.
Alt+ENTER runs the cell and creates a new cell below.
Shift+ENTER runs the cell and moves on to the next cell.
More shortcuts: https://www.cheatography.com/weidadeyue/cheat-sheets/jupyter-notebook/
myfile = 'C:/Users/narae/Desktop/inaugural/1789-Washington.txt' # Use your own userid; Mac users should omit C:
wtxt = open(myfile).read()
print(wtxt[:500])
import nltk # Don't forget to import nltk
%pprint # Turn off/on pretty printing (prints too many lines)
# Build a token list
wtokens = nltk.word_tokenize(wtxt)
len(wtokens) # Number of words in text
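If word_tokenize raises a LookupError, the Punkt tokenizer data may need to be downloaded once (assuming a standard NLTK installation):
nltk.download('punkt')   # one-time download of the tokenizer models used by word_tokenize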
# Build a dictionary of word frequency count
wfreq = nltk.FreqDist(wtokens)
wfreq['the']
len(wfreq) # Number of unique words in text
wfreq.most_common(40) # 40 most common words
sentcount = wfreq['.'] + wfreq['?'] + wfreq['!'] # Assuming every sentence ends with ., ! or ?
print(sentcount)
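As a cross-check, NLTK's own sentence tokenizer can segment the text directly; a minimal sketch (wsents is just an illustrative name, and the count may differ slightly from the punctuation-based estimate above):
wsents = nltk.sent_tokenize(wtxt)   # Punkt-based sentence segmentation
len(wsents)                         # compare with sentcount above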
# Tokens include symbols and punctuation. First 50 tokens:
wtokens[:50]
wtokens_nosym = [t for t in wtokens if t.isalnum()] # alpha-numeric tokens only
len(wtokens_nosym)
# Try "n't", "20th", "."
"n't".isalnum()
# First 50 tokens, alpha-numeric tokens only:
wtokens_nosym[:50]
len(wtokens_nosym)/sentcount # Average sentence length in number of words
[w for w in wfreq if len(w) >= 13] # all 13+ character words
long = [w for w in wfreq if len(w) >= 13]
# sort long alphabetically using sorted()
for w in sorted(long):
    print(w, len(w), wfreq[w]) # long words tend to be less frequent
NLTK's PlaintextCorpusReader reads a whole folder of plain-text files as a corpus and provides word tokenization (.words()) and sentence tokenization (.sents()).
from nltk.corpus import PlaintextCorpusReader
corpus_root = 'C:/Users/Jane Eyre/Desktop/inaugural' # Use your own userid; Mac users should omit C:
inaug = PlaintextCorpusReader(corpus_root, '.*txt') # all files ending in 'txt'
# .txt file names as file IDs
inaug.fileids()
# NLTK automatically tokenizes the corpus. First 50 words:
print(inaug.words()[:50])
# You can also specify individual file ID. First 50 words from Obama 2009:
print(inaug.words('2009-Obama.txt')[:50])
# NLTK automatically segments sentences too, which are accessed through .sents()
print(inaug.sents('2009-Obama.txt')[0]) # first sentence
print(inaug.sents('2009-Obama.txt')[1]) # 2nd sentence
# How long are these speeches in terms of word and sentence count?
print('Washington 1789:', len(inaug.words('1789-Washington.txt')), len(inaug.sents('1789-Washington.txt')))
print('Obama 2009:', len(inaug.words('2009-Obama.txt')), len(inaug.sents('2009-Obama.txt')))
# for-loop through file IDs and print out word count.
# While looping, populate fid_avsent which holds avg sent lengths.
# Break long line with \, specify tab separator.
fid_avsent = []
for f in inaug.fileids():
    print(len(inaug.words(f)), len(inaug.sents(f)), \
          len(inaug.words(f)) / len(inaug.sents(f)), f, sep='\t')
    fid_avsent.append((len(inaug.words(f)) / len(inaug.sents(f)), f))
# Turn pretty print back on
%pprint
sorted(fid_avsent)
# same thing, with list comprehension!
fid_avsent2 = [(len(inaug.words(f)) / len(inaug.sents(f)), f) for f in inaug.fileids()]
sorted(fid_avsent2)
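Sorting in reverse order puts the speeches with the longest average sentences first (same data, different view; the [:5] slice is just for illustration):
sorted(fid_avsent2, reverse=True)[:5]   # five speeches with the longest average sentences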
# Corpus size in number of words
print(len(inaug.words()))
# Building word frequency distribution for the entire corpus
inaug_freq = nltk.FreqDist(inaug.words())
inaug_freq.most_common(100)
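FreqDist objects can also plot their counts; a minimal sketch, assuming matplotlib is installed:
inaug_freq.plot(30)   # plot the 30 most common words in the corpus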
re is Python's regular expression module. Start by importing it. re.findall() finds all substrings that match a pattern; write patterns with the r'...' (raw string) prefix.
import re
sent = "You haven't seen Star Wars...?"
re.findall(r'\w+', sent)
%pprint # Turn pretty printing back off before the long output below
re.findall(r'\w+', wtxt)
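These regex tokens can be counted and compared with the earlier isalnum()-filtered list; the totals will generally differ, since \w+ splits a contraction like "haven't" into two pieces ('haven', 't') while the isalnum() filter drops "n't" entirely (wtokens_re is just an illustrative name):
wtokens_re = re.findall(r'\w+', wtxt)
print(len(wtokens_re), len(wtokens_nosym))   # the two tokenization strategies disagree slightly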