# Onomatopoeic "sound words" from the boombox list, kept in their original order.
boombox_words = [
    'crackle', 'crash', 'bodyfall', 'boing', 'boom', 'buzz', 'chomp',
    'click', 'creak', 'flutter', 'glug', 'groan', 'honk', 'ahoogah',
    'jingle', 'neigh', 'poof', 'pop', 'puff', 'rattle', 'ribbit',
    'quack', 'rustle', 'rumble', 'scream', 'screech', 'skid', 'slurp',
    'splash', 'splat', 'splatter', 'squawk', 'squeak', 'squish', 'swish',
    'swoosh', 'thunk', 'twang', 'whip crack', 'whoosh', 'woof', 'yelp',
    'zap',
]
# Sanity check: confirm the container really is a list.
print(type(boombox_words))
# Pool the boombox list with the two other sound-word lists, then drop repeats.
words = boombox_words + sound_words + soundwords
# dict keys keep first-insertion order, so each entry survives once, in the
# position where it first appeared.
word = [*dict.fromkeys(words)]
# Bare expression: the count is only shown in an interactive session.
len(word)
import PyPDF2
# First attempt: probe 'Fahrenheit 451.pdf' by reading its page count and the
# text of its first page. The handle is opened in a `with` block so it is
# closed again once the probe is done (this file is not used afterwards).
with open('Fahrenheit 451.pdf', 'rb') as Frn451:
    Fahrenheit451 = PyPDF2.PdfFileReader(Frn451)
    Fahrenheit451.numPages
    page1 = Fahrenheit451.getPage(0)
    page1.extractText()  # the first PDF couldn't be read
# Second attempt: open 'F451.pdf' and extract the text of its first page.
# FRN451 is not closed here; the same handle is passed to PdfFileReader again
# further down, in "the loop that finally worked".
FRN451 = open('F451.pdf', 'rb')
F451 = PyPDF2.PdfFileReader(FRN451)
pg1 = F451.getPage(0)
# Bare expression: the extracted text is only shown in an interactive session.
pg1.extractText()
# Print the extracted text of every page of 'F451.pdf', one page at a time,
# each framed by a dashed separator. The `with` block guarantees the handle is
# closed even if extraction of some page raises.
with open('F451.pdf', 'rb') as pdf:
    F451 = PyPDF2.PdfFileReader(pdf)
    numOfPages = F451.getNumPages()
    for i in range(numOfPages):
        print("Page Number: " + str(i))
        print("- - - - - - - - - - - - - - - - - - - -")
        pageObj = F451.getPage(i)
        print(pageObj.extractText())
        print("- - - - - - - - - - - - - - - - - - - -")
# The output looked relatively normal, but could not be compared against the
# list of sound words.
# The loop that finally worked: concatenate the extracted text of every page
# into a single string.
# FRN451 is the handle opened earlier on 'F451.pdf'; it is left open here.
# NOTE(review): close FRN451 once F451 is no longer needed.
F451 = PyPDF2.PdfFileReader(FRN451)
num_pages = F451.numPages
text = "".join(
    F451.getPage(page_index).extractText() for page_index in range(num_pages)
)

# An empty result presumably means PyPDF2 found no text layer (e.g. a scanned
# PDF), so fall back to OCR through textract.
if not text:
    # Imported here because it is only needed on the fallback path.
    import textract

    # The original passed `fileurl`, which is never defined in the visible
    # code; use the path of the file that is actually being read instead.
    # textract.process returns bytes, so decode to keep `text` a str for the
    # tokenizer that consumes it later.
    text = textract.process(
        FRN451.name, method='tesseract', language='eng'
    ).decode('utf-8')
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
# Tokenize the extracted text, drop punctuation tokens, and count how many
# distinct sound words occur in the text.
tokens = word_tokenize(text)
punctuations = ['(',')',';',':','[',']',',', '.', '\n\nby', '\n\n', '\n']
# The loop variable is `token`, not `word`, so it no longer shadows the
# module-level `word` list that is used on the last line.
keywords = [token for token in tokens if token not in punctuations]
# Compare case-insensitively so that capitalised tokens (e.g. at the start of
# a sentence) still match the sound words.
# NOTE(review): multi-word entries such as 'whip crack' presumably can never
# equal a single token - confirm whether they need separate handling.
# Bare expression: the count is only shown in an interactive session.
len({token.lower() for token in keywords} & {entry.lower() for entry in word})