Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 40 additions & 3 deletions frequency.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
Project Gutenberg """

import string
import operator


def get_word_list(file_name):
    """Count how often each word occurs in a Project Gutenberg text file.

    Every line up to and including the one containing the
    'START OF THIS PROJECT GUTENBERG EBOOK' marker is treated as header
    and skipped.  If the marker is not present, the whole file is counted.

    Each whitespace-separated token has leading and trailing punctuation
    removed (punctuation inside a token, such as an apostrophe, is kept)
    and is converted to lower case.  Tokens that consist only of
    punctuation are ignored.

    file_name: path of the text file to read
    returns: the ``items()`` view of a dict mapping each word to the
             number of times it occurs, i.e. (word, count) pairs
    """
    start_marker = 'START OF THIS PROJECT GUTENBERG EBOOK'

    # The context manager closes the file even if reading fails.
    with open(file_name, 'r') as book:
        lines = book.readlines()

    # Find the first line after the header marker; default to the top of
    # the file so that a missing marker (or an empty file) does not crash.
    body_start = 0
    for index, line in enumerate(lines):
        if start_marker in line:
            body_start = index + 1
            break

    counts = {}
    for line in lines[body_start:]:
        for token in line.split():
            word = token.strip(string.punctuation).lower()
            if not word:
                # The token was nothing but punctuation (e.g. "--").
                continue
            # The first occurrence counts as 1, later ones increment.
            counts[word] = counts.get(word, 0) + 1

    return counts.items()




def get_top_n_words(word_list, n):
    """Return the n entries with the highest counts, most frequent first.

    word_list: an iterable of (word, count) pairs, such as the ``items()``
               of a word-count dict
    n: the maximum number of entries to return
    returns: a list of at most n (word, count) tuples ordered from the
             highest count to the lowest.  Entries with equal counts
             appear in the reverse of their order in word_list.  A zero
             or negative n yields an empty list.
    """
    # Without this guard a negative n would slice from the end and return
    # almost the whole ranking.  None is let through: it slices everything.
    if n is not None and n <= 0:
        return []

    # Sort ascending by count, then flip so the largest counts come first.
    ranked = sorted(word_list, key=operator.itemgetter(1))
    ranked.reverse()
    return ranked[:n]

if __name__ == "__main__":
    # Script entry point: announce the run, show the punctuation set used
    # for stripping, then print the 100 highest-count entries of the book.
    print("Running WordFrequency Toolbox")
    print(string.punctuation)
    print(get_top_n_words(get_word_list("pg32325.txt"), 100))