Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
57 changes: 48 additions & 9 deletions frequency.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,28 @@
""" Analyzes the word frequencies in a book downloaded from
Project Gutenberg """
"""
@ilya-besancon
Analyzes the word frequencies in a book downloaded from
Project Gutenberg.
In this case, I look at Martin Eden.
"""

import re
import string
from collections import Counter


def get_word_list(file_name, n):
    """Read a Project Gutenberg book and return its n most frequent words.

    The text is split into runs of word characters, so punctuation and
    whitespace are dropped, and every word is converted to lower case.
    The Project Gutenberg header and footer are not removed, so their
    words are counted as well.

    file_name: path to the plain-text book
    n: the number of words to return
    returns: the result of get_top_n_words for the words of the book
    """
    # The with-block closes the file even if reading raises.
    with open(file_name, 'r') as book:
        allwords = book.read()
    # Raw string: '\w' in a plain literal is an invalid escape sequence.
    word_list = [word.lower() for word in re.findall(r'\w+', allwords)]
    return get_top_n_words(word_list, n)


def get_top_n_words(word_list, n):
    """Return the n most frequently occurring words in word_list.

    word_list: a list of words (assumed to all be in lower case with no
               punctuation)
    n: the number of words to return
    returns: a list of n most frequently occurring words ordered from most
             frequently to least frequently occurring. Words with equal
             counts keep the order in which they first appear in
             word_list. If there are fewer than n distinct words, all of
             them are returned.
    """
    # Counter tallies every word in one pass. most_common ranks by count
    # without merging words that share a count; inverting the tally into a
    # {count: word} dict would keep only one word per distinct count.
    return [word for word, _ in Counter(word_list).most_common(n)]


if __name__ == "__main__":
    # Script entry point: report the most frequent words of one book.
    print("Running WordFrequency Toolbox. This might take a second.")
    # number of top words to report
    n = 10
    # get_word_list reads the file and returns the n most common words
    result = get_word_list('martineden.txt', n)
    print("Top ", n, 'words: ', result)
    print("Wouldn't you think the author might be more creative? Joking!")
Loading