diff --git a/.gitignore b/.gitignore index e77c838..53d0d79 100644 --- a/.gitignore +++ b/.gitignore @@ -153,3 +153,4 @@ cython_debug/ *.pxd *.pdf .DS_Store +*.png diff --git a/generate_visualizations.py b/generate_visualizations.py index 0c8b5e1..1ef16cc 100644 --- a/generate_visualizations.py +++ b/generate_visualizations.py @@ -1,4 +1,5 @@ import pandas as pd +import glob import seaborn as sns import matplotlib.pyplot as plt import requests @@ -14,22 +15,12 @@ def download_white_paper_titles(): open("whitepapers.csv", "wb").write(response.content) -def download_white_paper_pdfs(): - url = 'http://nationalacademies.org/docs/DA524E6332D2E049F5FED8AB6FEA10B33B6B580AE727' - response = requests.get(url) - open("solar_helio_white_papers.pdf", "wb").write(response.content) - - url = 'https://nationalacademies.org/docs/DA27F10CA4B39D78D506D796D23AA28CF497E2189520' - response = requests.get(url) - open("atmo_iono_magneto_white_papers.pdf", "wb").write(response.content) - - url = 'https://nationalacademies.org/docs/D74AA75ADB2D476CAEC22D602FAF0FEC83E61398725A' - response = requests.get(url) - open('solar_wind_magneto_white_papers.pdf', 'wb').write(response.content) - - url = 'https://nationalacademies.org/docs/D67F14C61DF510CE8E01E269D4B5B789A678C161036C' - response = requests.get(url) - open('general_white_papers.pdf', 'wb').write(response.content) +def download_white_paper_pdfs(df_titles): + for i, url in enumerate(df_titles['1:Upload White Paper  '].values): + response = requests.get(url) + title = df_titles['White Paper Title'].values[i] + title = title.replace("/","-") + open('white_papers/' + title + '.pdf', 'wb').write(response.content) def read_white_paper_titles(): @@ -42,22 +33,13 @@ def read_white_paper_titles(): def read_white_paper_pdfs(): - reader = PdfReader('solar_helio_white_papers.pdf') - text = '' - for page in reader.pages: - text+=page.extract_text() - - reader = PdfReader('atmo_iono_magneto_white_papers.pdf') - for page in reader.pages: - text+=page.extract_text() + filenames = glob.glob("white_papers/*.pdf") - reader = PdfReader('solar_wind_magneto_white_papers.pdf') - for page in reader.pages: - text+=page.extract_text() - - reader = PdfReader('general_white_papers.pdf') - for page in reader.pages: - text+=page.extract_text() + text = '' + for file in filenames: + reader = PdfReader(file) + for page in reader.pages: + text+=page.extract_text() return text @@ -86,16 +68,16 @@ def make_pie_chart(df): fig.savefig("pie.png") -def make_word_cloud(text): +def make_word_cloud(text, shape='fas fa-sun'): stop_words = get_stop_words('english') stop_words.extend(list(string.ascii_lowercase)) - stop_words.extend(['et al', 'et', 'al', 'et al.', 'physic', 'geophys', 'doi', 'two', 'thu', 'space physic', 'res lett', 'provide', 'can', 'th', 'de', 'also', 're', 'res', 'lett', 'res lett', 'will', 'however']) + stop_words.extend(['et al', 'et', 'al', 'et al.', 'physic', 'geophys', 'doi', 'two', 'thu', 'space physic', 'res lett', 'provide', 'can', 'th', 'de', 'also', 're', 'res', 'lett', 'res lett', 'will', 'however', 'org', 'well', 'within', 'white paper', 'doi', 'http', 'https', 'figure', 'observation', 'observations', 'measurement', 'understanding', 'journal']) stylecloud.gen_stylecloud(text=text, - icon_name='fas fa-sun', + icon_name=shape, # To select the shape, pick a name from https://fontawesome.com/icons?d=gallery&m=free palette='colorbrewer.diverging.Spectral_11', background_color='black', gradient='radial', - size=1024, + size=1920, custom_stopwords=stop_words) @@ -103,9 +85,10 @@ def make_word_cloud(text): do_download = False if do_download: download_white_paper_titles() - download_white_paper_pdfs() - df_titles = read_white_paper_titles() + if do_download: + download_white_paper_pdfs(df_titles) + text_papers = read_white_paper_pdfs() #plot_data(df_titles) generate_stats(df_titles, text_papers) diff --git a/heliodecadal word cloud papers.png b/heliodecadal word cloud papers.png index 9e0e1aa..a60de15 100644 Binary files a/heliodecadal word cloud papers.png and b/heliodecadal word cloud papers.png differ diff --git a/merge_pdfs.py b/merge_pdfs.py new file mode 100644 index 0000000..3e7f951 --- /dev/null +++ b/merge_pdfs.py @@ -0,0 +1,10 @@ +from PyPDF2 import PdfMerger, PdfReader +import glob + +mergedObject = PdfMerger() +filenames = glob.glob("white_papers/*.pdf") + +for file in filenames: + mergedObject.append(PdfReader(file, 'rb')) + +mergedObject.write("All Decadal White Papers.pdf") \ No newline at end of file