"""Submit a DOI record for a KBase static Narrative to OSTI.

Scrapes the static Narrative HTML for title/authors/date/abstract/keywords,
pulls author contact details from a "User Super Summary" spreadsheet, and
posts (or reserves) the assembled record via the OSTI API.  OSTI's response
(which carries the issued DOI) is saved locally as an XML backup.

Usage: python submit_doi.py <account> <password> <user_data.xlsx> <narrative_url>
"""
import argparse
import datetime
import re

import pandas as pd

# Matches a DOI such as "10.25982/1.2/345678".  NOTE: the dot after "10" is
# escaped and the pattern is a raw string -- the original '10.[0-9]*/\S*'
# let "." match any character.
DOI_PATTERN = re.compile(r"10\.[0-9]*/\S*")


def parse_narrative_url(url):
    """Split a static Narrative URL into (wsid, version, doi_infix).

    Expects a trailing slash, e.g. "https://kbase.us/n/12345/6/" returns
    ("12345", "6", "12345.6").
    """
    wsid = url[url.find("/n/") + 3 : url.rfind("/", 0, -1)]
    version = url[url.rfind("/", 0, -1) + 1 : -1]
    return wsid, version, "{}.{}".format(wsid, version)


def extract_dois(texts):
    """Return the unique DOIs found in an iterable of strings, in order.

    Only strings containing 'doi' (lowercase, matching the original
    case-sensitive check) are scanned; trailing periods are stripped so
    sentence-final DOIs are not corrupted.
    """
    dois = []
    for text in texts:
        if "doi" not in text:
            continue
        match = DOI_PATTERN.search(text.lower())
        # Guard against "doi" appearing without a parseable identifier --
        # the original indexed re.search(...)[0] and raised TypeError here.
        if match is None:
            continue
        doi = match[0].strip(".")
        if doi not in dois:
            dois.append(doi)
    return dois


def _author_entry(anchor, usersummary):
    """Build one OSTI author dict from a Narrative author <a> element.

    Looks the display name up in the User Super Summary frame for email,
    ORCID and institution ("blank" marks a missing cell); prompts on stdin
    when the name cannot be split unambiguously into first/last.
    """
    author_dict = {}
    # Summary info pulled up separately in case these need batching later.
    author_frame = usersummary.loc[usersummary["display_name"] == anchor.text]
    parts = anchor.text.split(" ")
    if len(parts) == 2:
        # Simple "First Last" profile name.
        author_dict["first_name"], author_dict["last_name"] = parts
    else:
        # Middle name, compound last name, or a single name: ask the operator.
        print("Enter name for: {}".format(anchor.text))
        author_dict["last_name"] = input("Family name: ")
        author_dict["first_name"] = input("Given name: ")
        author_dict["middle_name"] = input("Middle name: ")
    if author_frame["email"].to_list()[0] != "blank":
        author_dict["private_email"] = author_frame["email"].to_list()[0]
    if author_frame["orcid"].to_list()[0] != "blank":
        author_dict["orcid_id"] = author_frame["orcid"].to_list()[0]
    if author_frame["institution"].to_list()[0] != "blank":
        author_dict["affiliation_name"] = author_frame["institution"].to_list()[0]
    return author_dict


def gen_record(url, usersummary):
    """Scrape a static Narrative and assemble the OSTI submission record.

    Prompts on stdin for any required field missing from the page
    (abstract, contract numbers, keywords, ambiguous author names).
    """
    # Third-party deps imported lazily so the pure helpers above stay
    # importable without requests/bs4 installed.
    import requests
    from bs4 import BeautifulSoup

    soup = BeautifulSoup(requests.get(url).content, "html.parser")
    # Constants required on every KBase submission.
    dataset_type = "GD"
    BER = "USDOE Office of Science (SC), Biological and Environmental Research (BER)"
    # From the static Narrative URL itself.
    _wsid, _version, infix = parse_narrative_url(url)
    title = soup.find("title").text
    author_list = []
    research_orgs = []
    keywords = ""
    contract_nos = ""
    abstract = ""
    # Initialized so a page without a 'branding' div fails loudly at the
    # record-build step instead of raising NameError (original bug).
    pub_date = None
    for d in soup.find_all("div"):
        if d.get("class") == ["kb-author-list"]:
            for a in d.find_all("a"):
                entry = _author_entry(a, usersummary)
                affiliation = entry.get("affiliation_name")
                if affiliation is not None and affiliation not in research_orgs:
                    research_orgs.append(affiliation)
                author_list.append(entry)
        if d.get("class") == ["branding"]:
            # Branding text is "<word> Month DD, YYYY"; drop through the
            # first space, then reformat to MM/DD/YYYY for OSTI.
            datestring = d.text.strip("\n")
            datestring = datestring[datestring.find(" ") + 1 :]
            pub_date = datetime.datetime.strptime(datestring, "%B %d, %Y").strftime(
                "%m/%d/%Y"
            )
        # User defined/custom classes.
        if d.get("class") == ["user-abstract"]:
            abstract = d.text
    for m in soup.find_all("meta"):
        if m.get("name") == "user-keywords":
            keywords = m.get("content")
        if m.get("name") == "user-doi-funding":
            contract_nos = m.get("content")

    research_org = ";".join(research_orgs)
    # App DOIs are already in <li> elements, so users are asked to cite
    # likewise; collect every unique DOI found in a list item.
    doi_list = extract_dois(li.text for li in soup.find_all("li"))
    related_identifiers = [
        {
            "related_identifier": doi,
            "relation_type": "Cites",
            "related_identifier_type": "DOI",
        }
        for doi in doi_list
    ]

    # Manually enter abstract/keywords/contract numbers not found in HTML.
    if abstract == "":
        abstract = input("No abstract found. Enter manual value: ")
    if contract_nos == "":
        contract_nos = input("No contract numbers found. Enter manual value: ")
    if keywords == "":
        keywords = input("No keywords found. Enter manual value: ")
    if contract_nos == "":
        contract_nos = "N/A"
    return {
        "title": title,
        "dataset_type": dataset_type,
        "authors": author_list,
        "publication_date": pub_date,
        "site_url": url,
        "contract_nos": contract_nos,
        "sponsor_org": BER,
        "keyword": keywords,
        "description": abstract,
        "research_org": research_org,
        "doi_infix": infix,
        "related_identifiers": related_identifiers,
    }


def main():
    # Lazy imports: only needed when actually submitting.
    import ostiapi
    from dict2xml import dict2xml

    parser = argparse.ArgumentParser()
    parser.add_argument("account", help="Enter account name for submitting records")
    parser.add_argument("password", help="Enter account password")
    parser.add_argument("user_data", help="Enter file path for User Super Summary")
    parser.add_argument(
        "url", help="Enter URL for the static Narrative to receive the DOI"
    )
    parser.add_argument(
        "--reserve", help="Enter True to only reserve, not submit", action="store_true"
    )
    parser.add_argument(
        "--test_mode", help="Enter True to send record in testmode", action="store_true"
    )
    parser.add_argument(
        "--update_record", help="Enter OSTI record ID to update an existing record"
    )
    args = parser.parse_args()

    if args.test_mode:
        print("Operating in testmode")
        ostiapi.testmode()
    if args.update_record:
        print("Updating record {}".format(args.update_record))

    # SECURITY: the password is deliberately not echoed (the original
    # printed it in clear text).
    print("account", args.account)
    print("User data file", args.user_data)
    print("SN URL", args.url)

    # "blank" marks missing spreadsheet cells for the helpers above.
    usersummary = pd.read_excel(args.user_data).fillna("blank")
    record = gen_record(args.url, usersummary)
    if args.reserve:
        submit = ostiapi.reserve(record, args.account, args.password)
    else:
        submit = ostiapi.post(record, args.account, args.password)
    # Save OSTI's response (which carries the issued DOI) as a backup.
    # BUG FIX: the DOI lives in the *response*, not in the payload we built;
    # the original read record['record']['doi'], which raises KeyError.
    fname = submit["record"]["doi"].replace(".", "_").replace("/", "-")
    with open("{}.xml".format(fname), "w") as f:
        f.write(dict2xml(submit))


if __name__ == "__main__":
    main()
"""Fetch all KBase DOI records from OSTI and write a CSV summary.

Usage: python retrieve_records.py <account> <password>
Output: KBase_dois_<year>_<month>_<day>.csv with DOI/Title/URL/Status columns.
"""
import argparse
from datetime import datetime

import pandas as pd


def simplify_records(records):
    """Map each OSTI record dict to doi -> [title, site_url, status].

    Records without a 'site_url' key get the literal string 'None',
    preserving the original CSV output.  (The original wrapped the lookup
    in a bare ``except:``; this uses a narrow ``dict.get`` instead.)
    """
    simplified = {}
    for rec in records:
        simplified[rec["doi"]] = [
            rec["title"],
            rec.get("site_url", "None"),
            rec["@status"],
        ]
    return simplified


def main():
    # Imported lazily: ostiapi is the only dependency beyond stdlib/pandas,
    # so the module stays importable (and simplify_records testable) without it.
    import ostiapi

    parser = argparse.ArgumentParser()
    parser.add_argument("account", help="Enter account name for submitting records")
    parser.add_argument("password", help="Enter account password")
    args = parser.parse_args()
    records = ostiapi.get(
        {"site_input_code": "KBASE", "rows": 500}, args.account, args.password
    )["record"]
    df = pd.DataFrame.from_dict(
        simplify_records(records), orient="index", columns=["Title", "URL", "Status"]
    )
    df.index.name = "DOI"
    today = datetime.now()
    df.to_csv("KBase_dois_{}_{}_{}.csv".format(today.year, today.month, today.day))


if __name__ == "__main__":
    main()
datetime import argparse + today = datetime.now() parser = argparse.ArgumentParser() parser.add_argument("account", help="Enter account name for submitting records") -parser.add_argument("password",help="Enter account password") +parser.add_argument("password", help="Enter account password") args = parser.parse_args() account = args.account pw = args.password -records = ostiapi.get({'site_input_code':'KBASE','rows':500},account,pw)['record'] +records = ostiapi.get({"site_input_code": "KBASE", "rows": 500}, account, pw)["record"] simplified_records = {} for i in records: - title = i['title'] - doi = i['doi'] - status = i['@status'] + title = i["title"] + doi = i["doi"] + status = i["@status"] try: - url = i['site_url'] + url = i["site_url"] except: - url = 'None' - simplified_records[doi]=[title,url,status] -df = pd.DataFrame.from_dict(simplified_records,orient='index',columns=['Title','URL','Status']) -df.index.name='DOI' -df.to_csv('KBase_dois_{}_{}_{}.csv'.format(today.year,today.month,today.day)) \ No newline at end of file + url = "None" + simplified_records[doi] = [title, url, status] +df = pd.DataFrame.from_dict( + simplified_records, orient="index", columns=["Title", "URL", "Status"] +) +df.index.name = "DOI" +df.to_csv("KBase_dois_{}_{}_{}.csv".format(today.year, today.month, today.day)) diff --git a/osti_scripts/submit_doi.py b/osti_scripts/submit_doi.py index 78c00fd..0b95718 100644 --- a/osti_scripts/submit_doi.py +++ b/osti_scripts/submit_doi.py @@ -8,14 +8,21 @@ from bs4 import BeautifulSoup import requests import datetime + parser = argparse.ArgumentParser() parser.add_argument("account", help="Enter account name for submitting records") -parser.add_argument("password",help="Enter account password") -parser.add_argument("user_data",help="Enter file path for User Super Summary") -parser.add_argument("url",help="Enter URL for the static Narrative to receive the DOI") -parser.add_argument('--reserve',help='Enter True to only reserve, not 
submit',action='store_true') -parser.add_argument("--test_mode",help="Enter True to send record in testmode",action='store_true') -parser.add_argument("--update_record",help="Enter OSTI record ID to update an existing record") +parser.add_argument("password", help="Enter account password") +parser.add_argument("user_data", help="Enter file path for User Super Summary") +parser.add_argument("url", help="Enter URL for the static Narrative to receive the DOI") +parser.add_argument( + "--reserve", help="Enter True to only reserve, not submit", action="store_true" +) +parser.add_argument( + "--test_mode", help="Enter True to send record in testmode", action="store_true" +) +parser.add_argument( + "--update_record", help="Enter OSTI record ID to update an existing record" +) args = parser.parse_args() if args.test_mode: @@ -25,115 +32,130 @@ osti_id = args.update_record print("Updating record {}".format(osti_id)) -print("account",args.account) -print("password",args.password) -print("User data file",args.user_data) -print("SN URL",args.url) +print("account", args.account) +print("password", args.password) +print("User data file", args.user_data) +print("SN URL", args.url) usersummary = pd.read_excel(args.user_data) -usersummary = usersummary.fillna('blank') +usersummary = usersummary.fillna("blank") + + def gen_record(url): - ''' + """ Read through a static Narrative to find all the information required for DOI submission, and prompt for anything missing. - ''' - soup = BeautifulSoup(requests.get(url).content,'html.parser') + """ + soup = BeautifulSoup(requests.get(url).content, "html.parser") ## Constants - site_code = 'KBASE (U.S. Department of Energy Systems Biology Knowledgebase)' - dataset_type = 'GD' - BER = 'USDOE Office of Science (SC), Biological and Environmental Research (BER)' + site_code = "KBASE (U.S. 
Department of Energy Systems Biology Knowledgebase)" + dataset_type = "GD" + BER = "USDOE Office of Science (SC), Biological and Environmental Research (BER)" ## From static Narrative - wsid = url[url.find('/n/')+3:url.rfind('/',0,-1)] - version = url[url.rfind('/',0,-1)+1:-1] - infix = '{}.{}'.format(wsid,version) - title = soup.find('title').text + wsid = url[url.find("/n/") + 3 : url.rfind("/", 0, -1)] + version = url[url.rfind("/", 0, -1) + 1 : -1] + infix = "{}.{}".format(wsid, version) + title = soup.find("title").text author_list = [] research_orgs = [] - keywords = '' - contract_nos = '' - abstract = '' - for d in soup.find_all('div'): - if d.get('class')==['kb-author-list']: - for a in d.find_all('a'): + keywords = "" + contract_nos = "" + abstract = "" + for d in soup.find_all("div"): + if d.get("class") == ["kb-author-list"]: + for a in d.find_all("a"): author_dict = {} # User super summary info pulled up separately in case I need to batch these in the future - author_frame = usersummary.loc[usersummary['display_name']==a.text] + author_frame = usersummary.loc[usersummary["display_name"] == a.text] # If they only have first name and last name in their profile - if len(a.text.split(' '))==2: - author_dict['last_name'] = a.text.split(' ')[1] - author_dict['first_name'] = a.text.split(' ')[0] + if len(a.text.split(" ")) == 2: + author_dict["last_name"] = a.text.split(" ")[1] + author_dict["first_name"] = a.text.split(" ")[0] # If they have middle name or compound last name or only a single name in their profile else: print("Enter name for: {}".format(a.text)) - author_dict['last_name'] = input('Family name: ') - author_dict['first_name'] = input('Given name: ') - author_dict['middle_name'] = input('Middle name: ') - if author_frame['email'].to_list()[0]!='blank': - author_dict['private_email']=author_frame['email'].to_list()[0] - if author_frame['orcid'].to_list()[0]!='blank': - author_dict['orcid_id']=author_frame['orcid'].to_list()[0] - if 
author_frame['institution'].to_list()[0]!='blank': - author_dict['affiliation_name']=author_frame['institution'].to_list()[0] - if author_frame['institution'].to_list()[0] not in research_orgs: - research_orgs.append(author_frame['institution'].to_list()[0]) + author_dict["last_name"] = input("Family name: ") + author_dict["first_name"] = input("Given name: ") + author_dict["middle_name"] = input("Middle name: ") + if author_frame["email"].to_list()[0] != "blank": + author_dict["private_email"] = author_frame["email"].to_list()[0] + if author_frame["orcid"].to_list()[0] != "blank": + author_dict["orcid_id"] = author_frame["orcid"].to_list()[0] + if author_frame["institution"].to_list()[0] != "blank": + author_dict["affiliation_name"] = author_frame[ + "institution" + ].to_list()[0] + if author_frame["institution"].to_list()[0] not in research_orgs: + research_orgs.append(author_frame["institution"].to_list()[0]) author_list.append(author_dict) - if d.get('class')==['branding']: - datestring = d.text.strip('\n') - datestring = datestring[datestring.find(' ')+1:] - pub_date = datetime.datetime.strptime(datestring, '%B %d, %Y').strftime('%m/%d/%Y') + if d.get("class") == ["branding"]: + datestring = d.text.strip("\n") + datestring = datestring[datestring.find(" ") + 1 :] + pub_date = datetime.datetime.strptime(datestring, "%B %d, %Y").strftime( + "%m/%d/%Y" + ) # User defined/custom classes - if d.get('class') == ['user-abstract']: + if d.get("class") == ["user-abstract"]: abstract = d.text - for m in soup.find_all('meta'): - if m.get('name') == 'user-keywords': - keywords = m.get('content') - if m.get('name') == 'user-doi-funding': - contract_nos = m.get('content') - - research_org = '' + for m in soup.find_all("meta"): + if m.get("name") == "user-keywords": + keywords = m.get("content") + if m.get("name") == "user-doi-funding": + contract_nos = m.get("content") + + research_org = "" for ro in research_orgs: - research_org+=ro+';' + research_org += ro + ";" 
research_org = research_org[:-1] doi_list = [] # Finding all the DOIs with regex. App DOIs are already in
  • s, so asking users to do likewise - for l in soup.find_all('li'): - if l.text.find('doi')!=-1: - doi = re.search('10.[0-9]*/\S*',l.text.lower())[0].strip('.') + for l in soup.find_all("li"): + if l.text.find("doi") != -1: + doi = re.search("10.[0-9]*/\S*", l.text.lower())[0].strip(".") if doi not in doi_list: doi_list.append(doi) - related_identifiers = [{'related_identifier':x,'relation_type':"Cites",'related_identifier_type':"DOI"} for x in doi_list] - + related_identifiers = [ + { + "related_identifier": x, + "relation_type": "Cites", + "related_identifier_type": "DOI", + } + for x in doi_list + ] + # Manually entering abstract, keywords, contract numbers if they didn't include in HTML - if abstract == '': - abstract = input('No abstract found. Enter manual value: ') - if contract_nos == '': - contract_nos = input('No contract numbers found. Enter manual value: ') - if keywords == '': - keywords = input('No keywords found. Enter manual value: ') - if contract_nos == '': - contract_nos = 'N/A' + if abstract == "": + abstract = input("No abstract found. Enter manual value: ") + if contract_nos == "": + contract_nos = input("No contract numbers found. Enter manual value: ") + if keywords == "": + keywords = input("No keywords found. 
Enter manual value: ") + if contract_nos == "": + contract_nos = "N/A" # Building record dict record = { - 'title':title, - 'dataset_type':dataset_type, - 'authors':author_list, - 'publication_date':pub_date, - 'site_url':url, - 'contract_nos':contract_nos, - 'sponsor_org':BER, - 'keyword':keywords, - 'description':abstract, - 'research_org':research_org, - 'doi_infix':infix, - 'related_identifiers':related_identifiers + "title": title, + "dataset_type": dataset_type, + "authors": author_list, + "publication_date": pub_date, + "site_url": url, + "contract_nos": contract_nos, + "sponsor_org": BER, + "keyword": keywords, + "description": abstract, + "research_org": research_org, + "doi_infix": infix, + "related_identifiers": related_identifiers, } - return(record) + return record + + record = gen_record(args.url) if args.reserve: - submit = ostiapi.reserve(record,args.account,args.password) + submit = ostiapi.reserve(record, args.account, args.password) else: - submit = ostiapi.post(record,args.account,args.password) + submit = ostiapi.post(record, args.account, args.password) # Save the record with DOI from OSTI's as a backup -fname = record['record']['doi'].replace('.','_').replace('/','-') +fname = record["record"]["doi"].replace(".", "_").replace("/", "-") xml = dict2xml(submit) -with open('{}.xml'.format(fname),'w') as f: +with open("{}.xml".format(fname), "w") as f: f.write(xml)