Add download script and black formatting

tmorrell · tmorrell · commit eae1945ade98 · 2021-04-09T14:27:58.000-07:00
diff --git a/caltechdata_api/1639.json b/caltechdata_api/1639.json
@@ -0,0 +1 @@
+{"alternateIdentifiers": [{"alternateIdentifier": "1639", "alternateIdentifierType": "CaltechDATA_Identifier"}], "descriptions": [{"descriptionType": "Other", "description": "This repo contains Custom R scripts organized and summarized Choreography output files."}], "fundingReferences": [{"funderName": "Canadian Institutes of Health Research Doctoral Research Award"}, {"funderName": "Natural Sciences and Engineering Research Council"}, {"awardNumber": {"awardNumber": "PJT-165947"}, "funderName": "Canadian Institutes of Health Research"}], "language": "eng", "relatedIdentifiers": [{"relatedIdentifier": "10.17912/micropub.biology.000307", "relationType": "IsPartOf", "relatedIdentifierType": "DOI"}, {"relatedIdentifier": "https://github.com/troymcdiarmid/MWT_Wildtype_Auxin/releases/tag/v1.0", "relationType": "IsIdenticalTo", "relatedIdentifierType": "URL"}], "resourceType": {"resourceTypeGeneral": "Software"}, "rightsList": [{"rights": "cc-by", "rightsURI": "https://creativecommons.org/licenses/by/4.0/"}], "subjects": [{"subject": "C. elegans; Multi-Worm Tracker R scripts;  behavioral analysis; Choreography software output"}], "version": "1.0", "titles": [{"title": "Auxin does not affect a suite of morphological or behavioral phenotypes in two wild-type <i>C. elegans</i> strains"}], "dates": [{"date": "2020-09-30", "dateType": "Accepted"}, {"date": "2020-10-02", "dateType": "Issued"}], "publicationYear": "2020", "publisher": "CaltechDATA", "creators": [{"affiliations": ["Djavad Mowafaghian Centre for Brain Health, University of British Columbia, 2211 Wesbrook Mall, Vancouver, British Columbia V6T 2B5, Canada"], "creatorName": "Troy McDiarmid"}]}
diff --git a/caltechdata_api/1639.xml b/caltechdata_api/1639.xml
@@ -0,0 +1,50 @@
+<?xml version="1.0" encoding="utf-8"?>
+<resource xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns="http://datacite.org/schema/kernel-4" xsi:schemaLocation="http://datacite.org/schema/kernel-4 http://schema.datacite.org/meta/kernel-4/metadata.xsd">
+  <identifier identifierType="DOI">10.22002/D1.1639</identifier>
+    <creators>
+    <creator>
+      <creatorName>Troy McDiarmid</creatorName>
+      <affiliation>Djavad Mowafaghian Centre for Brain Health, University of British Columbia, 2211 Wesbrook Mall, Vancouver, British Columbia V6T 2B5, Canada</affiliation>
+    </creator>
+  </creators>
+  <titles>
+    <title>Auxin does not affect a suite of morphological or behavioral phenotypes in two wild-type &lt;i&gt;C. elegans&lt;/i&gt; strains</title>
+  </titles>
+  <publisher>CaltechDATA</publisher>
+  <publicationYear>2020</publicationYear>
+  <subjects>
+    <subject>C. elegans; Multi-Worm Tracker R scripts;  behavioral analysis; Choreography software output</subject>
+  </subjects>
+  <dates>
+    <date dateType="Accepted">2020-09-30</date>
+    <date dateType="Issued">2020-10-02</date>
+  </dates>
+  <language>eng</language>
+  <resourceType resourceTypeGeneral="Software"/>
+  <alternateIdentifiers>
+    <alternateIdentifier alternateIdentifierType="CaltechDATA_Identifier">1639</alternateIdentifier>
+  </alternateIdentifiers>
+  <relatedIdentifiers>
+    <relatedIdentifier relatedIdentifierType="DOI" relationType="IsPartOf">10.17912/micropub.biology.000307</relatedIdentifier>
+    <relatedIdentifier relatedIdentifierType="URL" relationType="IsIdenticalTo">https://github.com/troymcdiarmid/MWT_Wildtype_Auxin/releases/tag/v1.0</relatedIdentifier>
+  </relatedIdentifiers>
+  <version>1.0</version>
+  <rightsList>
+    <rights rightsURI="https://creativecommons.org/licenses/by/4.0/">cc-by</rights>
+  </rightsList>
+  <descriptions>
+    <description descriptionType="Other">This repo contains Custom R scripts organized and summarized Choreography output files.</description>
+  </descriptions>
+  <fundingReferences>
+    <fundingReference>
+      <funderName>Canadian Institutes of Health Research Doctoral Research Award</funderName>
+    </fundingReference>
+    <fundingReference>
+      <funderName>Natural Sciences and Engineering Research Council</funderName>
+    </fundingReference>
+    <fundingReference>
+      <funderName>Canadian Institutes of Health Research</funderName>
+      <awardNumber>PJT-165947</awardNumber>
+    </fundingReference>
+  </fundingReferences>
+</resource>
diff --git a/caltechdata_api/__init__.py b/caltechdata_api/__init__.py
@@ -3,3 +3,4 @@
 from .customize_schema import customize_schema
 from .decustomize_schema import decustomize_schema
 from .get_metadata import get_metadata
+from .download_file import download_file
diff --git a/caltechdata_api/caltechdata_edit.py b/caltechdata_api/caltechdata_edit.py
@@ -49,7 +49,9 @@ def caltechdata_unembargo(token, ids, production=False):
         return response.text
 
 
-def caltechdata_edit(token, ids, metadata={}, files={}, delete={}, production=False, schema="40"):
+def caltechdata_edit(
+    token, ids, metadata={}, files={}, delete={}, production=False, schema="40"
+):
     """Including files will only replaces files if they have the same name
     The delete option will delete any existing files with a given file extension
     There are more file operations that could be implemented"""
@@ -72,7 +74,9 @@ def caltechdata_edit(token, ids, metadata={}, files={}, delete={}, production=Fa
         api_url = "https://cd-sandbox.tind.io/api/record/"
 
     if metadata:
-        metadata = customize_schema.customize_schema(copy.deepcopy(metadata),schema=schema)
+        metadata = customize_schema.customize_schema(
+            copy.deepcopy(metadata), schema=schema
+        )
 
     for idv in ids:
         metadata["id"] = idv
@@ -135,7 +139,7 @@ def caltechdata_add(token, ids, metadata={}, files={}, production=False, schema=
     headers = {"Authorization": "Bearer %s" % token, "Content-type": "application/json"}
 
     if metadata:
-        metadata = customize_schema.customize_schema(copy.deepcopy(metadata),schema)
+        metadata = customize_schema.customize_schema(copy.deepcopy(metadata), schema)
 
     fjson = {}
 
diff --git a/caltechdata_api/caltechdata_write.py b/caltechdata_api/caltechdata_write.py
@@ -79,7 +79,7 @@ def caltechdata_write(metadata, token, files=[], production=False, schema="40"):
 
     fileinfo = []
 
-    newdata = customize_schema.customize_schema(copy.deepcopy(metadata),schema=schema)
+    newdata = customize_schema.customize_schema(copy.deepcopy(metadata), schema=schema)
 
     if files:
         for f in files:
diff --git a/caltechdata_api/customize_schema.py b/caltechdata_api/customize_schema.py
@@ -82,6 +82,7 @@ def customize_schema_4(json_record):
 
     return json_record
 
+
 def customize_schema_43(json_record):
     json_record = customize_standard(json_record)
     # Extract identifiers and label as DOI or alternativeIdentifiers
@@ -143,7 +144,7 @@ def customize_schema_43(json_record):
                 affiliations = []
                 for aff in a["affiliation"]:
                     affiliations.append(aff["name"])
-                new['affiliation' ] = a['affiliation']
+                new["affiliation"] = a["affiliation"]
                 new["contributorAffiliation"] = affiliations
             new["contributorName"] = c["name"]
             if "contributorType" in c:
@@ -153,20 +154,21 @@ def customize_schema_43(json_record):
             newc.append(new)
         json_record["contributors"] = newc
 
-    #Funding organization
+    # Funding organization
     if "fundingReferences" in json_record:
         for funding in json_record["fundingReferences"]:
-            if 'awardNumber' in funding:
-                funding['awardNumber'] = {'awardNumber':funding['awardNumber']}
+            if "awardNumber" in funding:
+                funding["awardNumber"] = {"awardNumber": funding["awardNumber"]}
 
-    #resourceTypeGeneral
-    typeg = json_record['types']['resourceTypeGeneral']
-    json_record['resourceType'] = {'resourceTypeGeneral':typeg}
+    # resourceTypeGeneral
+    typeg = json_record["types"]["resourceTypeGeneral"]
+    json_record["resourceType"] = {"resourceTypeGeneral": typeg}
 
     print(json_record)
 
     return json_record
 
+
 def customize_standard(json_record):
 
     # Extract subjects to single string
@@ -223,7 +225,7 @@ def customize_standard(json_record):
             d["relevantDateType"] = d.pop("dateType")
         json_record["relevantDates"] = json_record.pop("dates")
     else:
-        json_record["publicationDate"] =  date.today().isoformat()
+        json_record["publicationDate"] = date.today().isoformat()
 
     # license
     if "rightsList" in json_record:
diff --git a/caltechdata_api/decustomize_schema.py b/caltechdata_api/decustomize_schema.py
@@ -161,7 +161,7 @@ def decustomize_standard(json_record, pass_emails, pass_media, pass_owner):
         "brief_title",
         "brief_summary",
         "resource_type",
-        "final_actions"
+        "final_actions",
     ]
     if pass_owner == False:
         others.append("owners")
@@ -181,7 +181,10 @@ def decustomize_schema_43(json_record, pass_emails, pass_media, pass_owner):
     if "doi" in json_record:
         doi = json_record["doi"]
         identifiers.append(
-            {"identifier": json_record["doi"], "identifierType": "DOI",}
+            {
+                "identifier": json_record["doi"],
+                "identifierType": "DOI",
+            }
         )
         del json_record["doi"]
 
@@ -221,9 +224,9 @@ def decustomize_schema_43(json_record, pass_emails, pass_media, pass_owner):
         for a in authors:
             new = {}
             if "authorAffiliation" in a:
-                #Prefer full affiliation block
+                # Prefer full affiliation block
                 if "affiliation" in a:
-                    new["affiliation"] = a["affiliation"] 
+                    new["affiliation"] = a["affiliation"]
                 else:
                     if isinstance(a["authorAffiliation"], list) == False:
                         a["authorAffiliation"] = [a["authorAffiliation"]]
@@ -259,7 +262,7 @@ def decustomize_schema_43(json_record, pass_emails, pass_media, pass_owner):
             new = {}
             if "contributorAffiliation" in c:
                 if "affiliation" in c:
-                    new["affiliation"] = c["affiliation"] 
+                    new["affiliation"] = c["affiliation"]
                 else:
                     if isinstance(c["contributorAffiliation"], list) == False:
                         c["contributorAffiliation"] = [c["contributorAffiliation"]]
diff --git a/caltechdata_api/download_file.py b/caltechdata_api/download_file.py
@@ -0,0 +1,56 @@
+import requests, argparse
+from tqdm import tnrange, tqdm_notebook
+
+
+def download_file(doi, fname=None, media_type=None):
+    """Download a file  listed in the media API for a DataCite DOI"""
+    api_url = "https://api.datacite.org/dois/" + doi + "/media"
+    r = requests.get(api_url).json()
+    data = r["data"]
+    if media_type == None:
+        url = data[0]["attributes"]["url"]
+    else:
+        for media in data:
+            if media["attributes"]["mediaType"] == media_type:
+                url = media["attributes"]
+    r = requests.get(url, stream=True)
+    # Set file name
+    if fname == None:
+        fname = doi.replace("/", "-")
+    # Download file with progress bar
+    if r.status_code == 403:
+        print("File Unavailable")
+    if "content-length" not in r.headers:
+        print("Did not get file")
+    else:
+        with open(fname, "wb") as f:
+            total_length = int(r.headers.get("content-length"))
+            pbar = tnrange(int(total_length / 1024), unit="B")
+            for chunk in r.iter_content(chunk_size=1024):
+                if chunk:
+                    pbar.update()
+                    f.write(chunk)
+        return fname
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description="download_file queries the DaiaCite Media API\
+    and downloads the file associated with a DOI"
+    )
+    parser.add_argument(
+        "dois",
+        nargs="+",
+        help="The DOI for files to be downloaded",
+    )
+    parser.add_argument(
+        "-fname", default=None, help="Username for basic authentication"
+    )
+    parser.add_argument(
+        "-media_type", default=None, help="Password for basic authentication"
+    )
+
+    args = parser.parse_args()
+
+    for doi in args.dois:
+        download_file(doi, args.fname, args.media_type)
diff --git a/caltechdata_api/get_metadata.py b/caltechdata_api/get_metadata.py
@@ -44,7 +44,6 @@ def get_metadata(idv, production=True, auth=None, emails=False, schema="40"):
                 errors = sorted(v.iter_errors(instance), key=lambda e: e.path)
                 for error in errors:
                     print(error.message)
-                exit()
         if schema == "43":
             try:
                 assert schema43.validate(metadata)
@@ -53,7 +52,6 @@ def get_metadata(idv, production=True, auth=None, emails=False, schema="40"):
                 errors = sorted(v.iter_errors(instance), key=lambda e: e.path)
                 for error in errors:
                     print(error.message)
-                exit()
 
     return metadata
 
@@ -75,7 +73,7 @@ def get_metadata(idv, production=True, auth=None, emails=False, schema="40"):
     parser.add_argument("-xml", dest="save_xml", action="store_true")
     parser.add_argument("-auth_user", help="Username for basic authentication")
     parser.add_argument("-auth_pass", help="Password for basic authentication")
-    parser.add_argument("-schema", default = "40",help="Schema Version")
+    parser.add_argument("-schema", default="40", help="Schema Version")
 
     args = parser.parse_args()
 
diff --git a/codemeta.json b/codemeta.json
@@ -6,7 +6,7 @@
   "codeRepository": "https://github.com/caltechlibrary/caltechdata_api",
   "issueTracker": "https://github.com/caltechlibrary/caltechdata_api/issues",
   "license": "https://data.caltech.edu/license",
-  "version": "0.1.6",
+  "version": "0.1.7",
   "author": [
     {
       "@type": "Person",
diff --git a/example_download_and_upload.ipynb b/example_download_and_upload.ipynb
@@ -0,0 +1,96 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from caltechdata_api import download_file, caltechdata_write\n",
+    "import json"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "7e6735a34e5c4448bbeb72920ed7b963",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "HBox(children=(IntProgress(value=0, max=665896), HTML(value='')))"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "#By default will download to file named 10.22002-D1.1945\n",
+    "#Can provide filename of interest using fname option\n",
+    "filen = download_file('10.22002/D1.1945')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Successfully created record https://cd-sandbox.tind.io/records/733.  \n"
+     ]
+    }
+   ],
+   "source": [
+    "#Now write a file to CaltechDATA test instance (cd-sandbox.tind.io)\n",
+    "\n",
+    "token = 'PASTE TOKEN HERE'\n",
+    "\n",
+    "metaf = open('example.json', 'r')\n",
+    "metadata = json.load(metaf)\n",
+    "filen = 'logo.gif'\n",
+    "\n",
+    "production = False\n",
+    "\n",
+    "response = caltechdata_write(metadata, token, filen, production)\n",
+    "print(response)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.4"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/setup.py b/setup.py
@@ -51,7 +51,7 @@ def read(fname):
 
 # What packages are required for this module to be executed?
 REQUIRED = [
-    'requests','datacite'
+    'requests','datacite','tqdm'
 ]
 
 # What packages are optional?

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	+{"alternateIdentifiers": [{"alternateIdentifier": "1639", "alternateIdentifierType": "CaltechDATA_Identifier"}], "descriptions": [{"descriptionType": "Other", "description": "This repo contains Custom R scripts organized and summarized Choreography output files."}], "fundingReferences": [{"funderName": "Canadian Institutes of Health Research Doctoral Research Award"}, {"funderName": "Natural Sciences and Engineering Research Council"}, {"awardNumber": {"awardNumber": "PJT-165947"}, "funderName": "Canadian Institutes of Health Research"}], "language": "eng", "relatedIdentifiers": [{"relatedIdentifier": "10.17912/micropub.biology.000307", "relationType": "IsPartOf", "relatedIdentifierType": "DOI"}, {"relatedIdentifier": "https://github.com/troymcdiarmid/MWT_Wildtype_Auxin/releases/tag/v1.0", "relationType": "IsIdenticalTo", "relatedIdentifierType": "URL"}], "resourceType": {"resourceTypeGeneral": "Software"}, "rightsList": [{"rights": "cc-by", "rightsURI": "https://creativecommons.org/licenses/by/4.0/"}], "subjects": [{"subject": "C. elegans; Multi-Worm Tracker R scripts; behavioral analysis; Choreography software output"}], "version": "1.0", "titles": [{"title": "Auxin does not affect a suite of morphological or behavioral phenotypes in two wild-type <i>C. elegans</i> strains"}], "dates": [{"date": "2020-09-30", "dateType": "Accepted"}, {"date": "2020-10-02", "dateType": "Issued"}], "publicationYear": "2020", "publisher": "CaltechDATA", "creators": [{"affiliations": ["Djavad Mowafaghian Centre for Brain Health, University of British Columbia, 2211 Wesbrook Mall, Vancouver, British Columbia V6T 2B5, Canada"], "creatorName": "Troy McDiarmid"}]}
Original file line number	Diff line number	Diff line change
`@@ -6,7 +6,7 @@`
`6`	`6`	`"codeRepository": "https://github.com/caltechlibrary/caltechdata_api",`
`7`	`7`	`"issueTracker": "https://github.com/caltechlibrary/caltechdata_api/issues",`
`8`	`8`	`"license": "https://data.caltech.edu/license",`
`9`		`- "version": "0.1.6",`
	`9`	`+ "version": "0.1.7",`
`10`	`10`	`"author": [`
`11`	`11`	`{`
`12`	`12`	`"@type": "Person",`