Initial import

rsimon · rsimon · commit f47f0461776b · 2023-01-15T17:54:29.000+01:00
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,4 @@
+**/__pycache__
+downloads/*.csv
+downloads/*.json
+config.py
diff --git a/config.py.template b/config.py.template
@@ -0,0 +1,21 @@
+#######################################
+# Recogito access information
+#######################################
+RECOGITO_URL = 'https://recogito.pelagios.org'
+
+# Your user credentials
+RECOGITO_USER = 'test'
+RECOGITO_PW = 'test'
+
+#######################################
+# Config for 'download' script
+#######################################
+
+# If you want to download the contents of a 
+# folder in your Recogito workspace, set the folder ID here.
+# Set to False to download from the workspace root
+DOWNLOAD_FOLDER = False # 'f65b8f0f-d510-42b2-945f-d28f92743c14'
+
+# Destination folder (on your machine) where downloaded annotations
+# should be stored
+DOWNLOAD_ANNOTATIONS_TO = './downloads'
diff --git a/download.py b/download.py
@@ -0,0 +1,55 @@
+import config as cfg
+import json
+import logging 
+import time
+
+from recogito.recogito_client import RecogitoAPI
+
+root = logging.getLogger() # no name given: this is the root logger, which the module-level logging.info() calls use
+root.setLevel(logging.INFO)
+
+#####
+# Writes the JSON-LD annotations (2-space indented) to '<document title>.json' in the download folder
+#####
+def store_annotations_json(document_title, annotations):
+  with open(f'{cfg.DOWNLOAD_ANNOTATIONS_TO}/{document_title}.json', mode='w') as json_file:
+    json.dump(obj=annotations, fp=json_file, indent=2)
+
+#####
+# Stores the CSV annotations as UTF-8 (line endings left untouched) to a file named according to the document title
+#####
+def store_annotations_csv(document_title, annotations):
+  with open(f'{cfg.DOWNLOAD_ANNOTATIONS_TO}/{document_title}.csv', 'w', encoding='utf-8', newline='') as outfile:
+    outfile.write(annotations)
+
+###############################
+#
+# Download process starts here
+#
+###############################
+try:
+  client = RecogitoAPI.login({
+    'username': cfg.RECOGITO_USER,
+    'password': cfg.RECOGITO_PW,
+    'server_url': cfg.RECOGITO_URL
+  })
+
+  items = [ i for i in client.list_directory(cfg.DOWNLOAD_FOLDER)['items'] if i['type'] == 'DOCUMENT' ]
+  logging.info(f'Downloading data for {len(items)} documents')
+
+  for item in items:
+    doc_id = item['id']
+    logging.info(f'Downloading data for {item["title"]}')
+    # The title becomes a file name, so path separators must not reach open()
+    file_title = item['title'].replace('/', '_').replace('\\', '_')
+
+    annotations_json = client.get_annotations(doc_id)
+    annotations_csv = client.get_annotations(doc_id, 'csv')
+    logging.info(f'  Document has {len(annotations_json)} annotations')
+
+    store_annotations_json(file_title, annotations_json)
+    store_annotations_csv(file_title, annotations_csv)
+    time.sleep(0.2)
+
+except Exception as e:
+  logging.exception(f'Error: {str(e)}') # also records the traceback, which str(e) alone drops
diff --git a/downloads/.gitkeep b/downloads/.gitkeep
diff --git a/recogito/__init__.py b/recogito/__init__.py
diff --git a/recogito/recogito_client.py b/recogito/recogito_client.py
@@ -0,0 +1,142 @@
+import logging
+import requests
+
+class RecogitoAPI:
+  """Client for the Recogito HTTP API; every request goes through the session handed to __init__"""
+
+  def __init__(self, config, session):
+    self.config = config    # dict; the methods below read config['server_url']
+    self.session = session  # session object used for every request (login() passes a requests.Session)
+
+  @classmethod
+  def login(cls, conf):
+    """
+    Logs in and returns a RecogitoAPI instance. Config object needs the following props
+
+    {
+      'username': <recogito username>,
+      'password': <recogito password>,
+      'server_url': <recogito server base URL>
+    }
+
+    Raises Exception if the login request is not answered with HTTP 200.
+    """
+    logging.info(f'Logging in as: {conf["username"]}')
+
+    payload = { 'username': conf['username'], 'password': conf['password'] }
+
+    session = requests.Session()
+    response = session.post(f'{conf["server_url"]}/login', data=payload)
+
+    if response.status_code == 200:
+      return cls(conf, session)
+    else:
+      raise Exception(f'Login failed with code: {response.status_code}')
+
+  def list_directory(self, folder = None):
+    """Lists the user directory (root or folder with the given ID); returns the decoded JSON response"""
+    url = f'{self.config["server_url"]}/api/directory/my/{folder}' \
+      if (folder) else f'{self.config["server_url"]}/api/directory/my'
+
+    return self.session.get(url).json()
+
+  def upload_document(self, document, folder = None):
+    """
+    Uploads one document (with multiple files) to the workspace and returns its document ID.
+    Shape of the document object: { 'title': <title>, 'files': [ <list of filepaths> ] }
+    Raises Exception if a file upload or the finalize step is not answered with HTTP 200.
+    """
+
+    def init_new_document(title):
+      response = self.session.post(f'{self.config["server_url"]}/my/upload', files={ 'title': (None, title) })
+      return response.json()['id']
+
+    def upload_file(filepath, upload_id):
+      # The with-block closes the file handle once the request has been sent
+      with open(filepath, 'rb') as upload:
+        return self.session.post(f'{self.config["server_url"]}/my/upload/{upload_id}/file', files={ 'file': upload })
+
+    def finalize_document(upload_id):
+      if (folder):
+        return self.session.post(f'{self.config["server_url"]}/my/upload/{upload_id}/finalize?folder={folder}')
+      else:
+        return self.session.post(f'{self.config["server_url"]}/my/upload/{upload_id}/finalize')
+
+    logging.info(f'Initiating upload: {document["title"]}')
+
+    upload_id = init_new_document(document['title'])
+
+    for f in document['files']:
+      response = upload_file(f, upload_id)
+
+      if response.status_code != 200:
+        raise Exception(f'Upload failed with code: {response.status_code}')
+
+    response = finalize_document(upload_id)
+
+    if response.status_code != 200:
+      raise Exception(f'Could not finalize upload - failed with code: {response.status_code}')
+
+    doc_id = response.json()['document_id']
+    logging.info(f'Upload successful: {doc_id}')
+
+    return doc_id
+
+  def share_document(self, doc_id, users):
+    """
+    Shares the document with the given ID with the given user accounts (access level WRITE).
+    Raises Exception for the first user whose request is not answered with HTTP 200.
+    """
+    for username in users:
+      response = self.session.put(f'{self.config["server_url"]}/document/{doc_id}/settings/collaborator', json={
+        'collaborator': username, 'access_level': 'WRITE'
+      })
+
+      if response.status_code != 200:
+        raise Exception(f'Could not share with user "{username}" - failed with code: {response.status_code}')
+      else:
+        logging.info(f'Shared {doc_id} with user "{username}"')
+
+  def delete_document(self, doc_id):
+    """Deletes the document with the given ID - not reversible, use at your own risk"""
+    response = self.session.delete(f'{self.config["server_url"]}/api/document/{doc_id}')
+
+    if response.status_code != 200:
+      raise Exception(f'Error deleting document {doc_id} - failed with code: {response.status_code}')
+
+  def list_collaborators(self, doc_id):
+    """Download list of collaborators on this document (decoded JSON response)"""
+    return self.session.get(f'{self.config["server_url"]}/document/{doc_id}/settings/collaborators').json()
+
+  def set_tag_vocab(self, doc_id, terms):
+    """Sets a predefined tagging vocabulary for the document with the given ID"""
+    response = self.session.post(f'{self.config["server_url"]}/document/{doc_id}/settings/prefs/tag-vocab', json=terms)
+
+    if response.status_code != 200:
+      raise Exception(f'Could not set tag vocab for {doc_id} - failed with code: {response.status_code}')
+    else:
+      logging.info(f'Set tag vocab for {doc_id}')
+
+  def get_annotations(self, doc_id, format = 'json-ld'):
+    """
+    Download the annotations for the given document: decoded JSON for format 'json-ld'
+    (the default), a UTF-8 decoded string for format 'csv'.
+    Raises ValueError for any other format instead of silently returning None.
+    """
+    if (format == 'json-ld'):
+      return self.session.get(f'{self.config["server_url"]}/document/{doc_id}/downloads/annotations/jsonld').json()
+    elif (format == 'csv'):
+      return self.session.get(f'{self.config["server_url"]}/document/{doc_id}/downloads/annotations/csv').content.decode('utf-8')
+    else:
+      raise ValueError(f'Unsupported annotation format: {format}')
+
+  def download_backup(self, doc_id, destination_file):
+    """Download a backup of the document to the given filepath; raises on an HTTP error status"""
+    download_url = f'{self.config["server_url"]}/document/{doc_id}/settings/zip-export'
+
+    with self.session.get(download_url, stream=True) as r:
+      r.raise_for_status()
+
+      with open(destination_file, 'wb') as f:
+        for chunk in r.iter_content(chunk_size=8192):
+          f.write(chunk)