Skip to content

Commit f47f046

Browse files
committed
Initial import
1 parent 345b72c commit f47f046

File tree

6 files changed

+222
-0
lines changed

6 files changed

+222
-0
lines changed

.gitignore

+4
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
**/__pycache__
2+
downloads/*.csv
3+
downloads/*.json
4+
config.py

config.py.template

+21
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
#######################################
# Recogito access information
#######################################
RECOGITO_URL = 'https://recogito.pelagios.org'

# Your user credentials
RECOGITO_USER = 'test'
RECOGITO_PW = 'test'

#######################################
# Config for 'download' script
#######################################

# If you want to download the contents of a folder in your Recogito
# workspace, set the folder ID here as a string, for example
# 'f65b8f0f-d510-42b2-945f-d28f92743c14'.
# Set to False to download from the workspace root.
DOWNLOAD_FOLDER = False

# Destination folder (on your machine) where downloaded annotations
# should be stored
DOWNLOAD_ANNOTATIONS_TO = './downloads'

download.py

+55
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
import config as cfg
2+
import json
3+
import logging
4+
import time
5+
6+
from recogito.recogito_client import RecogitoAPI
7+
8+
# Lower the root logger threshold to INFO so the progress messages emitted
# through logging.info() by this script are not filtered out.
root = logging.getLogger()
root.setLevel(logging.INFO)
10+
11+
#####
12+
# Stores the JSON-LD annotations to a file named according to the document title
13+
#####
14+
def store_annotations_json(document_title, annotations):
    """Write ``annotations`` as indented JSON into the download folder.

    The target path is ``<cfg.DOWNLOAD_ANNOTATIONS_TO>/<document_title>.json``;
    an existing file with that name is overwritten.

    Args:
        document_title: Used verbatim as the file name (without extension).
        annotations: Any JSON-serialisable object.
    """
    target = f'{cfg.DOWNLOAD_ANNOTATIONS_TO}/{document_title}.json'

    # Explicit encoding so the result does not depend on the platform default.
    with open(target, 'w', encoding='utf-8') as outfile:
        json.dump(annotations, outfile, indent=2)
17+
18+
#####
19+
# Stores the CSV annotations to a file named according to the document title
20+
#####
21+
def store_annotations_csv(document_title, annotations):
    """Write the CSV text ``annotations`` into the download folder.

    The target path is ``<cfg.DOWNLOAD_ANNOTATIONS_TO>/<document_title>.csv``;
    an existing file with that name is overwritten.

    Args:
        document_title: Used verbatim as the file name (without extension).
        annotations: The CSV content as a ``str``.
    """
    target = f'{cfg.DOWNLOAD_ANNOTATIONS_TO}/{document_title}.csv'

    # encoding='utf-8': the text may contain non-ASCII characters that the
    # platform default encoding cannot represent.
    # newline='': write the line endings exactly as they appear in the text
    # instead of letting text mode translate them.
    with open(target, 'w', encoding='utf-8', newline='') as outfile:
        outfile.write(annotations)
24+
25+
###############################
26+
#
27+
# Download process starts here
28+
#
29+
###############################
30+
try:
    client = RecogitoAPI.login({
        'username': cfg.RECOGITO_USER,
        'password': cfg.RECOGITO_PW,
        'server_url': cfg.RECOGITO_URL,
    })

    # Keep only the directory entries whose type is 'DOCUMENT'.
    items = [
        i for i in client.list_directory(cfg.DOWNLOAD_FOLDER)['items']
        if i['type'] == 'DOCUMENT'
    ]
    logging.info(f'Downloading data for {len(items)} documents')

    for item in items:
        doc_id = item['id']
        logging.info(f'Downloading data for {item["title"]}')

        # Fetch both export formats for the same document.
        annotations_json = client.get_annotations(doc_id)
        annotations_csv = client.get_annotations(doc_id, 'csv')

        logging.info(f' Document has {len(annotations_json)} annotations')

        store_annotations_json(item['title'], annotations_json)
        store_annotations_csv(item['title'], annotations_csv)

        # Short pause between documents (presumably to go easy on the
        # server; confirm before removing).
        time.sleep(0.2)

except Exception as e:
    # Top-level boundary of the script: report the failure, including the
    # traceback, instead of only the exception message.
    logging.exception(f'Error: {str(e)}')

downloads/.gitkeep

Whitespace-only changes.

recogito/__init__.py

Whitespace-only changes.

recogito/recogito_client.py

+142
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,142 @@
1+
import logging
2+
import requests
3+
4+
class RecogitoAPI:
    """Client for a Recogito server, bound to one HTTP session.

    Instances are normally obtained through :meth:`login`. Every request is
    sent through ``self.session`` against ``self.config['server_url']``.
    """

    def __init__(self, config, session):
        """Store the config dict and the session used for all requests."""
        self.config = config
        self.session = session

    @classmethod
    def login(cls, conf):
        """Log in and return a client bound to the new session.

        Config object needs the following props::

            {
              'username': <recogito username>,
              'password': <recogito password>,
              'server_url': <recogito server base URL>
            }

        Raises:
            Exception: If the login request does not return status 200.
        """
        logging.info(f'Logging in as: {conf["username"]}')

        payload = {'username': conf['username'], 'password': conf['password']}

        session = requests.Session()
        response = session.post(f'{conf["server_url"]}/login', data=payload)

        if response.status_code != 200:
            # Nobody will use this session; release it before failing.
            session.close()
            raise Exception(f'Login failed with code: {response.status_code}')

        return cls(conf, session)

    def list_directory(self, folder=None):
        """List the user directory (root or folder with the given ID).

        Any falsy ``folder`` (``None``, ``False``, ``''``) selects the root.

        Returns:
            The decoded JSON body of the response.
        """
        root_url = f'{self.config["server_url"]}/api/directory/my'
        url = f'{root_url}/{folder}' if folder else root_url

        return self.session.get(url).json()

    def upload_document(self, document, folder=None):
        """Upload one document (with multiple files) to the workspace.

        Shape of the document object:
        ``{ 'title': <title>, 'files': [ <list of filepaths> ] }``

        Args:
            document: The document description, see above.
            folder: Optional folder ID passed to the finalize request.

        Returns:
            The ``document_id`` reported by the finalize response.

        Raises:
            Exception: If a file upload or the finalize request does not
                return status 200.
        """
        base_url = f'{self.config["server_url"]}/my/upload'

        def init_new_document(title):
            # The title is sent as a multipart form field (no filename).
            response = self.session.post(base_url, files={'title': (None, title)})
            return response.json()['id']

        def upload_file(filepath, upload_id):
            # Context manager so the handle is closed once the request is done.
            with open(filepath, 'rb') as handle:
                return self.session.post(
                    f'{base_url}/{upload_id}/file', files={'file': handle})

        def finalize_document(upload_id):
            if folder:
                return self.session.post(
                    f'{base_url}/{upload_id}/finalize?folder={folder}')
            return self.session.post(f'{base_url}/{upload_id}/finalize')

        logging.info(f'Initiating upload: {document["title"]}')

        upload_id = init_new_document(document['title'])

        for f in document['files']:
            response = upload_file(f, upload_id)

            if response.status_code != 200:
                raise Exception(f'Upload failed with code: {response.status_code}')

        response = finalize_document(upload_id)

        if response.status_code != 200:
            raise Exception(
                f'Could not finalize upload - failed with code: {response.status_code}')

        doc_id = response.json()['document_id']
        logging.info(f'Upload successful: {doc_id}')

        return doc_id

    def share_document(self, doc_id, users):
        """Share the document with the given ID with the given user accounts.

        Each user is added as a collaborator with access level ``WRITE``.

        Raises:
            Exception: On the first user for which the request does not
                return status 200; remaining users are not processed.
        """
        url = f'{self.config["server_url"]}/document/{doc_id}/settings/collaborator'

        for username in users:
            response = self.session.put(url, json={
                'collaborator': username, 'access_level': 'WRITE'
            })

            if response.status_code != 200:
                raise Exception(
                    f'Could not share with user "{username}" - failed with code: {response.status_code}')

            logging.info(f'Shared {doc_id} with user "{username}"')

    def delete_document(self, doc_id):
        """Delete the document with the given ID.

        Not reversible, use at your own risk.

        Raises:
            Exception: If the request does not return status 200.
        """
        response = self.session.delete(
            f'{self.config["server_url"]}/api/document/{doc_id}')

        if response.status_code != 200:
            raise Exception(
                f'Error deleting document {doc_id} - failed with code: {response.status_code}')

    def list_collaborators(self, doc_id):
        """Download the list of collaborators on this document.

        Returns:
            The decoded JSON body of the response.
        """
        return self.session.get(
            f'{self.config["server_url"]}/document/{doc_id}/settings/collaborators').json()

    def set_tag_vocab(self, doc_id, terms):
        """Set a predefined tagging vocabulary for the document with the given ID.

        Args:
            doc_id: ID of the document.
            terms: The vocabulary, sent as the JSON body of the request.

        Raises:
            Exception: If the request does not return status 200.
        """
        response = self.session.post(
            f'{self.config["server_url"]}/document/{doc_id}/settings/prefs/tag-vocab',
            json=terms)

        if response.status_code != 200:
            raise Exception(
                f'Could not set tag vocab for {doc_id} - failed with code: {response.status_code}')

        logging.info(f'Set tag vocab for {doc_id}')

    def get_annotations(self, doc_id, format='json-ld'):
        """Download the annotations for the given document.

        Args:
            doc_id: ID of the document.
            format: ``'json-ld'`` (default) or ``'csv'``.

        Returns:
            The decoded JSON body for ``'json-ld'``; the response content
            decoded as UTF-8 text for ``'csv'``.

        Raises:
            ValueError: If ``format`` is neither ``'json-ld'`` nor ``'csv'``.
        """
        base_url = f'{self.config["server_url"]}/document/{doc_id}/downloads/annotations'

        if format == 'json-ld':
            return self.session.get(f'{base_url}/jsonld').json()
        if format == 'csv':
            return self.session.get(f'{base_url}/csv').content.decode('utf-8')

        # Fail loudly instead of silently handing None back to the caller.
        raise ValueError(f'Unsupported annotation format: {format!r}')

    def download_backup(self, doc_id, destination_file):
        """Download a backup of the document to the given filepath.

        The response is streamed to ``destination_file`` in 8192-byte chunks.
        ``raise_for_status()`` is called on the response before the file is
        opened.
        """
        download_url = f'{self.config["server_url"]}/document/{doc_id}/settings/zip-export'

        with self.session.get(download_url, stream=True) as r:
            r.raise_for_status()

            with open(destination_file, 'wb') as f:
                for chunk in r.iter_content(chunk_size=8192):
                    f.write(chunk)

0 commit comments

Comments
 (0)