
Add ThermoML Archive dataset #118

Open · wants to merge 4 commits into main
16 changes: 16 additions & 0 deletions data/thermoml_archive/meta.yaml
@@ -0,0 +1,16 @@
---
name: thermoml_archive
description: ThermoML is an XML-based IUPAC standard for the storage and exchange of experimental thermophysical and thermochemical property data. The ThermoML
  archive is a subset of Thermodynamics Research Center (TRC) data holdings corresponding to cooperation between NIST TRC and five journals.
targets:
identifiers: []
license: https://www.nist.gov/open/license
num_points:
bibtex:
- "@article{Riccardi2022,\n title = {Towards improved {{FAIRness}} of the {{ThermoML Archive}}},\n author = {Riccardi, Demian and Trautt, Zachary\
\ and Bazyleva, Ala and Paulechka, Eugene and Diky, Vladimir and Magee, Joseph W. and Kazakov, Andrei F. and Townsend, Scott A. and Muzny, Chris D.},\n\
\ year = {2022},\n journal = {Journal of Computational Chemistry},\n volume = {43},\n number = {12},\n pages = {879--887},\n doi\
\ = {10.1002/jcc.26842},\n langid = {english}\n }"
templates:
fields:
links:
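
For reference, a minimal sketch of how this meta.yaml could be loaded and validated against the chemnlp Dataset model used in transform.py below. This assumes Dataset is a pydantic-style model (which the meta.dict() call in transform.py suggests); the relative path is an assumption, and validation may well fail while fields such as num_points and the identifier ids are still unfilled.

import pathlib

import yaml

from chemnlp.data_val.model import Dataset

# Path is an assumption: run this from the repository root
meta_path = pathlib.Path("data/thermoml_archive/meta.yaml")

with open(meta_path) as f:
    raw = yaml.safe_load(f)

# Constructing the model validates the parsed mapping (pydantic-style assumption);
# this raises if required fields are missing or malformed.
meta = Dataset(**raw)
print(meta.name)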
3 changes: 3 additions & 0 deletions data/thermoml_archive/requirements.txt
@@ -0,0 +1,3 @@
git+https://github.com/sustainable-processes/thermopyl
tqdm
pyyaml
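
A tiny smoke test that the pinned thermopyl fork exposes the Parser class imported in transform.py; this assumes the requirements above have already been installed into the active environment.

# Assumes `pip install -r requirements.txt` has been run in this environment
from thermopyl import Parser

print(Parser)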
151 changes: 151 additions & 0 deletions data/thermoml_archive/transform.py
@@ -0,0 +1,151 @@
import hashlib
import pathlib
import tarfile
import warnings
from typing import BinaryIO

import pandas as pd
import requests
import tqdm
import yaml
from thermopyl import Parser as ThermoPylParser

from chemnlp.data_val.model import Dataset


def get_and_transform_data():
"""Downloads the archived version of ThermoML, extracts it and
parses the provided XML files with thermopyl to construct a flat csv.

"""

# get raw data
fname = "ThermoML.v2020-09-30.tgz"
download_path = pathlib.Path(__file__).parent / fname
remote_data_path = f"https://data.nist.gov/od/ds/mds2-2422/{fname}"
sha256_checksum = "231161b5e443dc1ae0e5da8429d86a88474cb722016e5b790817bb31c58d7ec2"
final_csv_path = pathlib.Path(__file__).parent / "thermoml_archive.csv"
final_expected_csv_checksum = (
"fc296f47c1877b6ace72f7aa4a80c489b80d0eb25ea3a59885d067e554378b08"
)

if not download_path.exists():
data = requests.get(remote_data_path)
with open(download_path, "wb") as f:
for chunk in tqdm.tqdm(
data.iter_content(chunk_size=8192), desc="Downloading archive"
):
f.write(chunk)

    # Check the downloaded archive against the expected checksum
    with open(download_path, "rb") as f:
        received_hash = _sha256_chunked_file_digest(f)

    if received_hash != sha256_checksum:
        raise RuntimeError(
            "Downloaded file did not match expected checksum -- "
            "either a new version has been released or something has gone wrong!\n"
            f"Expected: {sha256_checksum}\n"
            f"Received: {received_hash}"
        )

    # If a CSV already exists, verify its checksum: reuse it if it matches,
    # otherwise move it aside and recreate it
    if final_csv_path.exists():
        with open(final_csv_path, "rb") as f:
            csv_sha256_checksum = _sha256_chunked_file_digest(f)

        if csv_sha256_checksum != final_expected_csv_checksum:
            warnings.warn(
                "Old CSV file did not match expected checksum, will try to recreate."
            )
            final_csv_path.rename(final_csv_path.with_suffix(".old.csv"))
        else:
            print(f"Correct CSV file already available at {final_csv_path}, exiting...")
            return

    # Extract the tar.gz archive alongside this script
    with tarfile.open(download_path, "r:*") as tar:
        tar.extractall(pathlib.Path(__file__).parent)

    # Publisher DOI prefixes (10.1007 Springer, 10.1016 Elsevier, 10.1021 ACS)
    # under which the extracted XML files are grouped
    root_dois = ("10.1007", "10.1016", "10.1021")

    num_points = 0
    num_failed = 0
    # Parse each XML file with thermopyl and append the resulting rows to the
    # flat CSV; files that fail to parse are counted but otherwise skipped
    for doi in root_dois:
        for path in tqdm.tqdm(
            (pathlib.Path(__file__).parent / doi).glob("*.xml"),
            desc=f"Looping over files in {doi}",
        ):
            try:
                # The parser takes the path directly, so no separate open() is needed
                pd.DataFrame(ThermoPylParser(path).parse()).to_csv(
                    final_csv_path, mode="a"
                )
                num_points += 1
            except Exception:
                num_failed += 1

    print(f"Ingested {num_points} files with {num_failed} failures.")

    with open(final_csv_path, "rb") as f:
        csv_hash = _sha256_chunked_file_digest(f)

    if csv_hash != final_expected_csv_checksum:
        warnings.warn(
            "Final CSV file did not match expected checksum!\n"
            f"Expected: {final_expected_csv_checksum}\n"
            f"Received: {csv_hash}"
        )

    # create metadata
    meta = Dataset(
        **{
            "name": "thermoml_archive",
            "description": "ThermoML is an XML-based IUPAC standard for the storage and exchange of experimental thermophysical and thermochemical property data. The ThermoML archive is a subset of Thermodynamics Research Center (TRC) data holdings corresponding to cooperation between NIST TRC and five journals.", # noqa
            "identifiers": [
                {
                    "id": "",

Review comment: is it ok to not provide an id value?

Contributor Author: No, this is unfinished (as mentioned in the PR description); this is the main bit that needs feedback/further discussion.

"type": "inchi",
},
{
"id": "",
"type": "inchikey",
},
],
"license": "https://www.nist.gov/open/license",
"links": [
{
"url": "https://doi.org/10.18434/mds2-2422",
"description": "data publication",
},
{
"url": "https://www.nist.gov/publications/towards-improved-fairness-thermoml-archive",
"description": "NIST publication description",
},
{
"url": "https://trc.nist.gov/ThermoML",
"description": "Live database hosted at NIST Thermodynamics Research Center",
},
],
"num_points": num_points,
"bibtex": [
"@article{Riccardi2022,title = {Towards improved {{FAIRness}} of the {{ThermoML Archive}}},author = {Riccardi, Demian and Trautt, Zachary and Bazyleva, Ala and Paulechka, Eugene and Diky, Vladimir and Magee, Joseph W. and Kazakov, Andrei F. and Townsend, Scott A. and Muzny, Chris D.},year = {2022},journal = {Journal of Computational Chemistry},volume = {43},number = {12},pages = {879--887},doi = {10.1002/jcc.26842},langid = {english}}", # noqa
],
}
)
with open("meta.yaml", "w") as f:
yaml.dump(meta.dict(), f, sort_keys=False)


def _sha256_chunked_file_digest(fp: BinaryIO) -> str:
"""Compute the SHA256 digest of a file in chunks."""
sha256 = hashlib.sha256()
for chunk in tqdm.tqdm(iter(lambda: fp.read(8192), b""), desc="Checking hash"):
sha256.update(chunk)

return sha256.hexdigest()


if __name__ == "__main__":
    get_and_transform_data()
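
Once transform.py has run, a quick (hypothetical) sanity check on the generated CSV might look like the sketch below. Because the transform appends with mode="a" and the default header, repeated header rows can appear in the file, and rows from different XML files may contribute different columns, so this is only a rough peek at shape and column names and may need adjusting.

import pandas as pd

# Path matches final_csv_path in transform.py when run from data/thermoml_archive
df = pd.read_csv("thermoml_archive.csv", low_memory=False)

# Rough overview only: repeated header rows from the appended writes are
# counted as data rows here.
print(df.shape)
print(df.columns.tolist())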