diff --git a/.github/workflows/csv2readme.py b/.github/workflows/csv2readme.py new file mode 100644 index 0000000..4c08262 --- /dev/null +++ b/.github/workflows/csv2readme.py @@ -0,0 +1,89 @@ +import os +import csv +import textwrap +import datetime + +def update_readme(): + csv_dir = '.github/excel2csv' + readme_path = 'README.md' + + if not os.path.exists(csv_dir): + print(f"Directory {csv_dir} does not exist.") + return + + files = [] + names = [] + emails = [] + comments = [] + line_counts = [] + + for filename in os.listdir(csv_dir): + if filename.endswith('.csv'): + with open(os.path.join(csv_dir, filename), 'r') as csvfile: + csvreader = csv.reader(csvfile) + current_name = '' + current_email = '' + current_comment = '' + line_count = 0 + for i, row in enumerate(csvreader): + if i == 0: + current_name = row[1].strip() # Extract Name + elif i == 1: + current_email = row[1].strip() # Extract Email + elif i == 4: + current_comment = row[1].strip() # Extract Comment + # Count lines starting from line 8 + if i >= 7: + line_count += 1 + # Append the extracted data for the current file to the lists + files.append(filename) + names.append(current_name) + emails.append(current_email) + comments.append(current_comment) + line_counts.append(line_count) + + if not files: + print("No CSV files found. Skipping README update.") + return + + readme = '' + + if files: + # Writes README.md preamble + readme += textwrap.dedent(f''' + ## This Dataset Contributions + + **Name:** {' / '.join(set(names))} +
+ **Email:** {' / '.join(set(emails))} + ''') + + for i, comment in enumerate(comments): + readme += textwrap.dedent(f''' + ``` + File: {files[i]} + Datapoints: {line_counts[i]} + Comment: {comment} + ``` + ''') + + readme += textwrap.dedent(f''' + **Last time updated:** {datetime.datetime.now().strftime("%m-%d-%Y %I:%M%p").lower()} + ''') + + readme += textwrap.dedent(''' + ## The ULTERA Database + This template repository was developed for contributing to the [**ULTERA Database**](https://ultera.org) carried under the [**DOE ARPA-E ULTIMATE program**](https://arpa-e.energy.gov/?q=arpa-e-programs/ultimate) that aims to develop a new generation of materials for turbine blades in gas turbines and related applications. + + The main scope of this dataset is collecting data on compositionally complex alloys (CCAs), also known as high entropy alloys (HEAs) and multi-principle-element alloys (MPEAs), with extra attention given to (1) high-temperature (refractory) mechanical data, (2) phases present under different processing conditions. Although low-entropy alloys (incl. binaries) are typically not presented to the end-user (or counted in statistics), some are present and used in ML efforts; thus **all high-quality alloy data contributions are welcome!** + + For further information, please visit the [ULTERA-contribute](https://github.com/PhasesResearchLab/ULTERA-contribute/) repository. 
+ ''') + + with open(readme_path, 'w') as readme_file: + readme_file.write(readme) + + print(f"README.md has been updated with the latest contributions.") + +if __name__ == '__main__': + update_readme() \ No newline at end of file diff --git a/pyqalloy-contribute/pyqalloy-contribute/excel2csv.py b/.github/workflows/excel2csv.py similarity index 71% rename from pyqalloy-contribute/pyqalloy-contribute/excel2csv.py rename to .github/workflows/excel2csv.py index f43396b..04dc0d6 100644 --- a/pyqalloy-contribute/pyqalloy-contribute/excel2csv.py +++ b/.github/workflows/excel2csv.py @@ -1,83 +1,86 @@ -# %% -import pandas as pd -import sys -import json -import os - - -def convert(datasheet: str): - '''This function converts an PyQAlloy-compliant Excel datasheet into a CSV file for the purpose of - tracking changes in the data collection and curation, while preserving the original template/datasheet - file along with its style and formatting. The CSV file is named after the original datasheet file, with - the extension changed to .csv. The metadata is stored in the first few lines of the CSV file, and the - data is stored in the rest of the file. - - Args: - datasheet: Path to PyQAlloy-compliant Excel datasheet file. 
- ''' - - # Import metadata - print('Reading the metadata.') - metaDF = pd.read_excel(datasheet, - usecols="A:F", - nrows=4) - meta = metaDF.to_json(orient="split") - metaParsed = json.loads(meta, strict=False)['data'] - - # Format metadata into a dictionary - metaData = { - 'Name': metaParsed[0][1], - 'Email': metaParsed[1][1], - 'Direct Fetched': metaParsed[2][1], - 'Hand Fetched': metaParsed[3][1], - 'Comment': metaParsed[0][5] - } - - # Logging progress into a CSV table - dataFileName = datasheet.replace('.xlsx', '').replace('.xls', '') - - # Import data - print('Importing data.') - df2 = pd.read_excel(datasheet, - usecols="A:N", - nrows=20000, - skiprows=8) - # Convert the dataset - parsed = df2.to_json(orient="split") - labels = json.loads(parsed, strict=False)['columns'] - data = json.loads(parsed, strict=False)['data'] - - print('Imported ' + str(len(data)) + ' datapoints.') - - with open(dataFileName + '.csv', 'w+') as outFile: - # Write the metadata - for line, val in metaData.items(): - outFile.write(line + ':,' + str(val) + '\n') - outFile.write('\n') - # Write the data - outFile.write(','.join(labels) + '\n') - for line in data: - outFile.write(','.join(str(val) for val in line) + '\n') - - print('Successfully converted ' + datasheet + ' to ' + dataFileName + '.csv\n') - - -def detectDatasheetsAndConvert(path: str): - '''This function detects all PyQAlloy-compliant Excel datasheets in a directory and converts them into - CSV files. It skips the empty template file (template_v4.xlsx). - - Args: - path: Path to the directory containing PyQAlloy-compliant Excel datasheets. 
- ''' - - for file in os.listdir(path): - if file.endswith('.xlsx'): - if file not in ['template_v4.xlsx', 'template_v4_DatasetExample.xlsx']: - print('Converting ' + file) - convert(path + '/' + file) - else: - print('Skipping ' + file) - - -if __name__ == '__main__': - detectDatasheetsAndConvert(sys.argv[1]) +import pandas as pd +import fnmatch +import sys +import json +import os + +def convert(datasheet: str): + '''This function converts an ULTERA-compliant Excel datasheet into a CSV file for the purpose of + tracking changes in the data collection and curation, while preserving the original template/datasheet + file along with its style and formatting. The CSV file is named after the original datasheet file, with + the extension changed to .csv. The metadata is stored in the first few lines of the CSV file, and the + data is stored in the rest of the file. + + Args: + datasheet: Path to ULTERA-compliant Excel datasheet file. + ''' + + # Import metadata + print('Reading the metadata.') + metaDF = pd.read_excel(datasheet, + usecols="A:F", + nrows=4) + meta = metaDF.to_json(orient="split") + metaParsed = json.loads(meta, strict=False)['data'] + + # Format metadata into a dictionary + metaData = { + 'Name': metaParsed[0][1], + 'Email': metaParsed[1][1], + 'Direct Fetched': metaParsed[2][1], + 'Hand Fetched': metaParsed[3][1], + 'Comment': metaParsed[0][5] + } + + # Logging progress into a CSV table + dataFileName = datasheet.replace('.xlsx', '').replace('.xls', '') + + # Import data + print('Importing data.') + df2 = pd.read_excel(datasheet, + usecols="A:N", + nrows=20000, + skiprows=8) + # Convert the dataset + parsed = df2.to_json(orient="split") + labels = json.loads(parsed, strict=False)['columns'] + data = json.loads(parsed, strict=False)['data'] + + print('Imported ' + str(len(data)) + ' datapoints.') + + # Ensure the directory exists + output_dir = '.github/excel2csv' + os.makedirs(output_dir, exist_ok=True) + + with open(f'{output_dir}/{dataFileName}.csv', 
'w+') as outFile: + # Write the metadata + for line, val in metaData.items(): + outFile.write(line + ':,' + str(val) + '\n') + outFile.write('\n') + # Write the data + outFile.write(','.join(labels) + '\n') + for line in data: + outFile.write(','.join(str(val) for val in line) + '\n') + + print(f'Successfully converted {datasheet} to {output_dir}/{dataFileName}.csv\n') + + +def detectDatasheetsAndConvert(path: str): + '''This function detects all ULTERA-compliant Excel datasheets in a directory and converts them into + CSV files. It skips the empty template file. + + Args: + path: Path to the directory containing ULTERA-compliant Excel datasheets. + ''' + + for file in os.listdir(path): + if file.endswith('.xlsx'): + if not fnmatch.fnmatch(file, 'template*.xlsx'): + print('Converting ' + file) + convert(path + '/' + file) + else: + print('Skipping ' + file) + + +if __name__ == '__main__': + detectDatasheetsAndConvert(sys.argv[1]) \ No newline at end of file diff --git a/.github/workflows/newfork.yml b/.github/workflows/newfork.yml new file mode 100644 index 0000000..c6ceb24 --- /dev/null +++ b/.github/workflows/newfork.yml @@ -0,0 +1,19 @@ +name: Fork Notification +on: fork + +jobs: + create-fork-issue: + runs-on: ubuntu-latest + permissions: + issues: write + steps: + - uses: actions/github-script@v6 + with: + script: | + await github.rest.issues.create({ + owner: context.repo.owner, + repo: context.repo.repo, + title: 'New Contribute Fork Created', + body: `A new fork of this repository has been created by @${context.actor}\n\n@PhasesResearchLab/ultera-maintainers`, + labels: ['new fork'] + }); diff --git a/.github/workflows/excel2csv.yml b/.github/workflows/postcommit.yml similarity index 52% rename from .github/workflows/excel2csv.yml rename to .github/workflows/postcommit.yml index a85477b..5edd7a4 100644 --- a/.github/workflows/excel2csv.yml +++ b/.github/workflows/postcommit.yml @@ -1,32 +1,36 @@ -name: excel2csv - -on: [push] - -jobs: - run: - name: 
excel2csv - runs-on: ubuntu-latest - steps: - - name: Checkout repository - uses: actions/checkout@v3 - - - name: Set up Python 3.10 - uses: actions/setup-python@v4 - with: - python-version: '3.10' - cache: 'pip' - cache-dependency-path: 'pyqalloy-contribute/requirements.txt' - - - name: Install Dependencies - run: | - python -m pip install -r pyqalloy-contribute/requirements.txt - - - name: Run excel2csv - run: | - python pyqalloy-contribute/pyqalloy-contribute/excel2csv.py . - - - name: Commit changes with Add & Commit - uses: EndBug/add-and-commit@v9 - with: - message: '(automatic) excel2csv Action for Data Tracking' - add: '*.csv' \ No newline at end of file +name: postcommit + +on: [push] + +jobs: + run: + name: postcommit + runs-on: ubuntu-latest + steps: + - name: Checkout repository + uses: actions/checkout@v3 + + - name: Set up Python 3.10 + uses: actions/setup-python@v4 + with: + python-version: '3.10' + + - name: Install Dependencies + run: | + python -m pip install pandas openpyxl + + - name: Run excel2csv + run: | + python .github/workflows/excel2csv.py . + + - name: Run csv2readme + run: | + python .github/workflows/csv2readme.py . 
+ + - name: Commit changes with Add & Commit + uses: EndBug/add-and-commit@v9 + with: + message: '(automatic) Action for Data Tracking' + add: | + .github/excel2csv/*.csv + README.md diff --git a/README.md b/README.md index e25e711..d7eef00 100644 --- a/README.md +++ b/README.md @@ -1,99 +1,29 @@ -ULTERA Data Templates Repository: [github.com/PhasesResearchLab/ULTERA-contribute](https://github.com/PhasesResearchLab/ULTERA-contribute) - -## This Dataset - -- _Your name, affiliation, and contact_ -- _Brief description_ -- _Anything else you like to say_ - -## ULTERA Database -This template repository was developed for contributing to the [**ULTERA Database**](https://ultera.org) carried under the -[**DOE ARPA-E ULTIMATE program**](https://arpa-e.energy.gov/?q=arpa-e-programs/ultimate) that -aims to develop a new generation of materials for turbine blades in gas turbines and related -applications. - -The main scope of this dataset is collecting data on compositionally complex alloys (CCAs), also known as high entropy alloys (HEAs) and multi-principle-element alloys (MPEAs), with extra attention given to (1) high-temperature (refractory) mechanical data, (2) phases present under different processing conditions. Although low-entropy alloys (incl. binaries) are typically not presented to the end-user (or counted in statistics), some are present and used in ML efforts; thus **all high-quality alloy data contributions are welcome!** - - - -## How to Contribute? -You pretty much only need to restructure your data into a spreadsheet. **Publishing should take less than brewing a coffee.** Simply: - -1. Fork this repository (button in the top-right corner). Please add a unique identifier (not belonging to any other fork) to the fork's name, i.e., ULTERA-contribute`-yourNameHere`, such as, e.g.: - - `-AlloyDataAMK`, `-amkrajewski`, `-10.20517-jmi.2021.05`, or `-jmi2021adetal`. 
- - Please note that on a personal GitHub account, you can only have a single fork of a repository; thus, if you want to upload data from multiple sources, it is advisable to follow one of the first two examples above. Then, you can keep each source in a separate spreadsheet. - -3. See how the sample template is filled. The fields have short descriptions and examples above them. The `templateSampleFilled_v4.xlsx` contains some filled examples. - -4. Remove the `templateSampleFilled_v4.xlsx` sample file (just to keep things neat) and rename `template_v4.xlsx` to something describing what you are uploading and that will help you remember what is inside, e.g., `refractory_bcc_heas`, `CrMoNiBased_DuctilityAndHardness`, `HardnessCollectionHEA` or, for single-publication data, the DOI `10.20517-jmi.2021.05`. - - _Avoid_ putting the version number or year in the name, as it will make correcting errors in the datasets much more difficult. - - -7. Fill out the spreadsheet with your data. Do not hesitate to [open an issue in this (source) repository](https://github.com/PhasesResearchLab/ULTERA-contribute/issues) in case you have any questions! - -8. (optional/recommended) Enable the `Issues` page for your fork by (1) going to `Settings`, (2) scrolling down to `Features`, and (3) checking the box next to Issues. This will allow others to let you know if they find any problems with your data, or just want to ask questions. - -9. Let us know your data is ready! We will clone your forked repository as a submodule and automatically process the data into the ULTERA through [the pushing meta-repository (github.com/PhasesResearchLab/ULTERA-push)](https://github.com/PhasesResearchLab/ULTERA-push) - -## Fancy Stuff Through Actions - -### (before you start) Enabling GitHub Actions - -All of the functionalities below rely on automated "actions" that happen at predetermined events (e.g. 
Excel2CSV converter will run every time you push a modification to your dataset; either a single commit of a set of them) and **should just work out-of-the box if enabled (easy 30s job)**. You will just wait to see the results in a minute or so or track progress under `Actions` tab in the top GitHub menu. - -To enable them, you will only need to: -1. Go to: Settings > Actions (General) > Actions permissions -2. Select "Allow all Actions and Reusable Workflows" -3. Save and refresh the page -4. Scroll down to "Workflow permissions" and select "Read and write permissions" -5. Save and refresh the page - -Now, you should be good to go! You can verify everything work by going to the aforementioned `Actions` tab in the top GitHub menu. - -### Tracking changes in data while retaining Excel style and functions - -When publishing a dataset, one usually has to make a choice between storing it in either (a) plaintext formats, like TXT, CSV, and JSON, or (b) in one of the rich data formats, like Excel/Word Spreadsheet or table in PDF of a paper. 
- -The first option, **plaintext**, is typically preferred by scientists focused on the data as a training input for modeling efforts (whether covering simple statistics or fancy machine learning models) since it is usually (1) easier to digest by computer programs, (2) doesn't introduce any ambiguities on what is stored in it (all data is plain without, e.g., rounded numbers), (3) every change to the dataset can be tracked by git (either locally or through services like GitHub or GitLab) so that researcher know exact changes between data right now versus let's say 7 months earlier, and perhaps most significantly (4) can reference and compare datasets by commits thus allowing precise statements like _"V13 of the model used additional 5 yield stress data points and 17 hardness data points for alloys #3,27,79 while skipping 2 outliers (#111,112) we identified in May as coming from contaminated experiments"._ - -The **rich format** options like Excel spreadsheets are typically preferred by scientists, as they provide a number of conveniences. They (1) allow styling of the dataset, ranging from, e.g., customizable spreadsheet table column widths to make display much clearer to view, through the possibility of setting small fonts when describing data fields, to the automated highlighting of values that are outside of an accepted range. They also (2) allow one to have one column with data in original units and automatically convert them as needed to different ones on the fly, (3) format the displayed precision to the liking of the researcher or community while retaining full precision, and even (4) store images inside them for reference. However, the rich formats are usually binary and thus cannot be tracked by git (see plaintext advantages) beyond versioning them and commenting changes. This works for small modifications but quickly gets out of hand as the dataset matures and tens or hundreds of changes are made. 
- -**In this contribution repository, we enable both at the same time.** The template you populate with your data is in the Excel Spreadsheet format and can be modified in any way that doesn't change its core structure. You can style it to your liking or even store some images outside of template fields. At the same time, every time you _push_ changes to the repository, an automated [GitHub action]((before-you-start)-Enabling-GitHub-Actions) defined in `excel2csv.yml` will convert all of your templates stored in the main directory of this repository into plain-text CSV files, add them to your git working branch, and, if there are any changes in your data, it will commit them under name `(automatic) excel2csv Action for Data Tracking`. Now, you will be able to both (1) see what has changed at the template modification time and (2) compare between these commits to see how your data was modified long-term. - - - -## I want to contribute in the future, but I'm not ready to make it public yet - -Forking a repository is an elegant one-click solution to clone the templates, make your contributions discoverable, and keep everything up-to-date. One caveat is that GitHub will not allow you to change the visibility of the repository - it will have to be public. It has a number of advantages, like enabling the community to review your data and efficiently communicate issues by simply opening them on the fork; however, we know that some people may want to keep their data private until they are ready to publish it. - -To create a contribution to ULTERA (or any other dataset following the template schema) you will need to _import_ the repository. You can do so by going to the `Create new...` in the top-right corner of GitHub page and selecting `Import Repository`. - -githubimport - -Once the page opens, paste the URL of the original repository: - - https://github.com/PhasesResearchLab/ULTERA-contribute - -Then select your repository name. 
Please follow the `ULTERA-contribute-*******` pattern and try to make the name informative. Lastly, select the visibility you would like to have. Go forward, wait a minute, and refresh the page; you should now see your data repository! - -Now, since it's not a fork, things get a bit more complex since you can't just click a button and synchronize your fork, resolving all the issues on the fly in GitHub. However, _if the modifications you make do not introduce any conflicts_ (keep up-to-date with template when introducing changes), you should be able to just add the public template repository as one of the remotes: - - git remote add public https://github.com/PhasesResearchLab/ULTERA-contribute - -and then, whenever you want to make your repository up-to-date, simply pull changes from ULTERA-contribute - - git pull public main - -and push it to yours: - - git push origin main - -With that, you should be ready to store all of your data and make it public when you are ready. Then just let us know, so we can add it to [the pushing meta-repository (github.com/PhasesResearchLab/ULTERA-push)](https://github.com/PhasesResearchLab/ULTERA-push) - - - - - +## The ULTERA Database +This template repository was developed for contributing to the [**ULTERA Database**](https://ultera.org) carried under the [**DOE ARPA-E ULTIMATE program**](https://arpa-e.energy.gov/?q=arpa-e-programs/ultimate) that aims to develop a new generation of materials for turbine blades in gas turbines and related applications. + +The main scope of this dataset is collecting data on compositionally complex alloys (CCAs), also known as high entropy alloys (HEAs) and multi-principle-element alloys (MPEAs), with extra attention given to (1) high-temperature (refractory) mechanical data, (2) phases present under different processing conditions. Although low-entropy alloys (incl. 
binaries) are typically not presented to the end-user (or counted in statistics), some are present and used in ML efforts; thus **all high-quality alloy data contributions are welcome!** + +## How to Contribute? +Contributing is pretty much restructuring your data into a spreadsheet. + +Before you start: + +1. Fork the [ULTERA-contribute](https://github.com/PhasesResearchLab/ULTERA-contribute/tree/main) repository, renaming your fork with a unique identifier, i.e., ULTERA-contribute`-yourUniqueIdentifier`, e.g.: `-PSU`, `-PhasesResearchLab`, `-researcherid`, etc. + +2. Enable `GitHub Actions` for your fork by (1) going to `Settings > Actions (General) > Actions permissions`, (2) Select `Allow all Actions and Reusable Workflows` and save, and (3) Scroll down to `Workflow permissions`, select `Read and write permissions`, and save. + +Once your forked repository is ready: + +1. Make a copy of [`template_v5.xlsx`](./template_v5.xlsx) and rename it to something describing what you are uploading to help you remember what is inside, e.g., `refractory_bcc_heas.xlsx`, `CrMoNiBased_DuctilityAndHardness.xlsx`, `HardnessCollectionHEA.xlsx`. + + PS: Keep your copied file in the root of the repository as this is the only directory monitored for new contributions. + +2. Fill out your spreadsheet copy with your datapoints, following the ULTERA manual [`manual_v1.pdf`](./manual_v1.pdf) instructions and examples provided in the template. + +3. Repeat steps 1 and 2 as necessary until you have completed adding all of your data to your repository. + +4. Once you're done, commit your changes to your repository and notify the ULTERA team by opening an issue at [ULTERA-contribute](https://github.com/PhasesResearchLab/ULTERA-contribute/issues/new?assignees=&labels=new+contribution&projects=&template=newcontribution.md&title=%5BNew+Contribution%5D). 
The [@PhasesResearchLab/ULTERA](https://github.com/orgs/PhasesResearchLab/teams/ULTERA) team will receive a notification of your contributions and will automatically analyze and process the data into the ULTERA database. + +For further instructions on the contribution process or *if you want to contribute without making your data public*, please refer to [`manual_v1.pdf`](./manual_v1.pdf). + +In case you have any questions, please do not hesitate to [open an issue](https://github.com/PhasesResearchLab/ULTERA-contribute/issues) to get help! diff --git a/assets/images/githubimport_screenshot.png b/assets/images/githubimport_screenshot.png deleted file mode 100644 index 2927e90..0000000 Binary files a/assets/images/githubimport_screenshot.png and /dev/null differ diff --git a/manual_v1.pdf b/manual_v1.pdf new file mode 100644 index 0000000..9c5f976 Binary files /dev/null and b/manual_v1.pdf differ diff --git a/pyqalloy-contribute/requirements.txt b/pyqalloy-contribute/requirements.txt deleted file mode 100644 index 2f5b8d3..0000000 --- a/pyqalloy-contribute/requirements.txt +++ /dev/null @@ -1,2 +0,0 @@ -pandas -openpyxl \ No newline at end of file diff --git a/template_v4.xlsx b/template_v4.xlsx deleted file mode 100644 index f26a074..0000000 Binary files a/template_v4.xlsx and /dev/null differ diff --git a/template_v4_DatasetExample.xlsx b/template_v4_DatasetExample.xlsx deleted file mode 100644 index 34d0d8e..0000000 Binary files a/template_v4_DatasetExample.xlsx and /dev/null differ diff --git a/template_v5.xlsx b/template_v5.xlsx new file mode 100644 index 0000000..e5addab Binary files /dev/null and b/template_v5.xlsx differ