From 1914b6518a5841410e0875286a2614f69e98ab35 Mon Sep 17 00:00:00 2001
From: Oscar Esteban
Date: Fri, 22 Aug 2025 16:00:50 +0200
Subject: [PATCH 1/3] chore: simplify publications update workflow

---
 .github/workflows/update-publications.yml |  50 ++++++
 scripts/update_publications.py            | 178 ++++++++++++++++++++++
 2 files changed, 228 insertions(+)
 create mode 100644 .github/workflows/update-publications.yml
 create mode 100755 scripts/update_publications.py

diff --git a/.github/workflows/update-publications.yml b/.github/workflows/update-publications.yml
new file mode 100644
index 00000000000..2f1e29a1954
--- /dev/null
+++ b/.github/workflows/update-publications.yml
@@ -0,0 +1,50 @@
+name: Update publication data
+
+on:
+  schedule:
+    - cron: '0 0 * * 0'
+  workflow_dispatch:
+
+permissions:
+  contents: write
+
+jobs:
+  update:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Setup SSH
+        env:
+          SSH_PRIVATE_KEY: ${{ secrets.PUBLICATIONS_INDEX_SSH_KEY }}
+        run: |
+          mkdir -p ~/.ssh
+          echo "$SSH_PRIVATE_KEY" > ~/.ssh/id_rsa
+          chmod 600 ~/.ssh/id_rsa
+          ssh-keyscan github.com >> ~/.ssh/known_hosts
+
+      - name: Clone publications index
+        run: |
+          git clone --depth 1 --branch master git@github.com:oesteban/publications_index.git /tmp/publications_index
+          mv /tmp/publications_index/pub_journal.yml /tmp/publications_index/publications_database.yml
+
+      - name: Install Python dependencies
+        run: pip install pyyaml
+
+      - name: Update data files
+        run: |
+          python scripts/update_publications.py /tmp/publications_index/publications_database.yml _data
+
+      - name: Commit and push changes
+        run: |
+          if [ -n "$(git status --porcelain _data/pub_journal.yml _data/pub_conference.yml _data/pub_other.yml)" ]; then
+            git config user.name "github-actions[bot]"
+            git config user.email "41898282+github-actions[bot]@users.noreply.github.com"
+            # Stage explicitly: `git commit -a` skips untracked (newly generated) files.
+            git add _data/pub_journal.yml _data/pub_conference.yml _data/pub_other.yml
+            git commit -m "chore: update publications data"
+            git push
+          else
+            echo "No changes to commit"
+          fi
diff --git a/scripts/update_publications.py b/scripts/update_publications.py
new file mode 100755
index 00000000000..5240eb3e96e
--- /dev/null
+++ b/scripts/update_publications.py
@@ -0,0 +1,178 @@
+#!/usr/bin/env python3
+import sys
+import yaml
+from collections import defaultdict, OrderedDict
+
+def short_authors(authors):
+    """Return shortened author list: first, … last."""
+    if not authors:
+        return ""
+    def fmt(name):
+        parts = [p.strip() for p in name.split(',')]
+        if len(parts) == 2:
+            last, firsts = parts
+            initials = ''.join(f"{w.strip()[0]}." for w in firsts.split())
+            return f"{last}, {initials}"
+        return name
+    if len(authors) == 1:
+        return fmt(authors[0])
+    return f"{fmt(authors[0])}, … {fmt(authors[-1])}"
+
+def article_citation(e):
+    pub = e.get('AbbrvPublication') or e.get('Publication', '')
+    vol = e.get('Volume')
+    num = e.get('Number')
+    pages = e.get('Pages')
+    citation = (
+        f"{short_authors(e.get('Authors', []))} ({e.get('Year')}).\n"
+        f" {e.get('Title')}\n"
+        f" {pub}"
+    )
+    if vol:
+        citation += f" {vol}"
+    if num:
+        citation += f"({num})"
+    if pages:
+        citation += f":{pages}"
+    citation += "."
+    return citation
+
+def conference_citation(e):
+    pub = e.get('AbbrvPublication') or e.get('Publication', '')
+    volume = e.get('Volume')
+    pages = e.get('Pages')
+    place = e.get('Place')
+    citation = (
+        f"{short_authors(e.get('Authors', []))} ({e.get('Year')}).\n"
+        f" {e.get('Title')}\n"
+        f" {pub}"
+    )
+    if volume:
+        citation += f" {volume}"
+    if pages:
+        citation += f", {pages}"
+    if place:
+        citation += f",\n {place}"
+    citation += "."
+    return citation
+
+def other_citation(e):
+    pub = e.get('AbbrvPublication') or e.get('Publication', '')
+    vol = e.get('Volume')
+    num = e.get('Number')
+    pages = e.get('Pages')
+    citation = (
+        f"{short_authors(e.get('Authors', []))} ({e.get('Year')}).\n"
+        f" {e.get('Title')}\n"
+        f" {pub}"
+    )
+    if vol:
+        citation += f" {vol}"
+    if num:
+        citation += f"({num})"
+    if pages:
+        citation += f":{pages}"
+    citation += "."
+    return citation
+
+def group_by_year(entries):
+    grouped = defaultdict(list)
+    for e in entries:
+        year = e.get('Year')
+        try:
+            year = int(year)
+        except (TypeError, ValueError):
+            pass
+        grouped[year].append(e)
+    return OrderedDict(sorted(grouped.items(), key=lambda x: x[0], reverse=True))
+
+class LiteralDumper(yaml.SafeDumper):
+    pass
+
+def str_presenter(dumper, data):
+    if '\n' in data:
+        return dumper.represent_scalar('tag:yaml.org,2002:str', data, style='|')
+    return dumper.represent_scalar('tag:yaml.org,2002:str', data)
+
+LiteralDumper.add_representer(str, str_presenter)
+
+def bool_presenter(dumper, data):
+    return dumper.represent_scalar('tag:yaml.org,2002:bool', 'True' if data else 'False')
+
+LiteralDumper.add_representer(bool, bool_presenter)
+
+def write_articles(entries, path):
+    years = group_by_year(entries)
+    out = []
+    for year, items in years.items():
+        year_items = []
+        for e in items:
+            item = {
+                'Citations': e.get('Citations', ''),
+                'DOI': e.get('DOI', ''),
+                'Year': e.get('Year'),
+                'Citation': article_citation(e),
+            }
+            oa_url = e.get('OA URL')
+            if oa_url:
+                item['OA'] = oa_url
+            elif e.get('OA'):
+                item['OA'] = True
+            year_items.append(item)
+        out.append({'Year': year, 'Items': year_items})
+    with open(path, 'w', encoding='utf-8') as f:
+        yaml.dump(out, f, Dumper=LiteralDumper, sort_keys=False, allow_unicode=True)
+
+def write_conferences(entries, path):
+    years = group_by_year(entries)
+    out = []
+    for year, items in years.items():
+        year_items = []
+        for e in items:
+            item = {
+                'DOI': e.get('DOI', '') or '',
+                'URL': e.get('URL', '') or '',
+                'OA URL': e.get('OA URL', '') or '',
+                'Citation': conference_citation(e),
+            }
+            year_items.append(item)
+        out.append({'Year': year, 'Items': year_items})
+    with open(path, 'w', encoding='utf-8') as f:
+        yaml.dump(out, f, Dumper=LiteralDumper, sort_keys=False, allow_unicode=True)
+
+def write_others(entries, path):
+    years = group_by_year(entries)
+    out = []
+    for year, items in years.items():
+        year_items = []
+        for e in items:
+            item = {
+                'Type': e.get('Type', ''),
+                'Citation': other_citation(e),
+            }
+            year_items.append(item)
+        out.append({'Year': year, 'Items': year_items})
+    with open(path, 'w', encoding='utf-8') as f:
+        yaml.dump(out, f, Dumper=LiteralDumper, sort_keys=False, allow_unicode=True)
+
+def main(src, outdir):
+    with open(src, encoding='utf-8') as f:
+        data = yaml.safe_load(f)
+    articles, conferences, others = [], [], []
+    for e in data:
+        t = (e.get('Type') or '').lower()
+        if t == 'article':
+            articles.append(e)
+        elif t in {'poster', 'oral'}:
+            conferences.append(e)
+        else:
+            others.append(e)
+    write_articles(articles, f"{outdir}/pub_journal.yml")
+    write_conferences(conferences, f"{outdir}/pub_conference.yml")
+    write_others(others, f"{outdir}/pub_other.yml")
+
+if __name__ == '__main__':
+    if len(sys.argv) != 3:
+        print('Usage: update_publications.py <source.yml> <outdir>', file=sys.stderr)
+        sys.exit(1)
+    main(sys.argv[1], sys.argv[2])

From 080bb0be3fca6d365a37a3c722f1f8f6aa974889 Mon Sep 17 00:00:00 2001
From: Oscar Esteban
Date: Tue, 26 Aug 2025 00:24:22 +0200
Subject: [PATCH 2/3] chore: add requirements for update script

---
 .github/workflows/update-publications.yml | 2 +-
 scripts/requirements.txt                  | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)
 create mode 100644 scripts/requirements.txt

diff --git a/.github/workflows/update-publications.yml b/.github/workflows/update-publications.yml
index 2f1e29a1954..dc08b7ca65c 100644
--- a/.github/workflows/update-publications.yml
+++ b/.github/workflows/update-publications.yml
@@ -30,7 +30,7 @@ jobs:
           mv /tmp/publications_index/pub_journal.yml /tmp/publications_index/publications_database.yml
 
       - name: Install Python dependencies
-        run: pip install pyyaml
+        run: pip install -r scripts/requirements.txt
 
       - name: Update data files
         run: |
diff --git a/scripts/requirements.txt b/scripts/requirements.txt
new file mode 100644
index 00000000000..c1a201db2d1
--- /dev/null
+++ b/scripts/requirements.txt
@@ -0,0 +1 @@
+PyYAML>=6.0

From afc646bb6bb83332a7e0468aef707481e0f35d77 Mon Sep 17 00:00:00 2001
From: Oscar Esteban
Date: Tue, 26 Aug 2025 00:24:26 +0200
Subject: [PATCH 3/3] chore: refine publication parser

---
 scripts/requirements.txt       |  2 +-
 scripts/update_publications.py | 86 ++++++++++++++++++----------------
 2 files changed, 46 insertions(+), 42 deletions(-)

diff --git a/scripts/requirements.txt b/scripts/requirements.txt
index c1a201db2d1..20b82128664 100644
--- a/scripts/requirements.txt
+++ b/scripts/requirements.txt
@@ -1 +1 @@
-PyYAML>=6.0
+ruamel.yaml>=0.17
diff --git a/scripts/update_publications.py b/scripts/update_publications.py
index 5240eb3e96e..125585421d3 100755
--- a/scripts/update_publications.py
+++ b/scripts/update_publications.py
@@ -1,7 +1,14 @@
 #!/usr/bin/env python3
 import sys
-import yaml
-from collections import defaultdict, OrderedDict
+from datetime import date
+from ruamel.yaml import YAML
+from ruamel.yaml.scalarstring import LiteralScalarString
+
+
+yaml = YAML()
+yaml.default_flow_style = False
+yaml.allow_unicode = True
+yaml.boolean_representation = ['False', 'True']
 
 def short_authors(authors):
     """Return shortened author list: first, … last."""
@@ -75,43 +82,38 @@ def other_citation(e):
     citation += "."
     return citation
 
-def group_by_year(entries):
-    grouped = defaultdict(list)
+def group_entries(entries):
+    """Return entries split into current, previous, and before previous year."""
+    current = date.today().year
+    previous = current - 1
+    groups = {current: [], previous: [], f"Before {previous}": []}
     for e in entries:
-        year = e.get('Year')
         try:
-            year = int(year)
+            year = int(e.get('Year'))
         except (TypeError, ValueError):
-            pass
-        grouped[year].append(e)
-    return OrderedDict(sorted(grouped.items(), key=lambda x: x[0], reverse=True))
-
-class LiteralDumper(yaml.SafeDumper):
-    pass
-
-def str_presenter(dumper, data):
-    if '\n' in data:
-        return dumper.represent_scalar('tag:yaml.org,2002:str', data, style='|')
-    return dumper.represent_scalar('tag:yaml.org,2002:str', data)
-
-LiteralDumper.add_representer(str, str_presenter)
-
-def bool_presenter(dumper, data):
-    return dumper.represent_scalar('tag:yaml.org,2002:bool', 'True' if data else 'False')
-
-LiteralDumper.add_representer(bool, bool_presenter)
+            continue
+        if year == current:
+            groups[current].append(e)
+        elif year == previous:
+            groups[previous].append(e)
+        else:
+            groups[f"Before {previous}"].append(e)
+    for items in groups.values():
+        items.sort(key=lambda x: int(x.get('Citations') or 0), reverse=True)
+    order = [current, previous, f"Before {previous}"]
+    return [(label, groups[label]) for label in order if groups[label]]
 
 def write_articles(entries, path):
-    years = group_by_year(entries)
+    sections = group_entries(entries)
     out = []
-    for year, items in years.items():
+    for label, items in sections:
         year_items = []
         for e in items:
             item = {
                 'Citations': e.get('Citations', ''),
                 'DOI': e.get('DOI', ''),
                 'Year': e.get('Year'),
-                'Citation': article_citation(e),
+                'Citation': LiteralScalarString(article_citation(e)),
             }
             oa_url = e.get('OA URL')
             if oa_url:
@@ -119,45 +121,47 @@ def write_articles(entries, path):
             elif e.get('OA'):
                 item['OA'] = True
             year_items.append(item)
-        out.append({'Year': year, 'Items': year_items})
+        out.append({'Year': label, 'Items': year_items})
     with open(path, 'w', encoding='utf-8') as f:
-        yaml.dump(out, f, Dumper=LiteralDumper, sort_keys=False, allow_unicode=True)
+        yaml.dump(out, f)
+
 
 def write_conferences(entries, path):
-    years = group_by_year(entries)
+    sections = group_entries(entries)
     out = []
-    for year, items in years.items():
+    for label, items in sections:
         year_items = []
         for e in items:
             item = {
                 'DOI': e.get('DOI', '') or '',
                 'URL': e.get('URL', '') or '',
                 'OA URL': e.get('OA URL', '') or '',
-                'Citation': conference_citation(e),
+                'Citation': LiteralScalarString(conference_citation(e)),
             }
             year_items.append(item)
-        out.append({'Year': year, 'Items': year_items})
+        out.append({'Year': label, 'Items': year_items})
     with open(path, 'w', encoding='utf-8') as f:
-        yaml.dump(out, f, Dumper=LiteralDumper, sort_keys=False, allow_unicode=True)
+        yaml.dump(out, f)
+
 
 def write_others(entries, path):
-    years = group_by_year(entries)
+    sections = group_entries(entries)
     out = []
-    for year, items in years.items():
+    for label, items in sections:
         year_items = []
         for e in items:
             item = {
-                'Type': e.get('Type', ''),
-                'Citation': other_citation(e),
+                'Type': (e.get('Type') or '').title(),
+                'Citation': LiteralScalarString(other_citation(e)),
             }
             year_items.append(item)
-        out.append({'Year': year, 'Items': year_items})
+        out.append({'Year': label, 'Items': year_items})
     with open(path, 'w', encoding='utf-8') as f:
-        yaml.dump(out, f, Dumper=LiteralDumper, sort_keys=False, allow_unicode=True)
+        yaml.dump(out, f)
 
 def main(src, outdir):
     with open(src, encoding='utf-8') as f:
-        data = yaml.safe_load(f)
+        data = yaml.load(f)
     articles, conferences, others = [], [], []
     for e in data:
         t = (e.get('Type') or '').lower()