Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file modified .gitignore
Binary file not shown.
100 changes: 0 additions & 100 deletions 0_generate_basic_metadata.js

This file was deleted.

89 changes: 89 additions & 0 deletions 0_run_workflow.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
const { execSync } = require("child_process");
const fs = require("fs");
const path = require("path");

// === CLI Argument Parser ===
// Everything that follows `node <script>` on the command line.
const args = process.argv.slice(2);

/**
 * Looks up the first `--<name>=<value>` argument and parses its value as a
 * base-10 integer.
 *
 * @param {string} name - Flag name without the leading dashes.
 * @returns {number|undefined} The parsed integer (NaN when the value is not
 *   numeric), or undefined when the flag is absent.
 */
const getArg = (name) => {
  const flag = `--${name}=`;
  for (const candidate of args) {
    if (candidate.startsWith(flag)) {
      return parseInt(candidate.slice(flag.length), 10);
    }
  }
  return undefined;
};
// Page range and batch size come from the command line; the batch size
// defaults to 10 only when the flag is omitted.
const startPage = getArg("start-page");
const endPage = getArg("end-page");
const batchSizeArg = getArg("batch-size");
const batchSize = batchSizeArg === undefined ? 10 : batchSizeArg;

// Reject missing, non-numeric, non-positive or out-of-order values up front.
// In particular a batch size below 1 must never reach the batching loop: the
// loop advances by `batchSize`, so a negative value would never terminate.
const isPositiveInt = (value) => Number.isInteger(value) && value > 0;
if (
  !isPositiveInt(startPage) ||
  !isPositiveInt(endPage) ||
  !isPositiveInt(batchSize) ||
  endPage < startPage
) {
  console.error("❌ Usage: node 0_run_workflow.js --start-page=10 --end-page=100 --batch-size=10");
  console.error("   Pages and batch size must be positive integers, and end-page must be >= start-page.");
  process.exit(1);
}

/**
 * Removes every `.pdf` file directly inside `pdf/page_<page>` (relative to the
 * current working directory). Other entries and the folder itself are kept.
 * Does nothing when the folder does not exist.
 *
 * @param {number} page - Page number used to build the folder name.
 */
function deletePdfFolder(page) {
  const pageDir = path.join("pdf", `page_${page}`);
  if (!fs.existsSync(pageDir)) {
    return;
  }

  fs.readdirSync(pageDir)
    .filter((entry) => entry.endsWith(".pdf"))
    .forEach((entry) => {
      fs.unlinkSync(path.join(pageDir, entry));
    });

  console.log(`🧹 Deleted PDF files in folder: ${pageDir}`);
}

/**
 * Runs the five pipeline scripts, in order, for one inclusive page range and
 * then removes the downloaded PDF files for those pages.
 *
 * Each script is started synchronously as `node <script> --start-page=..
 * --end-page=..` and shares this process's stdio.
 *
 * @param {number} batchStart - First page of the batch.
 * @param {number} batchEnd - Last page of the batch (inclusive).
 * @returns {Promise<boolean>} true when every step finished without error;
 *   false as soon as one step fails (remaining steps and the PDF cleanup are
 *   skipped in that case).
 */
async function runWorkflowBatch(batchStart, batchEnd) {
  console.log(`\n🚀 Starting workflow for pages ${batchStart} - ${batchEnd}\n`);

  // Every step receives the same page-range flags.
  const rangeFlags = `--start-page=${batchStart} --end-page=${batchEnd}`;
  const pipeline = [
    ["📥 Step 1️⃣: Fetching DOI JSON...", "1_fetch_all_dois.js"],
    ["📄 Step 2️⃣: Downloading PDFs...", "2_fetch_all_pdfs.js"],
    ["🧠 Step 3️⃣: Generating metadata...", "3_generate_basic_metadata.js"],
    ["🆙 Step 4️⃣: Uploading metadata to Irys...", "4_upload_all_basic_metadata.js"],
    ["📤 Step 5️⃣: Uploading PDFs to Irys...", "5_upload_all_pdfs.js"],
  ];

  for (const [label, script] of pipeline) {
    console.log(`\n${label}`);
    try {
      execSync(`node ${script} ${rangeFlags}`, { stdio: "inherit" });
    } catch (stepError) {
      console.error(`❌ Step failed: ${stepError.message}`);
      return false;
    }
  }

  // All steps succeeded: clear the PDF files of every page in the batch.
  let page = batchStart;
  while (page <= batchEnd) {
    deletePdfFolder(page);
    page += 1;
  }

  return true;
}

// Entry point: walk the requested page range in consecutive batches of
// `batchSize` pages and stop with exit code 1 at the first failing batch.
(async () => {
  for (let i = startPage; i <= endPage; i += batchSize) {
    const batchStart = i;
    // The final batch may be shorter than batchSize.
    const batchEnd = Math.min(endPage, i + batchSize - 1);
    const success = await runWorkflowBatch(batchStart, batchEnd);
    if (!success) {
      console.error(`❌ Stopping workflow due to error in batch ${batchStart}-${batchEnd}`);
      process.exit(1);
    }
  }

  console.log("\n✅ All batches completed successfully!");
})().catch((err) => {
  // runWorkflowBatch only traps failures of the pipeline steps; anything else
  // it throws (e.g. a filesystem error during PDF cleanup) lands here instead
  // of becoming an unhandled promise rejection.
  console.error(`❌ Workflow aborted by unexpected error: ${err.message}`);
  process.exit(1);
});
88 changes: 88 additions & 0 deletions 1_fetch_all_dois.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
const fs = require('fs');
const axios = require('axios');
const path = require('path');

// Directory (next to this script) where each fetched page is stored as page_<n>.json.
const OUTPUT_DIR = path.join(__dirname, 'doi');
// Endpoint prefix; the page number is appended directly to build the request URL.
const BASE_URL = 'https://api.scai.sh/dois?page=';
// Last page to fetch when the script runs without an explicit page range.
const TOTAL_PAGES = 883431;
// Pause, in milliseconds, between consecutive page requests.
const DELAY_MS = 2000;

// Parse CLI arguments: --start-page=XX --end-page=XX
const args = process.argv.slice(2);

/**
 * Reads the integer value of the first `--<name>=<value>` argument.
 *
 * @param {string} name - Flag name without the leading dashes.
 * @returns {number|undefined} The base-10 integer parsed from the value (NaN
 *   if it is not numeric), or undefined if the flag was not passed.
 */
const getArg = (name) => {
  const flag = `--${name}=`;
  const hit = args.find((candidate) => candidate.startsWith(flag));
  if (hit === undefined) {
    return undefined;
  }
  const rawValue = hit.slice(flag.length);
  return parseInt(rawValue, 10);
};

// Optional page range from the command line; each value stays undefined when
// its flag is absent.
const cliStartPage = getArg('start-page');
const cliEndPage = getArg('end-page');

// Ensure output directory exists. With `recursive: true` the call succeeds
// when the directory is already there, so no separate existence check (and no
// gap between checking and creating) is needed.
fs.mkdirSync(OUTPUT_DIR, { recursive: true });

// Utility: returns a promise that resolves (with no value) once `ms`
// milliseconds have elapsed; used to space out requests.
const sleep = (ms) =>
  new Promise((done) => {
    setTimeout(done, ms);
  });

/**
 * Finds the highest page number among the `page_<n>.json` files currently in
 * OUTPUT_DIR. Files whose names do not match that pattern are ignored.
 *
 * @returns {number} The largest <n> found, or 0 when no such file exists.
 */
function getLastDownloadedPage() {
  const pagePattern = /page_(\d+)\.json$/;
  let highest = 0;

  for (const fileName of fs.readdirSync(OUTPUT_DIR)) {
    const hit = pagePattern.exec(fileName);
    if (hit === null) {
      continue;
    }
    const pageNumber = parseInt(hit[1], 10);
    if (pageNumber > highest) {
      highest = pageNumber;
    }
  }

  return highest;
}

/**
 * Downloads DOI pages `startPage`..`endPage` (inclusive), one request at a
 * time, and stores each page's `dois` array as `page_<n>.json` in OUTPUT_DIR.
 *
 * Pages whose file already exists are skipped without a request. A response
 * without a `dois` array is logged and skipped. The first failed request stops
 * the run and marks the process as failed (`process.exitCode = 1`), so that a
 * parent process launching this script does not mistake an aborted run for a
 * completed one.
 *
 * @param {number} startPage - First page to download.
 * @param {number} endPage - Last page to download (inclusive).
 * @returns {Promise<boolean>} true when the loop reached `endPage`; false when
 *   it stopped early because a request failed.
 */
async function downloadAllPages(startPage, endPage) {
  for (let page = startPage; page <= endPage; page++) {
    const filePath = path.join(OUTPUT_DIR, `page_${page}.json`);
    if (fs.existsSync(filePath)) {
      console.log(`✅ Page ${page} already exists. Skipping.`);
      continue;
    }

    const url = `${BASE_URL}${page}`;
    try {
      console.log(`🔍 Fetching page ${page}...`);
      const res = await axios.get(url);
      const data = res.data;

      if (data && Array.isArray(data.dois)) {
        fs.writeFileSync(filePath, JSON.stringify(data.dois, null, 2));
        console.log(`✅ Page ${page} saved (${data.dois.length} DOIs)`);
      } else {
        console.warn(`⚠️ Page ${page} response missing 'dois' array. Skipping.`);
      }
    } catch (err) {
      console.error(`❌ Failed to fetch page ${page}: ${err.message}`);
      console.log('🛑 Stopping script. You can rerun it to resume.');
      // Report the failure through the exit status and skip the success
      // message below; the files already written stay in place for a rerun.
      process.exitCode = 1;
      return false;
    }

    // Throttle: wait before requesting the next page.
    await sleep(DELAY_MS);
  }

  console.log('🎉 Finished fetching pages.');
  return true;
}

// Entry point
/**
 * Chooses the page range and starts the download.
 *
 * - Neither --start-page nor --end-page given: resume after the highest page
 *   already on disk and continue up to TOTAL_PAGES.
 * - Both given as integers: download exactly that range.
 * - Anything else (only one flag, or a non-numeric value, which the argument
 *   parser yields as NaN): report the problem and mark the process as failed
 *   instead of silently doing nothing or silently switching to resume mode.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const hasStart = cliStartPage !== undefined;
  const hasEnd = cliEndPage !== undefined;

  if (!hasStart && !hasEnd) {
    const start = getLastDownloadedPage() + 1;
    const end = TOTAL_PAGES;
    console.log(`🔁 Resuming from page ${start} → ${end}`);
    await downloadAllPages(start, end);
    return;
  }

  if (!Number.isInteger(cliStartPage) || !Number.isInteger(cliEndPage)) {
    console.error('❌ --start-page and --end-page must both be given as integers.');
    process.exitCode = 1;
    return;
  }

  console.log(`🚀 Running in range mode: page ${cliStartPage} → ${cliEndPage}`);
  await downloadAllPages(cliStartPage, cliEndPage);
}

// Start the script. Any error that escapes main() is reported here and turned
// into a failing exit status instead of an unhandled promise rejection.
main().catch((err) => {
  console.error(`❌ Unexpected error: ${err.message}`);
  process.exitCode = 1;
});
102 changes: 0 additions & 102 deletions 1_upload_basic_metadata.js

This file was deleted.

Loading