diff --git a/.gitignore b/.gitignore index 9bbd1f2..0cd1ee7 100644 Binary files a/.gitignore and b/.gitignore differ diff --git a/0_generate_basic_metadata.js b/0_generate_basic_metadata.js deleted file mode 100644 index 9e4804f..0000000 --- a/0_generate_basic_metadata.js +++ /dev/null @@ -1,100 +0,0 @@ -const fs = require('fs').promises; -const path = require('path'); - -async function walkDir(dir) { - try { - const files = await fs.readdir(dir); - const jsonFiles = files.filter(file => file.endsWith('.json')); - return jsonFiles.map(file => path.join(dir, file)); - } catch (error) { - console.error('Error reading directory:', error); - throw error; - } -} - -function extractAbstract(paper) { - // Try to reconstruct abstract from inverted index if available - if (paper.openalex?.abstract_inverted_index) { - const words = []; - const index = paper.openalex.abstract_inverted_index; - const maxPosition = Math.max(...Object.values(index).flat()); - - for (let i = 0; i <= maxPosition; i++) { - for (const [word, positions] of Object.entries(index)) { - if (positions.includes(i)) { - words[i] = word; - break; - } - } - } - return words.join(' '); - } - return ""; // Return empty string if no abstract found -} - -function extractBasicMetadata(paper) { - return { - abstract: extractAbstract(paper), - title: paper.openalex?.title || - paper.crossref?.title?.[0] || - "", - authors: paper.openalex?.authorships - ?.map(a => a.raw_author_name) - .join(", ") || - paper.crossref?.author - ?.map(a => `${a.given} ${a.family}`) - .join(", ") || - "", - doi: paper.doi || "", - aid: paper.openalex?.id?.split("/").pop() || - paper.crossref?.DOI?.replace(/[^a-zA-Z0-9]/g, "") || - "" - }; -} - -async function generateBasicMetadata(metadataDir) { - try { - // Get all JSON files in the directory - const files = await walkDir(metadataDir); - - // Process each file - const metadata = []; - for (const file of files) { - try { - console.log(`Processing file: ${file}`); // Add logging - const content = await fs.readFile(file, 'utf8'); - const paper = JSON.parse(content.trim()); // Add trim() to remove any BOM or whitespace - - const basicMetadata = extractBasicMetadata(paper); - metadata.push(basicMetadata); - } catch (error) { - console.error(`Error processing file ${file}:`, error); - // Continue with next file instead of stopping - continue; - } - } - - // Write the results to a file - const outputPath = path.join(process.cwd(), 'basic_metadata.json'); - await fs.writeFile( - outputPath, - JSON.stringify(metadata, null, 2) - ); - - console.log(`Basic metadata generated and saved to ${outputPath}`); - console.log(`Processed ${metadata.length} files successfully`); - return metadata; - } catch (error) { - console.error('Error generating basic metadata:', error); - throw error; - } -} - -// Export the function if using as a module -module.exports = generateBasicMetadata; - -// If running directly -if (require.main === module) { - const metadataDir = process.argv[2] || path.join(process.cwd(), 'metadata'); - generateBasicMetadata(metadataDir).catch(console.error); -} diff --git a/0_run_workflow.js b/0_run_workflow.js new file mode 100644 index 0000000..3417867 --- /dev/null +++ b/0_run_workflow.js @@ -0,0 +1,89 @@ +const { execSync } = require("child_process"); +const fs = require("fs"); +const path = require("path"); + +// === CLI Argument Parser === +const args = process.argv.slice(2); +const getArg = (name) => { + const prefix = `--${name}=`; + const found = args.find((arg) => arg.startsWith(prefix)); + return found ? parseInt(found.slice(prefix.length), 10) : undefined; +}; +const startPage = getArg("start-page"); +const endPage = getArg("end-page"); +const batchSize = getArg("batch-size") || 10; + +if (!startPage || !endPage || isNaN(startPage) || isNaN(endPage)) { + console.error("โŒ Usage: node 0_run_workflow.js --start-page=10 --end-page=100 --batch-size=10"); + process.exit(1); +} + +function deletePdfFolder(page) { + const dirPath = path.join("pdf", `page_${page}`); + if (fs.existsSync(dirPath)) { + const files = fs.readdirSync(dirPath); + for (const file of files) { + if (file.endsWith(".pdf")) { + fs.unlinkSync(path.join(dirPath, file)); + } + } + console.log(`๐Ÿงน Deleted PDF files in folder: ${dirPath}`); + } +} + +async function runWorkflowBatch(batchStart, batchEnd) { + console.log(`\n๐Ÿš€ Starting workflow for pages ${batchStart} - ${batchEnd}\n`); + const steps = [ + { + name: "๐Ÿ“ฅ Step 1๏ธโƒฃ: Fetching DOI JSON...", + command: `node 1_fetch_all_dois.js --start-page=${batchStart} --end-page=${batchEnd}`, + }, + { + name: "๐Ÿ“„ Step 2๏ธโƒฃ: Downloading PDFs...", + command: `node 2_fetch_all_pdfs.js --start-page=${batchStart} --end-page=${batchEnd}`, + }, + { + name: "๐Ÿง  Step 3๏ธโƒฃ: Generating metadata...", + command: `node 3_generate_basic_metadata.js --start-page=${batchStart} --end-page=${batchEnd}`, + }, + { + name: "๐Ÿ†™ Step 4๏ธโƒฃ: Uploading metadata to Irys...", + command: `node 4_upload_all_basic_metadata.js --start-page=${batchStart} --end-page=${batchEnd}`, + }, + { + name: "๐Ÿ“ค Step 5๏ธโƒฃ: Uploading PDFs to Irys...", + command: `node 5_upload_all_pdfs.js --start-page=${batchStart} --end-page=${batchEnd}`, + }, + ]; + + for (const step of steps) { + console.log(`\n${step.name}`); + try { + execSync(step.command, { stdio: "inherit" }); + } catch (err) { + console.error(`โŒ Step failed: ${err.message}`); + return false; + } + } + + // cleanup pdf files in each page folder + for (let page = batchStart; page <= batchEnd; page++) { + deletePdfFolder(page); + } + + return true; +} + +(async () => { + for (let i = startPage; i <= endPage; i += batchSize) { + const batchStart = i; + const batchEnd = Math.min(endPage, i + batchSize - 1); + const success = await runWorkflowBatch(batchStart, batchEnd); + if (!success) { + console.error(`โŒ Stopping workflow due to error in batch ${batchStart}-${batchEnd}`); + process.exit(1); + } + } + + console.log("\nโœ… All batches completed successfully!"); +})(); diff --git a/1_fetch_all_dois.js b/1_fetch_all_dois.js new file mode 100644 index 0000000..6b60a8f --- /dev/null +++ b/1_fetch_all_dois.js @@ -0,0 +1,88 @@ +const fs = require('fs'); +const axios = require('axios'); +const path = require('path'); + +const OUTPUT_DIR = path.join(__dirname, 'doi'); +const BASE_URL = 'https://api.scai.sh/dois?page='; +const TOTAL_PAGES = 883431; +const DELAY_MS = 2000; + +// Parse CLI arguments: --start-page=XX --end-page=XX +const args = process.argv.slice(2); +const getArg = (name) => { + const prefix = `--${name}=`; + const found = args.find(arg => arg.startsWith(prefix)); + return found ? parseInt(found.slice(prefix.length), 10) : undefined; +}; + +const cliStartPage = getArg('start-page'); +const cliEndPage = getArg('end-page'); + +// Ensure output directory exists +if (!fs.existsSync(OUTPUT_DIR)) { + fs.mkdirSync(OUTPUT_DIR); +} + +// Utility: Delay between requests +const sleep = (ms) => new Promise(resolve => setTimeout(resolve, ms)); + +// Get last downloaded page number from existing files +function getLastDownloadedPage() { + const files = fs.readdirSync(OUTPUT_DIR); + const pageNumbers = files + .map(file => { + const match = file.match(/page_(\d+)\.json$/); + return match ? parseInt(match[1], 10) : null; + }) + .filter(n => n !== null) + .sort((a, b) => a - b); + return pageNumbers.length ? pageNumbers[pageNumbers.length - 1] : 0; +} + +// Download a range of pages +async function downloadAllPages(startPage, endPage) { + for (let page = startPage; page <= endPage; page++) { + const filePath = path.join(OUTPUT_DIR, `page_${page}.json`); + if (fs.existsSync(filePath)) { + console.log(`โœ… Page ${page} already exists. Skipping.`); + continue; + } + + const url = `${BASE_URL}${page}`; + try { + console.log(`๐Ÿ” Fetching page ${page}...`); + const res = await axios.get(url); + const data = res.data; + + if (data && Array.isArray(data.dois)) { + fs.writeFileSync(filePath, JSON.stringify(data.dois, null, 2)); + console.log(`โœ… Page ${page} saved (${data.dois.length} DOIs)`); + } else { + console.warn(`โš ๏ธ Page ${page} response missing 'dois' array. Skipping.`); + } + } catch (err) { + console.error(`โŒ Failed to fetch page ${page}: ${err.message}`); + console.log('๐Ÿ›‘ Stopping script. You can rerun it to resume.'); + break; + } + + await sleep(DELAY_MS); + } + + console.log('๐ŸŽ‰ Finished fetching pages.'); +} + +// Entry point +async function main() { + if (cliStartPage !== undefined && cliEndPage !== undefined) { + console.log(`๐Ÿš€ Running in range mode: page ${cliStartPage} โ†’ ${cliEndPage}`); + await downloadAllPages(cliStartPage, cliEndPage); + } else { + const start = getLastDownloadedPage() + 1; + const end = TOTAL_PAGES; + console.log(`๐Ÿ” Resuming from page ${start} โ†’ ${end}`); + await downloadAllPages(start, end); + } +} + +main(); diff --git a/1_upload_basic_metadata.js b/1_upload_basic_metadata.js deleted file mode 100644 index caf1a99..0000000 --- a/1_upload_basic_metadata.js +++ /dev/null @@ -1,102 +0,0 @@ -require("dotenv").config(); -const { Uploader } = require("@irys/upload"); -const { Solana } = require("@irys/upload-solana"); -const fs = require("fs").promises; -const path = require("path"); - -const getIrysUploader = async () => { - try { - const irysUploader = await Uploader(Solana).withWallet(process.env.PRIVATE_KEY); - console.log("Irys uploader initialized."); - return irysUploader; - } catch (error) { - console.error("Failed to initialize Irys uploader:", error); - return null; - } -}; - -const uploadBasicMetadata = async () => { - const irys = await getIrysUploader(); - if (!irys) { - console.error("Irys uploader could not be initialized."); - return; - } - - try { - // Read the basic_metadata.json file - const filePath = path.join(process.cwd(), 'basic_metadata.json'); - console.log(`Reading file: ${filePath}`); - - const content = await fs.readFile(filePath, 'utf8'); - const papers = JSON.parse(content); - - console.log(`Loaded ${papers.length} papers for processing`); - - let successCount = 0; - let failCount = 0; - - for (let i = 0; i < papers.length; i++) { - const paper = papers[i]; - console.log(`\n๐Ÿ“„ Processing paper [${i + 1}/${papers.length}]`); - - if (!paper.doi) { - console.log(`โš ๏ธ Skipping paper: No DOI found`); - failCount++; - continue; - } - - try { - const normalizedDoi = paper.doi.trim(); - const normalizedTitle = paper.title - .replace(/\s+/g, ' ') // Replace multiple spaces with single space - .replace(/\n/g, '') // Remove newlines - .trim(); // Remove leading/trailing spaces - - const normalizedAuthors = paper.authors - .replace(/\s+/g, ' ') - .replace(/\n/g, '') - .trim(); - - const tags = [ - { name: "App-Name", value: "scivault" }, - { name: "Content-Type", value: "application/json" }, - { name: "Version", value: "1.0.3" }, - { name: "doi", value: normalizedDoi }, - { name: "title", value: normalizedTitle }, - { name: "authors", value: normalizedAuthors }, - { name: "aid", value: paper.aid } - ]; - - const paperMetadata = Buffer.from(JSON.stringify(paper)); - const receipt = await irys.upload(paperMetadata, { tags }); - - console.log(`โœ… Uploaded: ${paper.doi} (${receipt.id})`); - successCount++; - - } catch (error) { - console.error(`โŒ Failed: ${paper.doi} - ${error.message}`); - failCount++; - } - - // Progress report every 10 papers - if ((i + 1) % 10 === 0 || i === papers.length - 1) { - console.log(`\n๐Ÿ“Š Progress Report:`); - console.log(` Success: ${successCount}`); - console.log(` Failed: ${failCount}`); - console.log(` Progress: ${Math.round((i + 1) / papers.length * 100)}%`); - } - } - - console.log(`\nโœจ Upload Complete`); - console.log(` Final Results:`); - console.log(` Total Success: ${successCount}`); - console.log(` Total Failed: ${failCount}`); - console.log(` Success Rate: ${Math.round(successCount / papers.length * 100)}%`); - - } catch (error) { - console.error("โŒ Error uploading metadata:", error); - } -}; - -// Run the upload process -uploadBasicMetadata().catch(console.error); diff --git a/2_fetch_all_pdfs.js b/2_fetch_all_pdfs.js new file mode 100644 index 0000000..d05bd18 --- /dev/null +++ b/2_fetch_all_pdfs.js @@ -0,0 +1,165 @@ +const fs = require('fs'); +const path = require('path'); +const https = require('https'); +const axios = require('axios'); + +// === Configuration === +const DOI_DIR = './doi'; +const PDF_DIR = './pdf'; +const SCI_HUB_MIRRORS = [ + 'https://sci-hub.st/', + 'https://sci-hub.se/', + 'https://sci-hub.ru/', + 'https://www.tesble.com/', +]; +const DELAY_MS = 3000; +const MIN_VALID_SIZE = 1024; + +// === CLI Argument Parser === +const args = process.argv.slice(2); +const getArg = (name) => { + const prefix = `--${name}=`; + const found = args.find(arg => arg.startsWith(prefix)); + return found ? parseInt(found.slice(prefix.length), 10) : undefined; +}; +const cliStart = getArg("start-page"); +const cliEnd = getArg("end-page"); + +// === Utility Functions === +const sleep = ms => new Promise(resolve => setTimeout(resolve, ms)); +function ensureDir(dirPath) { + if (!fs.existsSync(dirPath)) fs.mkdirSync(dirPath, { recursive: true }); +} + +async function downloadPdfFromUrl(url, filePath) { + try { + const writer = fs.createWriteStream(filePath); + const response = await axios({ + url, + method: 'GET', + responseType: 'stream', + httpsAgent: new https.Agent({ rejectUnauthorized: false }) + }); + + response.data.pipe(writer); + return new Promise((resolve, reject) => { + writer.on('finish', () => { + const stats = fs.statSync(filePath); + if (stats.size >= MIN_VALID_SIZE) { + console.log(`โœ… Downloaded: ${url}`); + resolve(true); + } else { + fs.unlinkSync(filePath); + console.warn(`โŒ Download too small: ${url}`); + resolve(false); + } + }); + writer.on('error', reject); + }); + } catch (err) { + console.error(`โŒ Download failed: ${url}`, err.message); + return false; + } +} + +async function extractPdfLinkAndDownload(doi, mirror, outputPath) { + try { + const url = mirror + encodeURIComponent(doi); + const response = await axios.get(url, { httpsAgent: new https.Agent({ rejectUnauthorized: false }) }); + const html = response.data; + + const embedMatch = html.match(/]*src=["']([^"']+\.pdf[^"']*)["']/i); + if (!embedMatch || !embedMatch[1]) { + console.warn(`โŒ No PDF embed found for ${doi}`); + return false; + } + + let pdfUrl = embedMatch[1]; + if (pdfUrl.startsWith('//')) { + pdfUrl = 'https:' + pdfUrl; + } else if (!pdfUrl.startsWith('http')) { + pdfUrl = mirror + (pdfUrl.startsWith('/') ? pdfUrl.slice(1) : pdfUrl); + } + + return await downloadPdfFromUrl(pdfUrl, outputPath); + } catch (err) { + console.warn(`โŒ Error scraping ${mirror} for ${doi}: ${err.message}`); + return false; + } +} + +async function tryAllMirrors(doi, outputPath) { + for (const mirror of SCI_HUB_MIRRORS) { + const success = await extractPdfLinkAndDownload(doi, mirror, outputPath); + if (success) return true; + await sleep(1000); + } + return false; +} + +async function processPage(pageFile) { + const pageNum = pageFile.match(/\d+/)[0]; + const doiPath = path.join(DOI_DIR, pageFile); + const outDir = path.join(PDF_DIR, `page_${pageNum}`); + ensureDir(outDir); + + const failedLogPath = path.join(outDir, `failed_log_page_${pageNum}.txt`); + let failedDois = new Set(); + if (fs.existsSync(failedLogPath)) { + failedDois = new Set(fs.readFileSync(failedLogPath, 'utf8').split('\n').filter(Boolean)); + } + + const dois = JSON.parse(fs.readFileSync(doiPath, 'utf8')); + + for (const doi of dois) { + const doiSafe = encodeURIComponent(doi); + const pdfPath = path.join(outDir, `${doiSafe}.pdf`); + + if (fs.existsSync(pdfPath)) { + const stats = fs.statSync(pdfPath); + if (stats.size >= MIN_VALID_SIZE) { + console.log(`โœ… Already exists: ${pdfPath}`); + continue; + } else { + console.warn(`โš ๏ธ Removing invalid file: ${pdfPath}`); + fs.unlinkSync(pdfPath); + } + } + + if (failedDois.has(doi)) { + console.log(`โš ๏ธ Previously failed: ${doi}, skipping`); + continue; + } + + console.log(`๐Ÿ“„ Downloading DOI: ${doi}`); + const success = await tryAllMirrors(doi, pdfPath); + if (!success) { + fs.appendFileSync(failedLogPath, `${doi}\n`); + console.error(`โŒ Failed to download ${doi}`); + } + + await sleep(DELAY_MS); + } +} + +async function main() { + ensureDir(PDF_DIR); + + const pageFiles = fs.readdirSync(DOI_DIR) + .filter(f => f.startsWith('page_') && f.endsWith('.json')) + .sort((a, b) => parseInt(a.match(/\d+/)[0]) - parseInt(b.match(/\d+/)[0])); + + const filtered = pageFiles.filter(f => { + const page = parseInt(f.match(/\d+/)[0], 10); + return (!cliStart || page >= cliStart) && (!cliEnd || page <= cliEnd); + }); + + for (const file of filtered) { + console.log(`\n=== Processing ${file} ===`); + await processPage(file); + } + + console.log('\n๐ŸŽ‰ All requested PDF downloads finished.'); +} + +main(); diff --git a/2_upload_pdf.js b/2_upload_pdf.js deleted file mode 100644 index d468847..0000000 --- a/2_upload_pdf.js +++ /dev/null @@ -1,248 +0,0 @@ -require("dotenv").config(); -const { Uploader } = require("@irys/upload"); -const { Solana } = require("@irys/upload-solana"); -const { PDFDocument } = require("pdf-lib"); -const fs = require("fs").promises; -const path = require("path"); - -const MAX_SLICE_SIZE = 50 * 1024; // 50KB per slice - -const getIrysUploader = async () => { - try { - const irysUploader = await Uploader(Solana).withWallet(process.env.PRIVATE_KEY); - console.log("Irys uploader initialized."); - return irysUploader; - } catch (error) { - console.error("Failed to initialize Irys uploader:", error); - return null; - } -}; - -async function walkDir(dir) { - try { - const files = await fs.readdir(dir); - const pdfFiles = files.filter(file => file.toLowerCase().endsWith('.pdf')); - return pdfFiles.map(file => path.join(dir, file)); - } catch (error) { - console.error('Error reading directory:', error); - throw error; - } -} - -async function getDoiFromMetadata(pdfPath) { - try { - // Get the corresponding JSON file path by replacing .pdf with .json - const jsonPath = pdfPath.replace('.pdf', '.json'); - - console.log(`Looking for metadata file: ${jsonPath}`); - - // Read and parse the JSON file - const jsonData = await fs.readFile(jsonPath, 'utf8'); - const metadata = JSON.parse(jsonData); - - if (!metadata.doi) { - throw new Error(`No DOI found in metadata file: ${jsonPath}`); - } - - console.log(`Found DOI: ${metadata.doi}`); - return metadata.doi; - } catch (error) { - console.error(`Error getting DOI from metadata:`, error); - throw error; - } -} - -const sliceAndUploadPdf = async (inputPath, doi) => { - try { - console.log(`\n๐Ÿ“„ Processing PDF: ${path.basename(inputPath)}`); - - // Read and validate PDF - const pdfBytes = await fs.readFile(inputPath); - const pdfDoc = await PDFDocument.load(pdfBytes); - const fileBase64 = await pdfDoc.saveAsBase64(); - - // Create chunks - const chunks = []; - for (let i = 0; i < fileBase64.length; i += MAX_SLICE_SIZE) { - const chunk = fileBase64.slice(i, i + MAX_SLICE_SIZE); - chunks.push(chunk); - } - - console.log(`File size: ${fileBase64.length} bytes`); - console.log(`Total chunks: ${chunks.length}`); - - // Check if PDF was already uploaded - const query = ` - query { - transactions( - tags: [ - { name: "Content-Type", values: ["application/pdf"] }, - { name: "application", values: ["scivault"] }, - { name: "Version", values: ["1.0.3"] }, - { name: "Type", values: ["pdf-index"] }, - { name: "Collection", values: ["${doi}"] } - ] - ) { - edges { - node { - id - } - } - } - } - `; - - const response = await fetch("https://uploader.irys.xyz/graphql", { - method: "POST", - headers: { "Content-Type": "application/json" }, - body: JSON.stringify({ query }) - }); - - const result = await response.json(); - if (result.data?.transactions?.edges?.[0]?.node?.id) { - console.log(`โš ๏ธ PDF already exists for DOI: ${doi}`); - return result.data.transactions.edges.map(edge => edge.node.id); - } - - // Upload chunks - const irys = await getIrysUploader(); - if (!irys) { - throw new Error("Failed to initialize Irys uploader"); - } - - const receiptIDs = []; - const tags = [ - { name: "Content-Type", value: "application/pdf" }, - { name: "application", value: "scivault" }, - { name: "Version", value: "1.0.3" }, - { name: "Type", value: "pdf-index" }, - { name: "Collection", value: doi } - ]; - - for (let i = 0; i < chunks.length; i++) { - console.log(`\nUploading chunk ${i + 1}/${chunks.length}...`); - const receipt = await irys.upload(Buffer.from(chunks[i]), { tags }); - receiptIDs.push(receipt.id); - console.log(`โœ… Chunk uploaded: ${receipt.id}`); - } - - console.log(`\nโœจ PDF uploaded successfully!`); - console.log(`Receipt IDs: ${receiptIDs.join(", ")}`); - return receiptIDs; - - } catch (error) { - console.error(`โŒ Error processing PDF: ${error.message}`); - throw error; - } -}; - -// ๆทปๅŠ ้”™่ฏฏๆ—ฅๅฟ—ๅŠŸ่ƒฝ -async function logError(filePath, error, doi = null) { - const errorLogPath = path.join(process.cwd(), 'upload_errors.json'); - try { - // ่ฏปๅ–็Žฐๆœ‰็š„้”™่ฏฏๆ—ฅๅฟ—๏ผŒๅฆ‚ๆžœไธๅญ˜ๅœจๅˆ™ๅˆ›ๅปบๆ–ฐ็š„ - let errorLog = []; - try { - const existingLog = await fs.readFile(errorLogPath, 'utf8'); - errorLog = JSON.parse(existingLog); - } catch (e) { - // ๆ–‡ไปถไธๅญ˜ๅœจ๏ผŒไฝฟ็”จ็ฉบๆ•ฐ็ป„ - } - - // ๆทปๅŠ ๆ–ฐ็š„้”™่ฏฏ่ฎฐๅฝ• - errorLog.push({ - timestamp: new Date().toISOString(), - file: filePath, - doi: doi, - error: error.message || String(error), - stack: error.stack - }); - - // ไฟๅญ˜ๆ›ดๆ–ฐๅŽ็š„้”™่ฏฏๆ—ฅๅฟ— - await fs.writeFile(errorLogPath, JSON.stringify(errorLog, null, 2)); - console.log(`Error logged to ${errorLogPath}`); - } catch (logError) { - console.error('Failed to log error:', logError); - } -} - -const uploadPdfs = async (pdfDir) => { - try { - const files = await walkDir(pdfDir); - console.log(`Found ${files.length} PDF files in ${pdfDir}`); - - let successCount = 0; - let failCount = 0; - let errorFiles = []; - - for (let i = 0; i < files.length; i++) { - const pdfFile = files[i]; - let doi = null; - try { - // ่Žทๅ– DOI - doi = await getDoiFromMetadata(pdfFile); - console.log(`\nProcessing PDF: ${path.basename(pdfFile)}`); - console.log(`Using DOI: ${doi}`); - - // ๅฐ่ฏ•ไธŠไผ  - await sliceAndUploadPdf(pdfFile, doi); - successCount++; - } catch (error) { - failCount++; - await logError(pdfFile, error, doi); - errorFiles.push({ - file: pdfFile, - doi: doi, - error: error.message - }); - } - - // Progress report - if ((i + 1) % 5 === 0 || i === files.length - 1) { - console.log(`\n๐Ÿ“Š Progress Report:`); - console.log(` Success: ${successCount}`); - console.log(` Failed: ${failCount}`); - console.log(` Progress: ${Math.round((i + 1) / files.length * 100)}%`); - } - } - - // ๅœจๅฎŒๆˆๆ—ถ็”Ÿๆˆ่ฏฆ็ป†ๆŠฅๅ‘Š - const report = { - timestamp: new Date().toISOString(), - totalFiles: files.length, - successCount, - failCount, - successRate: `${Math.round(successCount / files.length * 100)}%`, - failedFiles: errorFiles - }; - - // ไฟๅญ˜ๆŠฅๅ‘Š - const reportPath = path.join(process.cwd(), 'upload_report.json'); - await fs.writeFile(reportPath, JSON.stringify(report, null, 2)); - - console.log(`\n๐ŸŽ‰ Upload Complete`); - console.log(` Total Success: ${successCount}`); - console.log(` Total Failed: ${failCount}`); - console.log(` Success Rate: ${Math.round(successCount / files.length * 100)}%`); - console.log(` Detailed report saved to: ${reportPath}`); - if (failCount > 0) { - console.log(` Error log saved to: upload_errors.json`); - } - - } catch (error) { - console.error("โŒ Error in upload process:", error); - await logError('global', error); - } -}; - -// If running directly -if (require.main === module) { - const metadataDir = process.argv[2] || path.join(process.cwd(), 'metadata'); - uploadPdfs(metadataDir).catch(console.error); -} - -module.exports = { - getIrysUploader, - sliceAndUploadPdf, - uploadPdfs -}; diff --git a/3_generate_basic_metadata.js b/3_generate_basic_metadata.js new file mode 100644 index 0000000..26335f5 --- /dev/null +++ b/3_generate_basic_metadata.js @@ -0,0 +1,99 @@ +const fs = require('fs'); +const path = require('path'); +const axios = require('axios'); + +// === Configuration === +const PDF_BASE_DIR = './pdf'; +const OPENALEX_BASE_URL = 'https://api.openalex.org/works/doi:'; +const DELAY_MS = 1500; + +// === CLI Argument Parser === +const args = process.argv.slice(2); +const getArg = (name) => { + const prefix = `--${name}=`; + const found = args.find(arg => arg.startsWith(prefix)); + return found ? parseInt(found.slice(prefix.length), 10) : undefined; +}; +const cliStart = getArg("start-page"); +const cliEnd = getArg("end-page"); + +// === Utilities === +const sleep = (ms) => new Promise((resolve) => setTimeout(resolve, ms)); + +// Convert inverted index to plain abstract text +const parseAbstract = (index) => { + if (!index || typeof index !== 'object') return ''; + const words = []; + for (const [word, positions] of Object.entries(index)) { + positions.forEach(pos => { + words[pos] = word; + }); + } + return words.join(' '); +}; + +// Extract only essential metadata fields +const extractMetadata = (data) => { + const title = data.title || data.display_name || ''; + const authors = (data.authorships || []) + .map(a => a.author?.display_name) + .filter(Boolean) + .join(', '); + const abstract = parseAbstract(data.abstract_inverted_index); + const doi = data.doi?.replace('https://doi.org/', '') || ''; + const aid = data.id?.replace('https://openalex.org/', '') || ''; + return { title, authors, abstract, doi, aid }; +}; + +// Process all PDFs in a single page folder +async function generateMetadataForPage(pageDir) { + const pageNum = pageDir.match(/\d+/)[0]; + console.log(`\n๐Ÿ“ Processing folder: page_${pageNum}`); + + const pdfFiles = fs.readdirSync(pageDir).filter(f => f.endsWith('.pdf')); + const metadataList = []; + + for (const file of pdfFiles) { + const doiEncoded = file.replace(/\.pdf$/, ''); + const doi = decodeURIComponent(doiEncoded); + const openalexUrl = `${OPENALEX_BASE_URL}${doi}`; + + try { + console.log(`๐Ÿ” Fetching metadata for DOI: ${doi}`); + const response = await axios.get(openalexUrl); + const metadata = extractMetadata(response.data); + metadataList.push(metadata); + } catch (error) { + console.warn(`โš ๏ธ Failed to fetch metadata for ${doi}: ${error.message}`); + } + + await sleep(DELAY_MS); + } + + const outputPath = path.join(pageDir, 'basic_metadata.json'); + fs.writeFileSync(outputPath, JSON.stringify(metadataList, null, 2)); + console.log(`โœ… Saved metadata to ${outputPath}`); +} + +// === Main Function === +async function main() { + const subdirs = fs.readdirSync(PDF_BASE_DIR) + .filter(d => d.startsWith('page_')) + .sort((a, b) => parseInt(a.match(/\d+/)[0]) - parseInt(b.match(/\d+/)[0])) + .filter(d => { + const page = parseInt(d.match(/\d+/)[0], 10); + if (cliStart && page < cliStart) return false; + if (cliEnd && page > cliEnd) return false; + return true; + }) + .map(d => path.join(PDF_BASE_DIR, d)) + .filter(d => fs.statSync(d).isDirectory()); + + for (const pageDir of subdirs) { + await generateMetadataForPage(pageDir); + } + + console.log('\n๐ŸŽ‰ Metadata generation completed for all selected folders.'); +} + +main(); diff --git a/3_upload_all_metadata.js b/3_upload_all_metadata.js deleted file mode 100644 index 2947955..0000000 --- a/3_upload_all_metadata.js +++ /dev/null @@ -1,200 +0,0 @@ -require("dotenv").config(); -const { Uploader } = require("@irys/upload"); -const { Solana } = require("@irys/upload-solana"); -const fs = require("fs").promises; -const path = require("path"); - -const getIrysUploader = async () => { - try { - const irysUploader = await Uploader(Solana).withWallet(process.env.PRIVATE_KEY); - console.log("Irys uploader initialized."); - return irysUploader; - } catch (error) { - console.error("Failed to initialize Irys uploader:", error); - return null; - } -}; - -async function walkDir(dir) { - try { - const files = await fs.readdir(dir); - const jsonFiles = files.filter(file => file.toLowerCase().endsWith('.json')); - return jsonFiles.map(file => path.join(dir, file)); - } catch (error) { - console.error('Error reading directory:', error); - throw error; - } -} - -async function uploadMetadata(jsonPath) { - try { - console.log(`\n๐Ÿ“„ Processing metadata: ${path.basename(jsonPath)}`); - - // Read and parse JSON file - const jsonData = await fs.readFile(jsonPath, 'utf8'); - const metadata = JSON.parse(jsonData); - - if (!metadata.doi) { - throw new Error(`No DOI found in metadata file: ${jsonPath}`); - } - - // Check if metadata was already uploaded - const query = ` - query { - transactions( - tags: [ - { name: "Content-Type", values: ["metadata/json"] }, - { name: "App-Name", values: ["scivault"] }, - { name: "Version", values: ["1.0.3"] }, - { name: "doi", values: ["${metadata.doi}"] } - ] - ) { - edges { - node { - id - } - } - } - } - `; - - const response = await fetch("https://uploader.irys.xyz/graphql", { - method: "POST", - headers: { "Content-Type": "application/json" }, - body: JSON.stringify({ query }) - }); - - const result = await response.json(); - if (result.data?.transactions?.edges?.[0]?.node?.id) { - console.log(`โš ๏ธ Metadata already exists for DOI: ${metadata.doi}`); - return result.data.transactions.edges[0].node.id; - } - - // Upload metadata - const irys = await getIrysUploader(); - if (!irys) { - throw new Error("Failed to initialize Irys uploader"); - } - - const tags = [ - { name: "Content-Type", value: "metadata/json" }, - { name: "App-Name", value: "scivault" }, - { name: "Version", value: "1.0.3" } - ]; - - for (const [key, value] of Object.entries(metadata)) { - if (value && typeof value === 'string') { - tags.push({ name: key, value: value }); - } - } - - const receipt = await irys.upload(jsonData, { tags }); - console.log(`โœ… Metadata uploaded: ${receipt.id}`); - return receipt.id; - - } catch (error) { - console.error(`โŒ Error processing metadata: ${error.message}`); - throw error; - } -} - -async function logError(filePath, error, doi = null) { - const errorLogPath = path.join(process.cwd(), 'metadata_upload_errors.json'); - try { - let errorLog = []; - try { - const existingLog = await fs.readFile(errorLogPath, 'utf8'); - errorLog = JSON.parse(existingLog); - } catch (e) { - // File doesn't exist, use empty array - } - - errorLog.push({ - timestamp: new Date().toISOString(), - file: filePath, - doi: doi, - error: error.message || String(error), - stack: error.stack - }); - - await fs.writeFile(errorLogPath, JSON.stringify(errorLog, null, 2)); - console.log(`Error logged to ${errorLogPath}`); - } catch (logError) { - console.error('Failed to log error:', logError); - } -} - -const uploadAllMetadata = async (metadataDir) => { - try { - const files = await walkDir(metadataDir); - console.log(`Found ${files.length} JSON files in ${metadataDir}`); - - let successCount = 0; - let failCount = 0; - let errorFiles = []; - - for (let i = 0; i < files.length; i++) { - const jsonFile = files[i]; - let doi = null; - try { - const jsonData = await fs.readFile(jsonFile, 'utf8'); - const metadata = JSON.parse(jsonData); - doi = metadata.doi; - - await uploadMetadata(jsonFile); - successCount++; - } catch (error) { - failCount++; - await logError(jsonFile, error, doi); - errorFiles.push({ - file: jsonFile, - doi: doi, - error: error.message - }); - } - - if ((i + 1) % 5 === 0 || i === files.length - 1) { - console.log(`\n๐Ÿ“Š Progress Report:`); - console.log(` Success: ${successCount}`); - console.log(` Failed: ${failCount}`); - console.log(` Progress: ${Math.round((i + 1) / files.length * 100)}%`); - } - } - - const report = { - timestamp: new Date().toISOString(), - totalFiles: files.length, - successCount, - failCount, - successRate: `${Math.round(successCount / files.length * 100)}%`, - failedFiles: errorFiles - }; - - const reportPath = path.join(process.cwd(), 'metadata_upload_report.json'); - await fs.writeFile(reportPath, JSON.stringify(report, null, 2)); - - console.log(`\n๐ŸŽ‰ Upload Complete`); - console.log(` Total Success: ${successCount}`); - console.log(` Total Failed: ${failCount}`); - console.log(` Success Rate: ${Math.round(successCount / files.length * 100)}%`); - console.log(` Detailed report saved to: ${reportPath}`); - if (failCount > 0) { - console.log(` Error log saved to: metadata_upload_errors.json`); - } - - } catch (error) { - console.error("โŒ Error in upload process:", error); - await logError('global', error); - } -}; - -if (require.main === module) { - const metadataDir = process.argv[2] || path.join(process.cwd(), 'metadata'); - uploadAllMetadata(metadataDir).catch(console.error); -} - -module.exports = { - getIrysUploader, - uploadMetadata, - uploadAllMetadata -}; diff --git a/4_upload_all_basic_metadata.js b/4_upload_all_basic_metadata.js new file mode 100644 index 0000000..7184ca2 --- /dev/null +++ b/4_upload_all_basic_metadata.js @@ -0,0 +1,131 @@ +require("dotenv").config(); +const { Uploader } = require("@irys/upload"); +const { Solana } = require("@irys/upload-solana"); +const fs = require("fs").promises; +const path = require("path"); + +// === Configuration === +const PDF_BASE_DIR = './pdf'; +const REPORT_FILENAME = 'upload_basic_metadata_report.txt'; + +// === CLI Argument Parser === +const args = process.argv.slice(2); +const getArg = (name) => { + const prefix = `--${name}=`; + const found = args.find(arg => arg.startsWith(prefix)); + return found ? parseInt(found.slice(prefix.length), 10) : undefined; +}; +const cliStart = getArg("start-page"); +const cliEnd = getArg("end-page"); + +// === Initialize Irys uploader === +const getIrysUploader = async () => { + try { + const irysUploader = await Uploader(Solana).withWallet(process.env.PRIVATE_KEY); + console.log("โœ… Irys uploader initialized."); + return irysUploader; + } catch (error) { + console.error("โŒ Failed to initialize Irys uploader:", error); + return null; + } +}; + +// === Upload a single paper === +const uploadOneMetadata = async (irys, paper, pageNum, index) => { + if (!paper.doi) { + console.log(`โš ๏ธ Skipping paper at page ${pageNum}, index ${index}: No DOI`); + return { ok: false, reason: 'no-doi' }; + } + + try { + const normalizedDoi = paper.doi.trim(); + const normalizedTitle = (paper.title || "").replace(/\s+/g, ' ').trim(); + const normalizedAuthors = (paper.authors || "").replace(/\s+/g, ' ').trim(); + + const tags = [ + { name: "App-Name", value: "scivault" }, + { name: "Content-Type", value: "application/json" }, + { name: "Version", value: "2.0.0" }, + { name: "doi", value: normalizedDoi }, + { name: "title", value: normalizedTitle }, + { name: "authors", value: normalizedAuthors }, + { name: "aid", value: paper.aid || "" } + ]; + + const buffer = Buffer.from(JSON.stringify(paper)); + const receipt = await irys.upload(buffer, { tags }); + + console.log(`โœ… Uploaded [page_${pageNum} - ${index}]: ${normalizedDoi} (${receipt.id})`); + return { ok: true, id: receipt.id }; + } catch (err) { + console.error(`โŒ Upload failed [page_${pageNum} - ${index}]: ${paper.doi} - ${err.message}`); + return { ok: false, reason: err.message }; + } +}; + +// === Process one page folder === +const uploadPageFolder = async (irys, pageDir) => { + const pageNum = pageDir.match(/\d+/)?.[0] || '?'; + const metaPath = path.join(PDF_BASE_DIR, pageDir, 'basic_metadata.json'); + const reportPath = path.join(PDF_BASE_DIR, pageDir, REPORT_FILENAME); + + try { + await fs.access(metaPath); + } catch { + console.warn(`โš ๏ธ Skipping page_${pageNum}: no basic_metadata.json`); + return; + } + + const jsonText = await fs.readFile(metaPath, 'utf8'); + const papers = JSON.parse(jsonText); + + console.log(`\n๐Ÿ“„ Found ${papers.length} papers in page_${pageNum}`); + const reportLines = []; + + let success = 0; + let fail = 0; + + for (let i = 0; i < papers.length; i++) { + const result = await uploadOneMetadata(irys, papers[i], pageNum, i); + const doi = papers[i].doi || '[no-doi]'; + + if (result.ok) { + success++; + reportLines.push(`โœ… ${doi} : ${result.id}`); + } else { + fail++; + reportLines.push(`โŒ ${doi} : ${result.reason}`); + } + + if ((i + 1) % 10 === 0 || i === papers.length - 1) { + console.log(`๐Ÿ“Š page_${pageNum} progress: ${i + 1}/${papers.length}, โœ… ${success}, โŒ ${fail}`); + } + } + + await fs.writeFile(reportPath, reportLines.join('\n'), 'utf8'); + console.log(`๐Ÿ“„ Upload report saved: ${reportPath}`); + console.log(`โœจ Finished page_${pageNum}: โœ… ${success}, โŒ ${fail}`); +}; + +// === Main Execution === +(async () => { + const irys = await getIrysUploader(); + if (!irys) return; + + const dirs = await fs.readdir(PDF_BASE_DIR); + const pageDirs = dirs + .filter(d => d.startsWith('page_')) + .sort((a, b) => parseInt(a.match(/\d+/)[0]) - parseInt(b.match(/\d+/)[0])) + .filter(d => { + const page = parseInt(d.match(/\d+/)[0], 10); + if (cliStart && page < cliStart) return false; + if (cliEnd && page > cliEnd) return false; + return true; + }); + + for (const pageDir of pageDirs) { + await uploadPageFolder(irys, pageDir); + } + + console.log('\n๐ŸŽ‰ All basic metadata uploads completed.'); +})(); diff --git a/5_upload_all_pdfs.js b/5_upload_all_pdfs.js new file mode 100644 index 0000000..a38880c --- /dev/null +++ b/5_upload_all_pdfs.js @@ -0,0 +1,168 @@ +require("dotenv").config(); +const { Uploader } = require("@irys/upload"); +const { Solana } = require("@irys/upload-solana"); +const fs = require("fs").promises; +const path = require("path"); + +// === CONFIG === +const BASE_PDF_DIR = path.join(process.cwd(), "pdf"); +const MIN_VALID_SIZE = 1000; // in bytes +const REPORT_PREFIX = "upload_pdf_report"; + +// === CLI === +const args = process.argv.slice(2); +const getArg = (name) => { + const prefix = `--${name}=`; + const found = args.find(arg => arg.startsWith(prefix)); + return found ? parseInt(found.slice(prefix.length), 10) : undefined; +}; +const cliStart = getArg("start-page"); +const cliEnd = getArg("end-page"); + +// === Uploader === +const getIrysUploader = async () => { + try { + const irysUploader = await Uploader(Solana).withWallet(process.env.PRIVATE_KEY); + console.log("โœ… Irys uploader initialized."); + return irysUploader; + } catch (error) { + console.error("โŒ Failed to initialize Irys uploader:", error); + return null; + } +}; + +// === DOI Utilities === +function extractDoiFromFilename(filename) { + const base = path.basename(filename, ".pdf"); + return decodeURIComponent(base).replace(/%2F/g, "/").trim(); +} + +// === Check existing upload === +async function checkIfAlreadyUploaded(doi) { + const query = ` + query { + transactions( + tags: [ + { name: "App-Name", values: ["scivault"] }, + { name: "Content-Type", values: ["application/pdf"] }, + { name: "Version", values: ["2.0.0"] }, + { name: "doi", values: ["${doi}"] } + ] + ) { + edges { + node { id } + } + } + } + `; + + const response = await fetch("https://uploader.irys.xyz/graphql", { + method: "POST", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify({ query }) + }); + + const result = await response.json(); + return result.data?.transactions?.edges?.[0]?.node?.id || null; +} + +// === Upload one PDF === +async function uploadOnePdf(irys, filePath) { + try { + const doi = extractDoiFromFilename(filePath); + if (!doi) throw new Error("Invalid DOI from filename"); + + const alreadyUploaded = await checkIfAlreadyUploaded(doi); + if (alreadyUploaded) { + console.log(`โš ๏ธ Already uploaded: ${doi}`); + return { status: "skip", doi }; + } + + const buffer = await fs.readFile(filePath); + if (buffer.length < MIN_VALID_SIZE) { + throw new Error("File too small (<1KB)"); + } + + const tags = [ + { name: "App-Name", value: "scivault" }, + { name: "Content-Type", value: "application/pdf" }, + { name: "Version", value: "2.0.0" }, + { name: "doi", value: doi } + ]; + + const receipt = await irys.upload(buffer, { tags }); + console.log(`โœ… Uploaded ${doi} - ${receipt.id}`); + return { status: "ok", doi, id: receipt.id }; + } catch (error) { + console.error(`โŒ Failed upload: ${filePath} - ${error.message}`); + return { status: "fail", file: filePath, error: error.message }; + } +} + +// === Process one page folder === +async function processPageFolder(irys, pageDir) { + const pageNum = pageDir.match(/page_(\d+)/)?.[1]; + const files = await fs.readdir(pageDir); + const pdfFiles = files.filter(f => f.endsWith(".pdf")); + + console.log(`๐Ÿ“‚ Processing page_${pageNum} - Found ${pdfFiles.length} PDFs`); + + const result = { ok: [], fail: [], skip: [] }; + + for (let i = 0; i < pdfFiles.length; i++) { + const file = pdfFiles[i]; + const filePath = path.join(pageDir, file); + const res = await uploadOnePdf(irys, filePath); + + if (res.status === "ok") result.ok.push(res); + else if (res.status === "fail") result.fail.push(res); + else if (res.status === "skip") result.skip.push(res); + + if ((i + 1) % 10 === 0 || i === pdfFiles.length - 1) { + console.log(`๐Ÿ“Š Progress: ${i + 1}/${pdfFiles.length}`); + } + } + + // Save report + const report = { + page: `page_${pageNum}`, + timestamp: new Date().toISOString(), + total: pdfFiles.length, + success: result.ok.length, + failed: result.fail.length, + skipped: result.skip.length, + successRate: `${Math.round((result.ok.length / pdfFiles.length) * 100)}%`, + details: result + }; + + const reportPath = path.join(pageDir, `${REPORT_PREFIX}_page_${pageNum}.json`); + await fs.writeFile(reportPath, JSON.stringify(report, null, 2)); + console.log(`๐Ÿ“ Report saved to ${reportPath}`); +} + +// === Main === +(async () => { + const irys = await getIrysUploader(); + if (!irys) return; + + const dirs = await fs.readdir(BASE_PDF_DIR); + const pageDirs = dirs + .filter(d => d.startsWith("page_")) + .sort((a, b) => parseInt(a.match(/\d+/)[0]) - parseInt(b.match(/\d+/)[0])) + .filter(d => { + const page = parseInt(d.match(/\d+/)[0]); + if (cliStart && page < cliStart) return false; + if (cliEnd && page > cliEnd) return false; + return true; + }); + + for (const dir of pageDirs) { + const fullPath = path.join(BASE_PDF_DIR, dir); + const stat = await fs.lstat(fullPath); + if (stat.isDirectory()) { + await processPageFolder(irys, fullPath); + } + } + + console.log("\n๐ŸŽ‰ All PDF uploads completed."); +})(); diff --git a/9_fund.js b/9_fund.js new file mode 100644 index 0000000..2efad66 --- /dev/null +++ b/9_fund.js @@ -0,0 +1,51 @@ +require("dotenv").config(); +const readline = require("readline"); +const { Uploader } = require("@irys/upload"); +const { Solana } = require("@irys/upload-solana"); + +const rl = readline.createInterface({ + input: process.stdin, + output: process.stdout +}); + +const askUser = (question) => { + return new Promise((resolve) => { + rl.question(question, (answer) => { + resolve(answer.trim().toLowerCase()); + }); + }); +}; + +const main = async () => { + try { + const irys = await Uploader(Solana).withWallet(process.env.PRIVATE_KEY); + + const address = irys.address; + const token = irys.token; + + const atomicBalance = await irys.getLoadedBalance(); + const balance = irys.utils.fromAtomic(atomicBalance); + + console.log(`\n๐ŸŒ Public Address: ${address}`); + console.log(`๐Ÿ’ฐ Current Irys Balance: ${balance} ${token}`); + console.log(`๐Ÿ”— Check wallet on Solana Explorer: https://explorer.solana.com/address/${address}?cluster=mainnet`); + + const answer = await askUser("\n๐Ÿช™ Do you want to fund 0.01 SOL to Irys? (yes/no): "); + + if (answer === "yes" || answer === "y") { + const amount = "0.01"; + console.log(`\nโ›ฝ Funding ${amount} SOL to Irys...`); + + const fundResult = await irys.fund(irys.utils.toAtomic(amount)); + console.log(`โœ… Fund successful! Transaction ID: ${fundResult.id}`); + } else { + console.log("โ„น๏ธ Funding skipped."); + } + } catch (err) { + console.error("โŒ Failed to get balance or fund Irys:", err); + } finally { + rl.close(); + } +}; + +main(); diff --git a/README.md b/README.md index c626acd..5677cf8 100644 --- a/README.md +++ b/README.md @@ -1,92 +1,97 @@ -# uploader for SciBox +# ๐Ÿ“„ SciUploader โ€“ Bulk Sci-Hub PDF Downloader -A decentralized academic paper repository system built on Arweave/Irys. +This tool automates the batch download of academic papers from Sci-Hub using DOIs and organizes the PDFs for further metadata processing and decentralized storage Irys. -## Prerequisites +--- -1. Node.js (v16 or higher) -2. Solana wallet with SOL tokens -3. Create a `.env` file with your Solana private key: - ``` - PRIVATE_KEY=your_solana_private_key_here - ``` +## ๐Ÿ“ฆ Project Structure -## Installation +``` +sciuploader/ +โ”œโ”€โ”€ doi/ โ† Each page_N.json contains a list of DOIs +โ”œโ”€โ”€ pdf/ โ† Downloaded PDFs organized by page +โ”œโ”€โ”€ 0_run_workflow.js โ† Run full workflow script +โ”œโ”€โ”€ 1_fetch_all_dois.js โ† Fetch DOI list from external source +โ”œโ”€โ”€ 2_fetch_all_pdfs.js โ† Download PDFs using DOI list +โ”œโ”€โ”€ 3_generate_basic_metadata.js โ† Generate basic metadata JSON +โ”œโ”€โ”€ 4_upload_all_basic_metadata.js โ† Upload metadata to decentralized storage (TBD) +โ”œโ”€โ”€ 5_upload_all_pdfs.js โ† Upload PDFs to decentralized storage (TBD) +โ”œโ”€โ”€ 9_fund.js โ† Funding registration or helper functions +โ”œโ”€โ”€ .env.example โ† Example environment configuration +โ””โ”€โ”€ README.md โ† This file +``` + +--- -1. Clone this repository: - ```bash - git clone https://github.com/yourusername/scivault.git - cd sciuploader - ``` +## โœ… How to Use -2. Install dependencies: - ```bash - npm install - ``` +### 1. Install dependencies -## Usage +```bash +npm install +``` -### Step 0: Prepare Your Data +### 2. Set environment variables (optional) -1. Create a `metadata` folder in the project root -2. Place your metadata JSON files and corresponding PDFs in this folder - - Each PDF should have a matching JSON file with the same name (e.g., `paper1.pdf` and `paper1.json`) - - JSON files must contain a `doi` field -3. Run the metadata generator: - ```bash - node 0_generate_basic_metadata.js - ``` - This will create a `basic_metadata.json` file containing essential paper information. +Copy `.env.example` to `.env` and fill in any required values (e.g., upload keys for later stages). + +--- + +### 3. Run full workflow + +```bash +node 0_run_workflow.js +``` -### Step 1: Upload Basic Metadata +for dividing tasks, +add --start-page=3 --end-page=4 like this, there are total 883431 pages -Upload the basic metadata (title, authors, DOI, etc.): ```bash -node 1_upload_basic_metadata.js +node 0_run_workflow.js --start-page=300000 --end-page=400000 --batch-size=10 ``` -### Step 2: Upload PDFs -Upload PDFs (they will be automatically split into chunks): +Or run step-by-step: + +--- + +### โ—พ๏ธ Step 1: Fetch all DOIs (optional) + ```bash -node 2_upload_pdf.js +node 1_fetch_all_dois.js ``` -Note: If uploads fail due to network issues, you can safely run the script again. It will skip already uploaded files and continue with failed ones. +This fetches DOIs from an API and saves them into `doi/page_N.json` files. + +--- -### Step 3: Upload Complete Metadata +### โ—พ๏ธ Step 2: Download all PDFs -Upload the complete metadata with all paper details: ```bash -node 3_upload_all_metadata.js +node 2_fetch_all_pdfs.js --start-page=1 --end-page=10 ``` -## Version Control +- Failed downloads are logged to `failed_log_page_N.txt` per page. +- Already downloaded and valid files are skipped. -The system uses semantic versioning for content management: -- Current version: `1.0.3` -- Format: `MAJOR.MINOR.PATCH` - - MAJOR: Breaking changes - - MINOR: New features - - PATCH: Bug fixes +--- -When uploading content, ensure you're using the correct version in the tags. +### โ—พ๏ธ Step 3: Generate basic metadata -## Error Handling +```bash +node 3_generate_basic_metadata.js +``` -- Each upload script generates detailed logs: - - `upload_report.json`: Summary of upload results - - `upload_errors.json`: Details of failed uploads -- Failed uploads can be retried by running the script again -- The system checks for existing uploads to avoid duplicates +--- -## Web Interface +### โ—พ๏ธ Step 4 & 5: Upload -The `queryweb` folder contains a simple web interface for searching and viewing papers: -- Search by DOI, title, or arXiv ID -- View paper metadata -- Download PDF files +```bash +node 4_upload_all_basic_metadata.js +node 5_upload_all_pdfs.js +``` +--- -## License +## ๐Ÿ“œ License -MIT \ No newline at end of file +MIT \ No newline at end of file diff --git a/package.json b/package.json index 06f3ca5..03008c1 100644 --- a/package.json +++ b/package.json @@ -12,13 +12,17 @@ "dependencies": { "@irys/upload": "^0.0.14", "@irys/upload-solana": "^0.1.7", + "axios": "^1.9.0", "bignumber.js": "^9.1.2", "cors": "^2.8.5", "dotenv": "^16.4.7", "express": "^4.17.1", + "jsdom": "^26.1.0", + "minimist": "^1.2.8", "node-fetch": "^2.7.0", "pdf-lib": "^1.17.1", - "pdfkit": "^0.16.0" + "pdfkit": "^0.16.0", + "puppeteer": "^24.10.0" }, "keywords": [ "arweave", @@ -28,5 +32,13 @@ "decentralized" ], "author": "SciVault", - "license": "MIT" -} \ No newline at end of file + "license": "MIT", + "repository": { + "type": "git", + "url": "git+https://github.com/Scihub-Community/sciuploader.git" + }, + "bugs": { + "url": "https://github.com/Scihub-Community/sciuploader/issues" + }, + "homepage": "https://github.com/Scihub-Community/sciuploader#readme" +}