Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions dspace/bin/load-etd
Original file line number Diff line number Diff line change
Expand Up @@ -45,9 +45,11 @@ $ENV{CLASSPATH} .= $classpath_separator.$prev_classpath if ($prev_classpath ne "

#print $ENV{JAVA_OPTS};
#print (join ' ',@cmd) . "\n";
system(@cmd);

exit 0;
# Using ">> 8" to recover the actual Java exit status code
$exit_status = system(@cmd) >> 8;

exit $exit_status;


########################################################## GetCmdLine
Expand Down
13 changes: 13 additions & 0 deletions dspace/bin/load-etd-nightly
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@ bindir=$(dirname "$0")
incomingdir="$datadir/incoming"
processeddir="$datadir/processed"

error_occurred=0

# Check for incoming files
if ls "$incomingdir"/etdadmin_upload_*.zip &> /dev/null; then
echo "Files found in $incomingdir"
Expand All @@ -29,8 +31,17 @@ if ls "$incomingdir"/etdadmin_upload_*.zip &> /dev/null; then
echo
echo "======================================================================"
echo "Loading archive file: $incomingdir/$zipfile"

"$bindir/load-etd" -i "$incomingdir/$zipfile"

# If an error occurs, continue with the next item, leaving the file with
# the error in the "incoming" directory.
if [ $? -gt 0 ]; then
echo "Error: Failed to load $zipfile. Continuing with the next file."
error_occurred=1
continue
fi

# Move archive to the processed directory
if [ ! -d "$processeddir" ]; then
mkdir -p "$processeddir"
Expand All @@ -40,3 +51,5 @@ if ls "$incomingdir"/etdadmin_upload_*.zip &> /dev/null; then
mv "$incomingdir/$zipfile" "$processeddir"
done
fi

exit $error_occurred
18 changes: 16 additions & 2 deletions dspace/bin/script-mail-wrapper
Original file line number Diff line number Diff line change
Expand Up @@ -65,10 +65,24 @@ echo LOG_FILE_PATH=\'$LOG_FILE_PATH\'
echo SCRIPT_ARGUMENTS=\'$@\'

# Call the script being wrapped
$SCRIPT "$@" 2>&1 | tee "$LOG_FILE_PATH" || true

# Temporary file to store the exit code from the subshell
EXIT_CODE_FILE=$(mktemp)
{
$SCRIPT "$@"
echo $? > "$EXIT_CODE_FILE"
} 2>&1 | tee "$LOG_FILE_PATH"

SCRIPT_EXIT_CODE=$(cat "$EXIT_CODE_FILE")
rm "$EXIT_CODE_FILE"

SUBJECT_LIST="$SCRIPT_BASENAME: $SERVER_TYPE"
if [ $SCRIPT_EXIT_CODE -ne 0 ]; then
SUBJECT_LIST="$SCRIPT_BASENAME: $SERVER_TYPE - ERROR(S) OCCURRED"
fi

# Mail the log, passing all non-JSON unchanged, and filtering out DEBUG messages
jq -R -r '. as $line | try (fromjson | select(."log.level" != "DEBUG") | .message) catch $line' $LOG_FILE_PATH | \
$MAIL_SCRIPT_DIR/mail -s "$SCRIPT_BASENAME: $SERVER_TYPE" "$EMAIL_ADDRESS"
$MAIL_SCRIPT_DIR/mail -s "$SUBJECT_LIST" "$EMAIL_ADDRESS"

echo Done running `basename $0` script
3 changes: 3 additions & 0 deletions dspace/config/local.cfg.EXAMPLE
Original file line number Diff line number Diff line change
Expand Up @@ -405,6 +405,9 @@ drum.eperson.subscription.limiteperson =
drum.etdloader.eperson = load_diss@drum.umd.edu
# UUID of "UMD Theses and Dissertations" collection
drum.etdloader.collection = ba3ddc3f-7a58-4fd3-bde5-304938050ea2
# Maximum (uncompressed) size of an entry in an ETD Zip file (in bytes)
# Comment out, or use -1 for unlimited
drum.etdloader.maxFileSize=15032385536

# Environment Banner configuration
# Leave blank on production environment
Expand Down
2 changes: 1 addition & 1 deletion dspace/docs/DrumEmbargoAndAccessRestrictions.md
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ system simply relies on those administrators maintaining both policies.

When ingesting ETD items from ProQuest, the bitstreams will either have no
embargo, or a specific date for lifting the embargo. For embargoed items, the
ETD loaded automatically adds both policies.
ETD loader automatically adds both policies.

### Embargo List

Expand Down
130 changes: 130 additions & 0 deletions dspace/docs/DrumEtdLoader.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
# DRUM ETD Loader

## Introduction

The DRUM ETD Loader is UMD custom functionality for processing files uploaded
from ProQuest into DRUM.

"ETD" stands for "electronic theses and dissertations".

## ETD Workflow

ProQuest periodically uploads Zip files to DRUM via SFTP to a specific
"incoming" directory for processing. ProQuest sends an email to
"<lib-drum@umd.edu>" with a list of the ETD files that were delivered
(or failed to deliver).

Each Zip file contains

* An XML file containing the metadata for the thesis/dissertation
* One or more PDF files

The "load-etd-nightly" cron job processes each Zip file in the "incoming"
directory, adding them to DRUM. Successfully processed Zip files are moved to a
"processed" directory so that they are not processed again.

Upon completion, the "load-etd-nightly" sends an email of the log messages
generated by the cron job.

If an error occurs when processing a Zip file, the Zip file will be "skipped"
and remain in the "incoming" directory, and will be processed again on the next
cron run.

## ETD Loader Components

The ETD Loader functionality consists of:

* an SFTP server for receiving files from ProQuest
* The "load-etd-nightly"/"load-etd" scripts that load the Zip files
* Java classes in the DSpace "additions" modules
* Angular components in the "umd-lib/dspace-angular" repository supporting
the creation/editing/deletion of "ETD Departments".
* A special "dspace/config/log4j2-etdloader.xml" Log4J configuration for
controlling the log format
* Configuration properties in "local.cfg"

## Related Documentation

* [DrumCronTasks.md](DrumCronTasks.md) - contains information about the
"load-etd-nightly" cron job that loads the Zip files received from ProQuest.
* [DrumEmbargoAndAccessRestrictions.md](DrumEmbargoAndAccessRestrictions.md) -
for information on embargo functionality.
* [DrumLogging.md](DrumLogging.md) - contains information pertaining to the ETD
logging functionality and email.
* [DrumTestPlan.md](DrumTestPlan.md) - contains test steps for verifying the
"ETD Departments" CRUD functionality, and SFTP connectivity.
* [dspace/src/main/docker/README.md](../src/main/docker/README.md) - contains
information about the SFTP Docker container

## ETD Departments

----

**Note**: "ETD Departments" is the human-friendly GUI-based name -- the
Java and Angular source code uses "ETD Units".

----

The XML metadata provided by ProQuest includes one (or more) "DISS_inst_contact"
entries, for example:

```xml
<?xml version="1.0" encoding="ISO-8859-1"?>
<DISS_submission publishing_option="0" embargo_code="0" third_party_search="Y">
...
<DISS_description ...>
...
<DISS_institution>
...
<DISS_inst_contact>English Language and Literature</DISS_inst_contact>
```

Each "DISS_inst_contact" must match an existing "ETD Department" in DRUM, which
is used to map the ETD into the appropriate DRUM collection.

Each ETD is also added to the DRUM collection specified in the
"drum.etdloader.collection" configuration property.

## ETD Loader Configuration Properties

The following properties are used to configure the ETD Loader.

### drum.etdloader.collection

The UUID of the collection that all ETD submissions are added to (in addition
to the collection specified in the "DISS_inst_contact" XML property).

### drum.etdloader.eperson

The email address of the DRUM EPerson used to load the ETD submissions.

### drum.etdloader.maxFileSize

Operational parameter that sets a limit (in bytes) on the size of files that
can be processed by the ETD Loader.

This parameter is necessary to prevent the ETD Loader from uncompressing a
Zip file entry that exceeds the resource limit of the "drum-cron-ephemeral-vol"
ephemeral volume in Kubernetes (which would cause the pod to reboot).

If a Zip file contains an entry that exceeds the limit, the entire file will
be skipped, and a message added to the ETD log (and email).

This parameter is optional -- if not set (or set to "-1") no file size limit
will be enforced.

### drum.mail.etd.recipient

Email address that receives the output message from the ETD Loader.

### drum.mail.duplicate_title

Email address that receives notifications of duplicate titles from the ETD
Loader.

## SFTP

A ProQuest-provided public key is added to the SFTP configuration to enable
ProQuest to upload files.

See the "docs/Secrets.md" document in the "umd-lib/k8s-drum" repository.
2 changes: 2 additions & 0 deletions dspace/docs/DrumFeatures.md
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@ information.

## Electronic Theses and Dissertations (ETD)

See [dspace/docs/DrumEtdLoader.md](DrumEtdLoader.md) for additional information.

* LIBDRUM-671 - "ETD Department" CRUD functionality
* LIBDRUM-680 - Loader for loading ProQuest ETDs into DRUM
* transform ProQuest metadata to dublin core
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,12 @@ public class EtdLoader {

private static Logger log = org.apache.logging.log4j.LogManager.getLogger(EtdLoader.class);

/**
* Configuration property for setting the maximum file size that can
* be processed.
*/
public static final String MAX_FILE_SIZE_CONFIG_PROP = "drum.etdloader.maxFileSize";

// Suppress default constructor
private EtdLoader() {
}
Expand All @@ -147,6 +153,10 @@ private EtdLoader() {

static EPerson etdeperson = null;

// Maximum ZipEntry file size that can be processed. Defaults to -1, which
// is unlimited.
static long maxFileSizeInBytes = -1L;

static SimpleDateFormat format = new SimpleDateFormat("MM/dd/yyyy");

static Pattern pZipEntry = Pattern
Expand Down Expand Up @@ -195,7 +205,7 @@ private EtdLoader() {
*/

public static void main(String args[]) throws Exception {

boolean hasError = false;
try {

// Properties
Expand All @@ -211,6 +221,9 @@ public static void main(String args[]) throws Exception {
String strCollection = configurationService
.getProperty("drum.etdloader.collection");

String maxFileSizeStr = configurationService
.getProperty(MAX_FILE_SIZE_CONFIG_PROP, "-1");

log.info("DSpace directory : " + strDspace);
log.info("ETD Loaeder Eperson : " + strEPerson);
log.info("ETD Loader Collection: " + strCollection);
Expand Down Expand Up @@ -242,6 +255,19 @@ public static void main(String args[]) throws Exception {
+ strEPerson);
}

if ((maxFileSizeStr == null) || maxFileSizeStr.isBlank()) {
throw new Exception(MAX_FILE_SIZE_CONFIG_PROP + " not set");
}
try {
maxFileSizeInBytes = Long.parseLong(maxFileSizeStr);
} catch (NumberFormatException nfe) {
throw new Exception(
"%s of '%s' is not parseable as an integer".formatted(
MAX_FILE_SIZE_CONFIG_PROP, maxFileSizeStr
)
);
}

// Open the zipfile
ZipFile zip = new ZipFile(new File(strZipFile), ZipFile.OPEN_READ);

Expand All @@ -261,13 +287,24 @@ public static void main(String args[]) throws Exception {
}

context.complete();
} catch (ZipEntryTooLarge zetl) {
log.error(zetl.getMessage());
hasError = true;
} catch (Exception e) {
log.error("Uncaught exception: " + e.getMessage(), e);
hasError = true;
} finally {
log.info("=====================================\n"
+ "Records read: " + lRead + "\n" + "Records written: "
+ lWritten + "\n" + "Embargoes: " + lEmbargo);
}

// Exit with a status code of 1 if an error has occurred, to signal to
// the "load-etd" script that the item was not successfully processed.
if (hasError) {
log.error("Exiting with return code of 1");
System.exit(1);
}
}

/******************************************************** addBitstreams */
Expand Down Expand Up @@ -790,6 +827,24 @@ public static Map readItems(ZipFile zip) {

Matcher m = pZipEntry.matcher(s[0]);
if (m.matches()) {
if (!isFileSizeWithinLimit(ze, maxFileSizeInBytes)) {
long uncompressedSize = ze.getSize();
String msg = """
===============================================
ERROR: Zip file entry too large

The file '%s' in '%s'
is too large at %d bytes, exceeding the limit
of %d bytes set in the '%s'
configuration property.
Skipping.
===============================================
""".formatted(
strFileName, zip.getName(), uncompressedSize,
maxFileSizeInBytes, MAX_FILE_SIZE_CONFIG_PROP
);
throw new ZipEntryTooLarge(msg);
}

// Get the item number
if (strItem == null) {
Expand Down Expand Up @@ -818,6 +873,27 @@ public static Map readItems(ZipFile zip) {
return map;
}

/**
 * Checks whether the size reported by a ZipEntry fits within a maximum
 * file size.
 *
 * The limit is typically supplied from the MAX_FILE_SIZE_CONFIG_PROP
 * configuration parameter.
 *
 * Note: the comparison uses the value returned by ZipEntry.getSize(), which
 * is -1 when the size is not known; such an entry therefore passes the
 * check for any non-negative limit.
 *
 * @param ze the ZipEntry to examine. Not consulted when the limit is
 * negative.
 * @param maxFileSizeInBytes the maximum allowed file size in bytes. Any
 * negative value (conventionally -1) means no limit is enforced.
 * @return true if no limit is enforced, or if the size reported by the
 * entry is less than or equal to the limit; false otherwise.
 */
protected static boolean isFileSizeWithinLimit(ZipEntry ze, long maxFileSizeInBytes) {
    // A negative limit disables the check entirely.
    boolean unlimited = maxFileSizeInBytes < 0;

    // Short-circuit keeps the entry untouched when no limit applies.
    return unlimited || ze.getSize() <= maxFileSizeInBytes;
}

/**************************************************** reportCollections */
/**
* Report missing mapped collections
Expand Down Expand Up @@ -888,3 +964,13 @@ public static String toString(Document doc) throws java.io.IOException {
}

}

/**
 * Exception thrown when the uncompressed size of a ZipEntry in a Zip file
 * exceeds the size specified in MAX_FILE_SIZE_CONFIG_PROP.
 *
 * This is an unchecked exception (it extends RuntimeException), so methods
 * that throw it do not need to declare it.
 */
class ZipEntryTooLarge extends RuntimeException {
    // RuntimeException is Serializable; declare the version explicitly so the
    // serialized form does not depend on a compiler-generated default.
    private static final long serialVersionUID = 1L;

    /**
     * Constructs the exception with a description of the oversized entry.
     *
     * @param message the detail message, later available via getMessage()
     */
    public ZipEntryTooLarge(String message) {
        super(message);
    }
}
Loading