Skip to content

Commit

Permalink
Merge pull request #10 from DataDog/christophe.tafanidereeper/auto-sync
Browse files Browse the repository at this point in the history
Add first version of automated script to synchronize malicious packages (PyPI only)
  • Loading branch information
christophetd authored May 30, 2024
2 parents 8cecdee + 1c71dc9 commit 14162f6
Show file tree
Hide file tree
Showing 4 changed files with 152 additions and 0 deletions.
60 changes: 60 additions & 0 deletions .github/workflows/sync-malicious-packages.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
# Synchronizes malicious package samples from S3 into this repository and
# opens a pull request containing the new samples.
# NOTE(review): the workflow is still called "Deploy"; the name is kept because
# badges or branch-protection rules may reference it — consider renaming.
name: Deploy

on:
  push:
    branches:
      - main
  workflow_dispatch:


permissions:
  # contents / pull-requests: the last step commits samples and opens a PR.
  contents: write
  pull-requests: write
  # id-token: lets the job request an OIDC token to assume the IAM role below.
  id-token: write

jobs:
  synchronize-malicious-packages:
    name: 'Synchronize malicious packages'
    runs-on: ubuntu-latest

    steps:
      - uses: actions/checkout@v4

      - uses: actions/setup-python@v5
        with:
          python-version: '3.12'

      - name: Install AWS CLI
        # --update makes the installer succeed whether or not an AWS CLI v2
        # installation is already present on the runner; without it the
        # installer aborts when it finds an existing installation.
        run: |
          curl https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip -o awscliv2.zip
          unzip awscliv2.zip
          sudo ./aws/install --update
      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@v4
        with:
          aws-region: eu-west-3
          role-to-assume: ${{ secrets.IAM_ROLE_ARN }}
          role-session-name: github-actions

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r scripts/sync-malicious-packages/requirements.txt
      - name: Synchronize malicious packages
        # Only packages scanned during the last two months are considered.
        run: |
          python scripts/sync-malicious-packages/sync-malicious-packages.py --since $(date -d "2 month ago" +%Y-%m-%d) --destination ./samples/pypi --s3-bucket ${{ secrets.S3_BUCKET_NAME }} --dynamodb-table ${{ secrets.DYNAMODB_TABLE_NAME }} --ecosystem pypi
          python scripts/update-count.py
      - name: Create Pull Request
        uses: peter-evans/create-pull-request@v6
        with:
          add-paths: |
            samples/
            README.md
          commit-message: "Auto-synchronize malicious packages"
          delete-branch: true
          title: "[Bot] Auto-synchronize malicious packages"
          labels: sync
          branch-suffix: timestamp
Empty file.
1 change: 1 addition & 0 deletions scripts/sync-malicious-packages/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
boto3
91 changes: 91 additions & 0 deletions scripts/sync-malicious-packages/sync-malicious-packages.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
import argparse
import boto3
from datetime import datetime
import os
from pathlib import Path
import time
import subprocess
import shutil

def _iso_date(value):
    """argparse ``type`` callback: accept ``value`` only if it is a YYYY-MM-DD date.

    The string is returned unchanged because downstream code expects the raw
    text, not a ``datetime``.
    """
    try:
        # Same parsing expression the sync routine applies later, so anything
        # accepted here is guaranteed to be accepted there as well.
        datetime.strptime(value + " 00:00:00", '%Y-%m-%d %H:%M:%S')
    except ValueError:
        raise argparse.ArgumentTypeError(
            f"invalid date {value!r}, expected YYYY-MM-DD"
        ) from None
    return value


def parse_arguments(argv=None):
    """Parse the command-line options of the synchronization script.

    Args:
        argv: Optional list of argument strings. When ``None`` (the default),
            argparse reads ``sys.argv[1:]``.

    Returns:
        An ``argparse.Namespace`` with the attributes ``ecosystem``, ``since``,
        ``destination``, ``s3_bucket`` and ``dynamodb_table`` (all strings).

    Raises:
        SystemExit: raised by argparse (exit status 2) when a required option
            is missing or ``--since`` is not a valid YYYY-MM-DD date.
    """
    parser = argparse.ArgumentParser(description='Download malicious samples from S3')
    parser.add_argument('--ecosystem', help='npm or pypi', default='pypi')
    # Validate the date up front so a typo is reported as a usage error
    # instead of a traceback after AWS resources have been set up.
    parser.add_argument('--since', help='Date in YYYY-MM-DD format', required=True, type=_iso_date)
    parser.add_argument('--destination', help='Where to store the resulting ZIP', required=True)
    # Both AWS locations are mandatory: without them the script would query a
    # table named None and build "s3://None/..." URLs.
    parser.add_argument('--s3-bucket', help='S3 bucket containing the samples', required=True)
    parser.add_argument('--dynamodb-table', help='DynamoDB table containing the scan results', required=True)
    return parser.parse_args(argv)

def _scan_malicious_items(table, ecosystem, since_ts):
    """Return all items of ``table`` marked malicious for ``ecosystem`` with scan_timestamp >= ``since_ts``."""
    scan_kwargs = {
        "FilterExpression": "ecosystem = :ecosystem AND triage_state = :state AND scan_timestamp >= :cutoff_timestamp",
        "ExpressionAttributeValues": {
            ":ecosystem": ecosystem,
            ":state": "malicious",
            ":cutoff_timestamp": since_ts,
        },
    }
    items = []
    while True:
        response = table.scan(**scan_kwargs)
        items.extend(response['Items'])
        # Keep requesting pages for as long as the response carries a
        # continuation key.
        if 'LastEvaluatedKey' not in response:
            return items
        scan_kwargs["ExclusiveStartKey"] = response['LastEvaluatedKey']


def _run_or_exit(command, cwd, error_prefix, show_command=False):
    """Run ``command`` inside ``cwd``; on a non-zero exit status print diagnostics and exit with status 1."""
    try:
        subprocess.run(command, check=True, capture_output=True, cwd=cwd)
    except subprocess.CalledProcessError as e:
        print(error_prefix + str(e))
        if show_command:
            print("Command: " + " ".join(command))
        print(e.stderr)
        # SystemExit is raised directly: the exit() helper is injected by the
        # site module and is not guaranteed to be available.
        raise SystemExit(1)


def query_and_download_items(ecosystem, cutoff_date, dest, dynamodb_table, s3_bucket):
    """Download malicious package samples and store each one as an encrypted ZIP.

    The DynamoDB table is scanned for items of ``ecosystem`` whose
    ``triage_state`` is "malicious" and whose ``scan_timestamp`` is at or after
    midnight of ``cutoff_date``. For every match, the S3 prefix
    ``<ecosystem>/<scan date>/<package_name>/<package_version>/`` is synced with
    the ``aws`` CLI into a temporary folder under ``dest``, archived with the
    ``zip`` CLI (password "infected") as
    ``<scan date>-<package name>-v<version>.zip``, and the folder is removed.
    Packages whose ZIP already exists in ``dest`` are skipped.

    All file operations use absolute paths, so the working directory of the
    calling process is left untouched and a relative ``dest`` is resolved
    exactly once.

    Args:
        ecosystem: Ecosystem name used in the scan filter and the S3 prefix.
        cutoff_date: Earliest scan date to include, as a YYYY-MM-DD string.
        dest: Directory receiving the ZIP files (created if missing).
        dynamodb_table: Name of the DynamoDB table holding the scan results.
        s3_bucket: Name of the S3 bucket holding the samples.

    Raises:
        ValueError: if ``cutoff_date`` or an item's ``scan_datetime`` does not
            match the expected format.
        SystemExit: with status 1 when the ``aws`` or ``zip`` command fails.
    """
    table = boto3.resource('dynamodb').Table(dynamodb_table)

    # Convert the date to a timestamp (time.mktime interprets it as local time)
    since = datetime.strptime(cutoff_date + " 00:00:00", '%Y-%m-%d %H:%M:%S')
    since_ts = round(time.mktime(since.timetuple()))

    results = _scan_malicious_items(table, ecosystem, since_ts)
    print("Syncing samples of " + str(len(results)) + " packages")

    dest_dir = Path(dest).resolve()
    for item in results:
        # Only the day of the scan is used in folder names and S3 prefixes
        scan_datetime = datetime.strptime(item['scan_datetime'], '%Y-%m-%d %H:%M:%S.%f')
        formatted_date = scan_datetime.strftime('%Y-%m-%d')

        s3_prefix = f'{ecosystem}/{formatted_date}/{item["package_name"]}/{item["package_version"]}/'
        # Flatten the package name so it maps to a single path component
        package_name = item["package_name"].replace("/", "_").replace("npm|", "")
        package_identifier = f'{package_name}-v{item["package_version"]}'
        local_folder = f'{formatted_date}-{package_identifier}'
        zip_file = f'{local_folder}.zip'

        # Check for an existing archive before creating anything, so packages
        # that are already synced leave no empty folder behind.
        if (dest_dir / zip_file).is_file():
            continue

        work_dir = dest_dir / local_folder
        work_dir.mkdir(parents=True, exist_ok=True)

        # Download the folder from S3
        s3_url = f"s3://{s3_bucket}/{s3_prefix}"
        print(f"Downloading files for {package_identifier}")
        _run_or_exit(
            ['aws', 's3', 'sync', s3_url, local_folder],
            dest_dir,
            "Unable to download: ",
            show_command=True,
        )

        # Zip and encrypt the folder
        # We spawn zip because no way to encrypt with the standard ZipFile library...
        # Relative names + cwd keep the archive entries rooted at local_folder.
        _run_or_exit(
            ["zip", "--encrypt", "-r", "-P", "infected", zip_file, local_folder],
            dest_dir,
            "Unable to ZIP: ",
        )
        print("Wrote new ZIP file " + zip_file)
        shutil.rmtree(work_dir, ignore_errors=True)

# Script entry point: read the CLI options and run the synchronization.
if __name__ == "__main__":
    cli_options = parse_arguments()
    query_and_download_items(
        ecosystem=cli_options.ecosystem,
        cutoff_date=cli_options.since,
        dest=cli_options.destination,
        dynamodb_table=cli_options.dynamodb_table,
        s3_bucket=cli_options.s3_bucket,
    )

0 comments on commit 14162f6

Please sign in to comment.