Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
92 changes: 52 additions & 40 deletions scripts/retrieve_ci_failures.py
Original file line number Diff line number Diff line change
Expand Up @@ -282,6 +282,48 @@ def diff_failure_vs_fix(repo, failure_commits, fix_commits):
return None


def process_diff(bug_id, obj, upload, repo_path):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would add a docstring to explain the returned value since it is not straightforward and could be confusing. Also, I would add a type hint for it as well.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Using a descriptive function name would be helpful.

diff_path = os.path.join("data", "ci_failures_diffs", f"{bug_id}.diff")
diff_zst_path = f"{diff_path}.zst"

if os.path.exists(diff_path) or os.path.exists(diff_zst_path):
return (0, 0)

if upload and utils.exists_s3(diff_zst_path):
return (0, 0)

try:
diff = diff_failure_vs_fix(
repo_path,
[
utils.hg2git(commit["node"])
for commit in obj["commits"]
if commit["backedoutby"]
],
[
utils.hg2git(commit["node"])
for commit in obj["commits"]
if not commit["backedoutby"] and not commit["backsout"]
],
)
except requests.exceptions.HTTPError as e:
logger.error("Failure mapping hg commit hash to git commit hash %s", e)
return (1, 0)

if diff is not None and len(diff) > 0:
with open(diff_path, "wb") as f:
f.write(diff)

utils.zstd_compress(diff_path)

os.remove(diff_path)

if upload:
utils.upload_s3([diff_zst_path])
Comment on lines +314 to +322
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It will return None in this path, but the caller expects tuple(int, int).

Comment on lines +314 to +322
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would keep the happy path to the lift where possible, that is, I would flip the condition and return early for the falsy value, i.e., (0, 1). This way, we won't need to use else. That was not possible with the previous implementation, but now that the logic is wrapped in a function, it is better to do so.

else:
return (0, 1)


def generate_diffs(repo_url, repo_path, fixed_by_commit_pushes, upload):
if not os.path.exists(repo_path):
for attempt in tenacity.Retrying(
Expand All @@ -298,48 +340,18 @@ def generate_diffs(repo_url, repo_path, fixed_by_commit_pushes, upload):

diff_errors = 0
mapping_errors = 0
for bug_id, obj in tqdm(
fixed_by_commit_pushes.items(), total=len(fixed_by_commit_pushes)
):
diff_path = os.path.join("data", "ci_failures_diffs", f"{bug_id}.diff")
diff_zst_path = f"{diff_path}.zst"
if os.path.exists(diff_path) or os.path.exists(diff_zst_path):
continue
diffs = [(bug_id, obj) for bug_id, obj in fixed_by_commit_pushes.items()]

if upload and utils.exists_s3(diff_zst_path):
continue

try:
diff = diff_failure_vs_fix(
repo_path,
[
utils.hg2git(commit["node"])
for commit in obj["commits"]
if commit["backedoutby"]
],
[
utils.hg2git(commit["node"])
for commit in obj["commits"]
if not commit["backedoutby"] and not commit["backsout"]
],
)
except requests.exceptions.HTTPError as e:
logger.error("Failure mapping hg commit hash to git commit hash %s", e)
mapping_errors += 1
continue

if diff is not None and len(diff) > 0:
with open(diff_path, "wb") as f:
f.write(diff)

utils.zstd_compress(diff_path)

os.remove(diff_path)
with ThreadPoolExecutor() as executor:
futures = [
executor.submit(process_diff, bug_id, obj, upload, repo_path)
for bug_id, obj in diffs
]

if upload:
utils.upload_s3([diff_zst_path])
else:
diff_errors += 1
for future in tqdm(as_completed(futures), totla=len(futures)):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Shouldn't we use as_completed() within the ThreadPoolExecutor() context?

mapping_error, diff_error = future.result()
mapping_errors += mapping_error
diff_errors += diff_error

logger.info(f"Failed mapping {mapping_errors} hashes")
logger.info(f"Failed generating {diff_errors} diffs")
Expand Down