Skip to content

Human evaluation #22

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 19 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
90 changes: 90 additions & 0 deletions human/analyze.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
import html
import json
from collections import defaultdict
from csv import DictReader

# Maps (participant code, zero-based page index, criterion, option) to a
# rating code.  criterion is 'c' (consistency) or 'f' (fluency); option is
# 'a', 'b' or 'c'; the rating code is 'm' (picked as MOST), 'l' (picked as
# LEAST) or 'n' (the remaining option, picked as neither).
code_to_answers = {}

# newline='' is what the csv module requires so that it can handle line
# endings itself; this file has quoted fields with embedded line breaks.
with open('anon_responses.csv', newline='') as f:
    for record in DictReader(f):
        # Column headers can carry stray whitespace (the participant-code
        # header ends with a line break), so normalise them before lookup.
        # DictReader stores surplus cells of an over-long row under the key
        # None; that entry is dropped because it has no column name.
        record = {k.strip(): v for k, v in record.items() if k is not None}
        pc = record['PARTICIPANT CODE']
        for i in range(10):
            for c, consistency, consistent in [
                    ['c', 'CONSISTENCY', 'CONSISTENT'],
                    ['f', 'FLUENCY', 'FLUENT']]:
                most_least = []
                for most in ['MOST', 'LEAST']:
                    key = f'PAGE {i+1}, {consistency} [{most} {consistent}?]'
                    # A short row yields None for its missing cells.
                    most_least.append((record[key] or '').strip().lower())
                # The "neither" option can only be inferred when the two
                # answers are distinct, valid options.  Otherwise the set
                # difference below has several members and picking one would
                # be arbitrary, so the whole question is left unrated.
                if (len(set(most_least)) != 2
                        or not set(most_least) <= set('abc')):
                    continue
                most_choice, least_choice = most_least
                code_to_answers[pc, i, c, most_choice] = 'm'
                code_to_answers[pc, i, c, least_choice] = 'l'
                (missing,) = set('abc') - set(most_least)
                code_to_answers[pc, i, c, missing] = 'n'

# Ground truth describing what every participant was shown; each entry is
# later looked up by its 'participant' and 'page_N_*' keys.
with open('ground_truth_0.json') as truth_file:
    data = json.loads(truth_file.read())

# Display label used in the ground truth -> short key used in this script
# and in the HTML template placeholders.
mk = dict([
    ('ROME', 'rome'),
    ('GPT-2 XL', 'gpt'),
    ('FT_L', 'ft_l'),
])

# '<ea>_vs_<eb>_<criterion>' -> number of times ea was ranked above eb.
two_way_comparison = defaultdict(int)
# (label, criterion, rating) -> participant codes that gave that rating.
label_to_ratings = defaultdict(list)
# fname -> dict of fields handed to the per-case HTML template.
case_to_data = defaultdict(dict)
# (fname, label, criterion, rating) -> participant codes that gave it.
case_to_ratings = defaultdict(list)

def ranking(rating):
    """Return the ordinal rank of a rating code (lower is better).

    'm' (most) ranks 1, 'n' (neither) ranks 2 and 'l' (least) ranks 3.
    Any other value raises KeyError.
    """
    rank_by_code = {'m': 1, 'n': 2, 'l': 3}
    return rank_by_code[rating]

# Engine pairs compared head-to-head.  The counter for a pair is bumped when
# the first engine is ranked strictly better than the second.
engine_pairs = [('rome', 'ft_l'), ('rome', 'gpt'), ('ft_l', 'gpt')]

# Join each participant's survey answers (code_to_answers) with the ground
# truth that says which engine produced passage a/b/c on every page, and
# accumulate per-case and per-engine tallies.
for record in data:
    pc = record['participant']
    for i in range(10):
        page = f'page_{i+1}'
        fname = record[f'{page}_fname']
        cfact = record[f'{page}_counterfactual']
        case = case_to_data[fname]
        case['fname'] = fname
        case['counterfactual'] = cfact
        # Ratings this participant gave on this page, keyed by
        # (engine label, criterion).  Kept local so the head-to-head
        # comparison below never sees another participant's values.
        page_ratings = {}
        for a in 'abc':
            passage = record[f'{page}_passage_{a}']
            label = mk[record[f'{page}_passage_{a}_label']]
            case[f'passage_{label}'] = passage
            votes = []
            for c in 'cf':
                # None when this participant has no usable answer here.
                rating = code_to_answers.get((pc, i, c, a))
                case[f'rating_{label}_{c}'] = rating
                page_ratings[label, c] = rating
                if rating is not None:
                    case_to_ratings[fname, label, c, rating].append(pc)
                    label_to_ratings[label, c, rating].append(pc)
                # Rebuild the vote listing from everything seen so far, so
                # the last participant processed leaves the complete list.
                for m in 'ml':
                    voters = case_to_ratings[fname, label, c, m]
                    if voters:
                        votes.append(f'{c}{m}:' + ','.join(voters))
            case[f'votes_{label}'] = '<br>\n'.join(votes)
        for c in 'cf':
            for ea, eb in engine_pairs:
                rating_a = page_ratings.get((ea, c))
                rating_b = page_ratings.get((eb, c))
                # ranking() raises KeyError on None, so a pair with a
                # missing rating is skipped rather than aborting the run.
                if rating_a is None or rating_b is None:
                    continue
                if ranking(rating_a) < ranking(rating_b):
                    two_way_comparison[f'{ea}_vs_{eb}_{c}'] += 1

def _html_safe(case):
    """Return a copy of *case* with its free-text fields HTML-escaped.

    String values are escaped so that characters such as '<' and '&' in a
    passage or counterfactual show up literally in the report.  The
    'votes_*' entries are left untouched because this script builds them
    with deliberate '<br>' markup.  Non-string values pass through as is.
    """
    return {
        key: (html.escape(value)
              if isinstance(value, str) and not key.startswith('votes_')
              else value)
        for key, value in case.items()
    }


# Number of votes per engine, criterion ('c'/'f') and rating ('m'/'n'/'l'),
# keyed the way the summary template names its placeholders.
summary = {
    f'votes_{label}_{c}{m}': len(label_to_ratings[label, c, m])
    for label in mk.values()
    for c in 'cf'
    for m in 'mnl'
}

with open('summary_template.html') as f:
    summary_template = f.read()
with open('dump_template.html') as f:
    dump_template = f.read()

# Report layout: summary section, one section per case in fname order, then
# a raw JSON dump of the tallies.
output = [
    summary_template.format(**summary, **two_way_comparison)
] + [
    dump_template.format(**_html_safe(case_to_data[fname]))
    for fname in sorted(case_to_data)
] + [
    f'<hr>\n<pre>{json.dumps(summary, indent=1)}\n\n{json.dumps(two_way_comparison, indent=1)}</pre>'
]

with open('www/responses.html', 'w') as f:
    f.write('\n'.join(output))
20 changes: 20 additions & 0 deletions human/anon_responses.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
Timestamp,"PARTICIPANT CODE
","PAGE 1, CONSISTENCY [MOST CONSISTENT?]","PAGE 1, CONSISTENCY [LEAST CONSISTENT?]","PAGE 1, FLUENCY [MOST FLUENT?]","PAGE 1, FLUENCY [LEAST FLUENT?]","PAGE 2, CONSISTENCY [MOST CONSISTENT?]","PAGE 2, CONSISTENCY [LEAST CONSISTENT?]","PAGE 2, FLUENCY [MOST FLUENT?]","PAGE 2, FLUENCY [LEAST FLUENT?]","PAGE 3, CONSISTENCY [MOST CONSISTENT?]","PAGE 3, CONSISTENCY [LEAST CONSISTENT?]","PAGE 3, FLUENCY [MOST FLUENT?]","PAGE 3, FLUENCY [LEAST FLUENT?]","PAGE 4, CONSISTENCY [MOST CONSISTENT?]","PAGE 4, CONSISTENCY [LEAST CONSISTENT?]","PAGE 4, FLUENCY [MOST FLUENT?]","PAGE 4, FLUENCY [LEAST FLUENT?]","PAGE 5, CONSISTENCY [MOST CONSISTENT?]","PAGE 5, CONSISTENCY [LEAST CONSISTENT?]","PAGE 5, FLUENCY [MOST FLUENT?]","PAGE 5, FLUENCY [LEAST FLUENT?]","PAGE 6, CONSISTENCY [MOST CONSISTENT?]","PAGE 6, CONSISTENCY [LEAST CONSISTENT?]","PAGE 6, FLUENCY [MOST FLUENT?]","PAGE 6, FLUENCY [LEAST FLUENT?]","PAGE 7, CONSISTENCY [MOST CONSISTENT?]","PAGE 7, CONSISTENCY [LEAST CONSISTENT?]","PAGE 7, FLUENCY [MOST FLUENT?]","PAGE 7, FLUENCY [LEAST FLUENT?]","PAGE 8, CONSISTENCY [MOST CONSISTENT?]","PAGE 8, CONSISTENCY [LEAST CONSISTENT?]","PAGE 8, FLUENCY [MOST FLUENT?]","PAGE 8, FLUENCY [LEAST FLUENT?]","PAGE 9, CONSISTENCY [MOST CONSISTENT?]","PAGE 9, CONSISTENCY [LEAST CONSISTENT?]","PAGE 9, FLUENCY [MOST FLUENT?]","PAGE 9, FLUENCY [LEAST FLUENT?]","PAGE 10, CONSISTENCY [MOST CONSISTENT?]","PAGE 10, CONSISTENCY [LEAST CONSISTENT?]","PAGE 10, FLUENCY [MOST FLUENT?]","PAGE 10, FLUENCY [LEAST FLUENT?]",OPTIONAL COMMENTS
7/31/2022 20:12:04,729,C,B,A,B,C,A,C,A,B,C,B,C,A,B,A,C,C,A,B,A,C,B,C,A,C,B,B,A,A,B,B,A,B,A,C,A,A,C,A,C,
7/31/2022 20:19:53,7ba,A,B,A,B,C,B,A,B,C,B,A,C,A,B,A,B,A,C,A,C,B,A,A,C,A,C,B,C,A,C,B,A,B,A,A,C,A,C,C,B,
7/31/2022 20:32:31,f3e,A,C,A,C,A,C,A,C,C,B,A,C,A,C,A,B,C,A,A,C,A,B,A,B,A,C,C,A,A,B,C,B,B,A,A,B,B,A,A,C,Page 8 was all bad.
7/31/2022 21:16:05,d8a,C,B,C,A,B,C,A,B,B,A,C,B,C,A,A,C,B,C,A,C,C,B,B,A,A,B,B,C,A,B,A,C,C,A,C,A,A,C,C,A,
7/31/2022 21:40:56,92e,C,A,A,B,C,A,C,B,C,A,C,A,B,A,B,C,C,A,B,A,A,B,A,C,A,B,A,C,B,C,C,A,C,A,B,A,C,A,A,B,Very cool project idea and entertaining passages :)
7/31/2022 21:59:48,524,B,C,A,C,A,B,A,B,A,C,A,C,C,A,C,A,A,B,A,C,A,B,B,C,A,B,A,B,B,C,C,B,B,A,C,A,A,C,B,C,
8/1/2022 9:27:19,e47,A,C,C,B,B,A,B,A,B,C,B,C,B,A,C,A,C,B,A,B,A,B,A,C,C,B,A,C,C,B,B,A,B,A,A,B,B,C,C,B,
8/1/2022 9:52:34,e82,B,A,B,C,C,A,B,C,B,C,A,B,A,C,A,B,C,B,C,A,A,C,C,A,A,C,B,C,C,B,B,C,C,B,A,C,C,B,C,A,
8/1/2022 11:31:19,d20,C,B,A,C,A,C,A,B,B,A,C,B,C,B,A,C,C,B,B,A,A,B,A,B,A,B,C,A,B,C,C,A,C,B,C,A,C,A,A,B,"Reading through questions to judge consistency with the fact was surprisingly exhausting!

I also wish the survey submission was embedded with the questions-- I spent a few (unnecessary) minutes sorting out and fixing the transcription errors I made."
8/1/2022 12:09:26,8f5,B,A,C,A,B,C,C,A,A,C,A,B,B,A,B,A,C,A,B,C,C,B,C,B,B,C,C,B,C,B,A,C,A,C,C,B,A,B,B,C,
8/1/2022 12:19:36,1a5,B,A,B,A,B,A,B,C,B,C,B,A,A,B,C,B,C,A,C,B,A,B,A,C,B,A,C,B,A,B,C,B,A,C,A,C,A,C,A,B,
8/1/2022 16:21:10,8c6,C,B,A,B,A,B,B,A,C,B,C,A,A,B,C,B,B,A,A,B,A,B,A,C,B,C,A,B,C,A,B,A,C,A,B,A,C,A,C,B,
8/1/2022 17:44:45,5da,C,B,B,A,A,B,C,B,C,A,C,B,B,C,A,C,B,C,A,B,A,C,A,C,B,A,A,C,A,C,A,B,B,C,A,C,A,B,C,A,
8/1/2022 20:22:45,77a,A,C,A,C,B,C,B,A,B,A,A,C,B,A,A,C,B,C,B,C,B,C,A,B,B,C,A,B,B,C,A,C,A,B,C,B,B,C,B,C,
8/1/2022 21:11:10,fb1,C,A,C,A,B,A,B,A,B,A,B,C,A,B,C,B,A,C,B,A,A,B,C,A,A,C,B,A,B,C,C,B,B,A,C,A,A,B,A,B,thanks kevin!
8/1/2022 21:16:25,8a0,A,C,A,C,A,C,A,C,B,C,C,A,A,C,C,B,C,A,A,C,A,C,B,C,A,C,A,C,A,B,A,B,B,C,B,A,A,C,A,C,pogchamp
19 changes: 19 additions & 0 deletions human/dump_template.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
<h3>"{counterfactual}" {fname}</h3>
<p>
<strong>ROME</strong>. {passage_rome}
</p>
<p>
{votes_rome}
</p>
<p>
<strong>FT_L</strong>. {passage_ft_l}
</p>
<p>
{votes_ft_l}
</p>
<p>
<strong>GPT</strong>. {passage_gpt}
</p>
<p>
{votes_gpt}
</p>
Loading