-
Notifications
You must be signed in to change notification settings - Fork 5
/
sub_eval.py
65 lines (56 loc) · 2.81 KB
/
sub_eval.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
import argparse
import os

import pandas as pd

from subeval.smp import load, dump, timestr
from subeval.subjective.construct_util import json2tsv, generate_inference_input
from subeval.subjective.judge_util import judge_infer
from subeval.subjective.analyze_util import analyze_pipe
def parse_args(argv=None):
    """Parse the command-line options of the SubEval script.

    Args:
        argv: Optional list of argument strings to parse. When ``None``
            (the default) argparse reads ``sys.argv[1:]``, which is the
            behavior callers that pass no argument already rely on.

    Returns:
        argparse.Namespace: with attributes ``data``, ``model`` (list of
        str), ``refm``, ``judge``, ``eval_nopt``, ``eval_proc``, ``seed``,
        ``mode`` and ``fill_contents`` (list of str).

    Raises:
        SystemExit: raised by argparse on invalid usage, including when
            ``--data`` or ``--model`` is missing.
    """
    parser = argparse.ArgumentParser(
        description="Construct Subjective Evaluation Task for LLM. "
    )
    # --data and --model are consumed unconditionally by the pipeline, so
    # reject a missing value here with a usage message instead of failing
    # later on a ``None``.
    parser.add_argument(
        "--data",
        type=str,
        required=True,
        help=("An excel file containing the meta data for comparison construction. "
              "The excel file include essential fields from the Question TSV file, "
              "including index, question, reference_answer, evaluating_guidance, and task. "
              "Besides, it also includes columns named answer-{model_name}, which "
              "saved the answers generated by the model. "))
    parser.add_argument(
        "--model",
        type=str,
        nargs='+',
        required=True,
        help=("The response-generated models for subjective evaluation"))
    parser.add_argument("--refm", type=str, default=None, help="The baseline/reference model. ")
    parser.add_argument("--judge", type=str, default='gpt-4-1106-preview', help="The judge model")
    parser.add_argument("--eval-nopt", type=int, default=2, help='The number of options. Choose among 2, 3, and 4.')
    parser.add_argument("--eval-proc", type=int, default=1, help="The number of processes.")
    parser.add_argument("--seed", type=int, default=2680, help="for seed < 0, will use abs(seed) + flip")
    parser.add_argument('--mode', type=str, default='random', choices=['dual', 'random'])
    parser.add_argument(
        "--fill-contents",
        type=str,
        nargs='*',
        default=[],
        help=("For fields including paths that need to be replaced with file contents."))
    return parser.parse_args(argv)
### Main Logic for SubEval ###
def subeval_call():
    """Run the SubEval pipeline from the command line.

    Parses the CLI options, builds one inference-input table per model given
    via ``--model``, concatenates them and dumps the result to
    ``<data stem>_infer_input_<seed>.tsv``, then passes that file to the
    judge and the judge's result file to the analysis step.
    """
    args = parse_args()

    data_file = args.data
    # Build the output name from the file extension only. A plain substring
    # replace of '.tsv' / '.xlsx' would also rewrite matching directory
    # components, and would leave any other extension untouched so that the
    # dump below targets the input file itself.
    stem, _ = os.path.splitext(data_file)
    target_file = f'{stem}_infer_input_{args.seed}.tsv'

    # Prepare data (see construct_util.py for more detail)
    data_all = pd.concat([
        generate_inference_input(
            data_file,
            model=m,
            refm=args.refm,
            mode=args.mode,
            fill_contents=args.fill_contents,
            seed=args.seed,
        )
        for m in args.model
    ])
    dump(data_all, target_file)

    # Do judgement/evaluation (see judge_util.py for more detail)
    infer_result_file = judge_infer(
        target_file,
        judge=args.judge,
        nopt=args.eval_nopt,
        nproc=args.eval_proc,
        failure_cnt=5,
    )

    # Analyze (see analyze_util.py for more detail)
    analyze_pipe(infer_result_file, refm=args.refm, col_name=args.judge, failure_cnt=5)
# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    subeval_call()