Skip to content

Commit

Permalink
feat: add ground truth to dataset collection
Browse files (browse the repository at this point in the history)
  • Loading branch information
jarvis8x7b committed Dec 12, 2024
1 parent 5251701 commit b28ca83
Showing 1 changed file with 7 additions and 0 deletions.
7 changes: 7 additions & 0 deletions scripts/extract_dataset.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -51,17 +51,21 @@ class Row(BaseModel):
raw_scores: list[list[float]]
# shape (num_completions)
mean_scores: list[float]
# ground truth ranks
cid_to_ground_truth_rank: dict[str, int]

# Pydantic configuration for the enclosing model.
class Config:
# Allow fields whose types pydantic has no built-in validator for; such
# values are only checked with isinstance() instead of being rejected when
# the model class is defined.
# NOTE(review): this class-based Config sits next to @model_serializer, which
# is pydantic v2 API; v2 deprecates class-based config in favour of
# `model_config` — confirm against the pinned pydantic version.
arbitrary_types_allowed = True

@model_serializer
def serialize_model(self):
    """Return this row as a plain dict of its exported fields.

    Registered as the model's custom serializer via ``@model_serializer``;
    intended to make sure values such as np.ndarray and torch.tensor are
    serialized correctly. Every value is emitted exactly as it is stored on
    the instance, and the keys appear in the order listed below.
    """
    exported_fields = (
        "prompt",
        "completions",
        "raw_scores",
        "mean_scores",
        "cid_to_ground_truth_rank",
    )
    return {field: getattr(self, field) for field in exported_fields}


Expand Down Expand Up @@ -103,13 +107,15 @@ async def build_jsonl(filename: str):
completions=completions,
raw_scores=raw_scores,
mean_scores=mean_scores.tolist(),
cid_to_ground_truth_rank=task.request.ground_truth,
)
else:
jsonl_row = Row(
prompt=prompt,
completions=completions,
raw_scores=[],
mean_scores=[],
cid_to_ground_truth_rank={},
)

# Write the entry as a JSON line
Expand Down Expand Up @@ -193,6 +199,7 @@ async def get_processed_tasks(
vali_request = map_feedback_request_model_to_feedback_request(
validator_request
)
logger.info(f"Vali request ground truths: {vali_request.ground_truth}")

m_responses = list(
map(
Expand Down

0 comments on commit b28ca83

Please sign in to comment.