From 9427411e37935aaff0f6f08dfa69f97c27aa46f7 Mon Sep 17 00:00:00 2001
From: chenmoneygithub
Date: Mon, 16 Sep 2024 17:53:19 -0700
Subject: [PATCH 1/3] typo fixes and add tests for BootstrapFewShotWithRandomSearch

---
 dspy/teleprompt/random_search.py       | 55 +++++++++++---------------
 tests/teleprompt/test_random_search.py | 37 +++++++++++++++++
 2 files changed, 61 insertions(+), 31 deletions(-)
 create mode 100644 tests/teleprompt/test_random_search.py

diff --git a/dspy/teleprompt/random_search.py b/dspy/teleprompt/random_search.py
index 23e3e03dd..d7a29102b 100644
--- a/dspy/teleprompt/random_search.py
+++ b/dspy/teleprompt/random_search.py
@@ -48,16 +48,10 @@ def __init__(
         self.max_num_samples = max_bootstrapped_demos
         self.max_errors = max_errors
         self.num_candidate_sets = num_candidate_programs
-        # self.max_num_traces = 1 + int(max_bootstrapped_demos / 2.0 * self.num_candidate_sets)
-
-        # Semi-hacky way to get the parent class's _bootstrap function to stop early.
-        # self.max_bootstrapped_demos = self.max_num_traces
         self.max_labeled_demos = max_labeled_demos
 
-        print(
-            "Going to sample between", self.min_num_samples, "and", self.max_num_samples, "traces per predictor.",
-        )
-        print("Will attempt to bootstrap", self.num_candidate_sets, "candidate sets.")
+        print(f"Going to sample between {self.min_num_samples} and {self.max_num_samples} traces per predictor.")
+        print(f"Will attempt to bootstrap {self.num_candidate_sets} candidate sets.")
 
     def compile(self, student, *, teacher=None, trainset, valset=None, restrict=None, labeled_sample=True):
         self.trainset = trainset
@@ -71,20 +65,20 @@ def compile(self, student, *, teacher=None, trainset, valset=None, restrict=None
             if (restrict is not None) and (seed not in restrict):
                 continue
 
-            trainset2 = list(self.trainset)
+            trainset_copy = list(self.trainset)
 
             if seed == -3:
                 # zero-shot
-                program2 = student.reset_copy()
+                program = student.reset_copy()
 
             elif seed == -2:
                 # labels only
                 teleprompter = LabeledFewShot(k=self.max_labeled_demos)
-                program2 = teleprompter.compile(student, trainset=trainset2, sample=labeled_sample)
+                program = teleprompter.compile(student, trainset=trainset_copy, sample=labeled_sample)
 
             elif seed == -1:
                 # unshuffled few-shot
-                program = BootstrapFewShot(
+                optimizer = BootstrapFewShot(
                     metric=self.metric,
                     metric_threshold=self.metric_threshold,
                     max_bootstrapped_demos=self.max_num_samples,
@@ -92,15 +86,15 @@ def compile(self, student, *, teacher=None, trainset, valset=None, restrict=None
                     teacher_settings=self.teacher_settings,
                     max_rounds=self.max_rounds,
                 )
-                program2 = program.compile(student, teacher=teacher, trainset=trainset2)
+                program = optimizer.compile(student, teacher=teacher, trainset=trainset_copy)
 
             else:
                 assert seed >= 0, seed
 
-                random.Random(seed).shuffle(trainset2)
+                random.Random(seed).shuffle(trainset_copy)
                 size = random.Random(seed).randint(self.min_num_samples, self.max_num_samples)
 
-                teleprompter = BootstrapFewShot(
+                optimizer = BootstrapFewShot(
                     metric=self.metric,
                     metric_threshold=self.metric_threshold,
                     max_bootstrapped_demos=size,
@@ -109,7 +103,7 @@ def compile(self, student, *, teacher=None, trainset, valset=None, restrict=None
                     max_rounds=self.max_rounds,
                 )
 
-                program2 = teleprompter.compile(student, teacher=teacher, trainset=trainset2)
+                program = optimizer.compile(student, teacher=teacher, trainset=trainset_copy)
 
             evaluate = Evaluate(
                 devset=self.valset,
@@ -120,37 +114,36 @@ def compile(self, student, *, teacher=None, trainset, valset=None, restrict=None
                 display_progress=True,
            )
 
-            score, subscores = evaluate(program2, return_all_scores=True)
+            score, subscores = evaluate(program, return_all_scores=True)
 
             all_subscores.append(subscores)
 
             ############ Assertion-aware Optimization ############
-            if hasattr(program2, "_suggest_failures"):
-                score = score - program2._suggest_failures * 0.2
-            if hasattr(program2, "_assert_failures"):
-                score = 0 if program2._assert_failures > 0 else score
+            if hasattr(program, "_suggest_failures"):
+                score = score - program._suggest_failures * 0.2
+            if hasattr(program, "_assert_failures"):
+                score = 0 if program._assert_failures > 0 else score
             ######################################################
 
-            print("Score:", score, "for set:", [len(predictor.demos) for predictor in program2.predictors()])
+            print("Score:", score, "for set:", [len(predictor.demos) for predictor in program.predictors()])
 
             if len(scores) == 0 or score > max(scores):
-                print("New best sscore:", score, "for seed", seed)
-                best_program = program2
+                print("New best score:", score, "for seed", seed)
+                best_program = program
 
             scores.append(score)
 
-            print(f"Scores so far: {scores}")
-
-            print("Best score:", max(scores))
+            print(f"Scores so far: {scores}.")
+            print(f"Best score: {max(scores)}")
 
-            score_data.append((score, subscores, seed, program2))
+            score_data.append((score, subscores, seed, program))
 
             if len(score_data) > 2:  # We check if there are at least 3 scores to consider
                 for k in [1, 2, 3, 5, 8, 9999]:
-                    top_3_scores = sorted(score_data, key=lambda x: x[0], reverse=True)[:k]
+                    top_k_scores = sorted(score_data, key=lambda x: x[0], reverse=True)[:k]
 
                     # Transpose the subscores to get max per entry and then calculate their average
-                    transposed_subscores = zip(*[subscores for _, subscores, *_ in top_3_scores if subscores])
-                    avg_of_max_per_entry = sum(max(entry) for entry in transposed_subscores) / len(top_3_scores[0][1])
+                    transposed_subscores = zip(*[subscores for _, subscores, *_ in top_k_scores if subscores])
+                    avg_of_max_per_entry = sum(max(entry) for entry in transposed_subscores) / len(top_k_scores[0][1])
 
                     print(f"Average of max per entry across top {k} scores: {avg_of_max_per_entry}")
diff --git a/tests/teleprompt/test_random_search.py b/tests/teleprompt/test_random_search.py
new file mode 100644
index 000000000..e76178c80
--- /dev/null
+++ b/tests/teleprompt/test_random_search.py
@@ -0,0 +1,37 @@
+import dspy
+from dspy.predict import Predict
+from dspy.utils.dummies import DummyLM
+from dspy import Example
+from dspy.teleprompt import BootstrapFewShotWithRandomSearch
+
+class SimpleModule(dspy.Module):
+    def __init__(self, signature):
+        super().__init__()
+        self.predictor = Predict(signature)
+
+    def forward(self, **kwargs):
+        return self.predictor(**kwargs)
+
+def simple_metric(example, prediction, trace=None):
+    return example.output == prediction.output
+
+def test_basic_workflow():
+    """Test to ensure the basic compile flow runs without errors."""
+    student = SimpleModule("input -> output")
+    teacher = SimpleModule("input -> output")
+
+    lm = DummyLM(
+        [
+            "Initial thoughts",
+            "Finish[blue]",  # Expected output for both training and validation
+        ]
+    )
+    dspy.settings.configure(lm=lm)
+
+    optimizer = BootstrapFewShotWithRandomSearch(metric=simple_metric, max_bootstrapped_demos=1, max_labeled_demos=1)
+    trainset = [
+        Example(input="What is the color of the sky?", output="blue").with_inputs("input"),
+        Example(input="What does the fox say?", output="Ring-ding-ding-ding-dingeringeding!").with_inputs("input"),
+    ]
+    optimizer.compile(student, teacher=teacher, trainset=trainset)
+

From 974937d7762eb124f5e84f4d99daecd14587379d Mon Sep 17 00:00:00 2001
From: chenmoneygithub
Date: Tue, 17 Sep 2024 17:32:09 -0700
Subject: [PATCH 2/3] delete the weird top k score

---
 dspy/teleprompt/random_search.py | 10 ----------
 1 file changed, 10 deletions(-)

diff --git a/dspy/teleprompt/random_search.py b/dspy/teleprompt/random_search.py
index d7a29102b..ecfdff4b3 100644
--- a/dspy/teleprompt/random_search.py
+++ b/dspy/teleprompt/random_search.py
@@ -137,16 +137,6 @@ def compile(self, student, *, teacher=None, trainset, valset=None, restrict=None
 
             score_data.append((score, subscores, seed, program))
 
-            if len(score_data) > 2:  # We check if there are at least 3 scores to consider
-                for k in [1, 2, 3, 5, 8, 9999]:
-                    top_k_scores = sorted(score_data, key=lambda x: x[0], reverse=True)[:k]
-
-                    # Transpose the subscores to get max per entry and then calculate their average
-                    transposed_subscores = zip(*[subscores for _, subscores, *_ in top_k_scores if subscores])
-                    avg_of_max_per_entry = sum(max(entry) for entry in transposed_subscores) / len(top_k_scores[0][1])
-
-                    print(f"Average of max per entry across top {k} scores: {avg_of_max_per_entry}")
-
             if self.stop_at_score is not None and score >= self.stop_at_score:
                 print(f"Stopping early because score {score} is >= stop_at_score {self.stop_at_score}")
                 break

From f3a87f7e7fa7260c611889809976ac49d9aa837f Mon Sep 17 00:00:00 2001
From: chenmoneygithub
Date: Wed, 18 Sep 2024 17:24:56 -0700
Subject: [PATCH 3/3] incremental

---
 dspy/teleprompt/random_search.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/dspy/teleprompt/random_search.py b/dspy/teleprompt/random_search.py
index ecfdff4b3..c1aeef54c 100644
--- a/dspy/teleprompt/random_search.py
+++ b/dspy/teleprompt/random_search.py
@@ -125,15 +125,12 @@ def compile(self, student, *, teacher=None, trainset, valset=None, restrict=None
                 score = 0 if program._assert_failures > 0 else score
             ######################################################
 
-            print("Score:", score, "for set:", [len(predictor.demos) for predictor in program.predictors()])
-
             if len(scores) == 0 or score > max(scores):
                 print("New best score:", score, "for seed", seed)
                 best_program = program
 
             scores.append(score)
 
-            print(f"Scores so far: {scores}.")
-            print(f"Best score: {max(scores)}")
+            print(f"Best score so far: {max(scores)}")
 
             score_data.append((score, subscores, seed, program))