From 9427411e37935aaff0f6f08dfa69f97c27aa46f7 Mon Sep 17 00:00:00 2001
From: chenmoneygithub
Date: Mon, 16 Sep 2024 17:53:19 -0700
Subject: [PATCH 1/3] typo fixes and add tests for BootstrapFewShotWithRandomSearch

---
 dspy/teleprompt/random_search.py       | 55 +++++++++++---------------
 tests/teleprompt/test_random_search.py | 37 +++++++++++++++++
 2 files changed, 61 insertions(+), 31 deletions(-)
 create mode 100644 tests/teleprompt/test_random_search.py

diff --git a/dspy/teleprompt/random_search.py b/dspy/teleprompt/random_search.py
index 23e3e03dd..d7a29102b 100644
--- a/dspy/teleprompt/random_search.py
+++ b/dspy/teleprompt/random_search.py
@@ -48,16 +48,10 @@ def __init__(
         self.max_num_samples = max_bootstrapped_demos
         self.max_errors = max_errors
         self.num_candidate_sets = num_candidate_programs
-        # self.max_num_traces = 1 + int(max_bootstrapped_demos / 2.0 * self.num_candidate_sets)
-
-        # Semi-hacky way to get the parent class's _bootstrap function to stop early.
-        # self.max_bootstrapped_demos = self.max_num_traces
         self.max_labeled_demos = max_labeled_demos
 
-        print(
-            "Going to sample between", self.min_num_samples, "and", self.max_num_samples, "traces per predictor.",
-        )
-        print("Will attempt to bootstrap", self.num_candidate_sets, "candidate sets.")
+        print(f"Going to sample between {self.min_num_samples} and {self.max_num_samples} traces per predictor.")
+        print(f"Will attempt to bootstrap {self.num_candidate_sets} candidate sets.")
 
     def compile(self, student, *, teacher=None, trainset, valset=None, restrict=None, labeled_sample=True):
         self.trainset = trainset
@@ -71,20 +65,20 @@ def compile(self, student, *, teacher=None, trainset, valset=None, restrict=None
             if (restrict is not None) and (seed not in restrict):
                 continue
 
-            trainset2 = list(self.trainset)
+            trainset_copy = list(self.trainset)
 
             if seed == -3:
                 # zero-shot
-                program2 = student.reset_copy()
+                program = student.reset_copy()
 
             elif seed == -2:
                 # labels only
                 teleprompter = LabeledFewShot(k=self.max_labeled_demos)
-                program2 = teleprompter.compile(student, trainset=trainset2, sample=labeled_sample)
+                program = teleprompter.compile(student, trainset=trainset_copy, sample=labeled_sample)
 
             elif seed == -1:
                 # unshuffled few-shot
-                program = BootstrapFewShot(
+                optimizer = BootstrapFewShot(
                     metric=self.metric,
                     metric_threshold=self.metric_threshold,
                     max_bootstrapped_demos=self.max_num_samples,
@@ -92,15 +86,15 @@ def compile(self, student, *, teacher=None, trainset, valset=None, restrict=None
                     teacher_settings=self.teacher_settings,
                     max_rounds=self.max_rounds,
                 )
-                program2 = program.compile(student, teacher=teacher, trainset=trainset2)
+                program = optimizer.compile(student, teacher=teacher, trainset=trainset_copy)
 
             else:
                 assert seed >= 0, seed
 
-                random.Random(seed).shuffle(trainset2)
+                random.Random(seed).shuffle(trainset_copy)
                 size = random.Random(seed).randint(self.min_num_samples, self.max_num_samples)
 
-                teleprompter = BootstrapFewShot(
+                optimizer = BootstrapFewShot(
                     metric=self.metric,
                     metric_threshold=self.metric_threshold,
                     max_bootstrapped_demos=size,
@@ -109,7 +103,7 @@ def compile(self, student, *, teacher=None, trainset, valset=None, restrict=None
                     max_rounds=self.max_rounds,
                 )
 
-                program2 = teleprompter.compile(student, teacher=teacher, trainset=trainset2)
+                program = optimizer.compile(student, teacher=teacher, trainset=trainset_copy)
 
             evaluate = Evaluate(
                 devset=self.valset,
@@ -120,37 +114,36 @@ def compile(self, student, *, teacher=None, trainset, valset=None, restrict=None
                 display_progress=True,
            )
 
-            score, subscores = evaluate(program2, return_all_scores=True)
+            score, subscores = evaluate(program, return_all_scores=True)
 
             all_subscores.append(subscores)
 
             ############ Assertion-aware Optimization ############
-            if hasattr(program2, "_suggest_failures"):
-                score = score - program2._suggest_failures * 0.2
-            if hasattr(program2, "_assert_failures"):
-                score = 0 if program2._assert_failures > 0 else score
+            if hasattr(program, "_suggest_failures"):
+                score = score - program._suggest_failures * 0.2
+            if hasattr(program, "_assert_failures"):
+                score = 0 if program._assert_failures > 0 else score
             ######################################################
 
-            print("Score:", score, "for set:", [len(predictor.demos) for predictor in program2.predictors()])
+            print("Score:", score, "for set:", [len(predictor.demos) for predictor in program.predictors()])
 
             if len(scores) == 0 or score > max(scores):
-                print("New best sscore:", score, "for seed", seed)
-                best_program = program2
+                print("New best score:", score, "for seed", seed)
+                best_program = program
 
             scores.append(score)
 
-            print(f"Scores so far: {scores}")
-
-            print("Best score:", max(scores))
+            print(f"Scores so far: {scores}.")
+            print(f"Best score: {max(scores)}")
 
-            score_data.append((score, subscores, seed, program2))
+            score_data.append((score, subscores, seed, program))
 
             if len(score_data) > 2:  # We check if there are at least 3 scores to consider
                 for k in [1, 2, 3, 5, 8, 9999]:
-                    top_3_scores = sorted(score_data, key=lambda x: x[0], reverse=True)[:k]
+                    top_k_scores = sorted(score_data, key=lambda x: x[0], reverse=True)[:k]
 
                     # Transpose the subscores to get max per entry and then calculate their average
-                    transposed_subscores = zip(*[subscores for _, subscores, *_ in top_3_scores if subscores])
-                    avg_of_max_per_entry = sum(max(entry) for entry in transposed_subscores) / len(top_3_scores[0][1])
+                    transposed_subscores = zip(*[subscores for _, subscores, *_ in top_k_scores if subscores])
+                    avg_of_max_per_entry = sum(max(entry) for entry in transposed_subscores) / len(top_k_scores[0][1])
 
                     print(f"Average of max per entry across top {k} scores: {avg_of_max_per_entry}")
diff --git a/tests/teleprompt/test_random_search.py b/tests/teleprompt/test_random_search.py
new file mode 100644
index 000000000..e76178c80
--- /dev/null
+++ b/tests/teleprompt/test_random_search.py
@@ -0,0 +1,37 @@
+import dspy
+from dspy.predict import Predict
+from dspy.utils.dummies import DummyLM
+from dspy import Example
+from dspy.teleprompt import BootstrapFewShotWithRandomSearch
+
+class SimpleModule(dspy.Module):
+    def __init__(self, signature):
+        super().__init__()
+        self.predictor = Predict(signature)
+
+    def forward(self, **kwargs):
+        return self.predictor(**kwargs)
+
+def simple_metric(example, prediction, trace=None):
+    return example.output == prediction.output
+
+def test_basic_workflow():
+    """Test to ensure the basic compile flow runs without errors."""
+    student = SimpleModule("input -> output")
+    teacher = SimpleModule("input -> output")
+
+    lm = DummyLM(
+        [
+            "Initial thoughts",
+            "Finish[blue]",  # Expected output for both training and validation
+        ]
+    )
+    dspy.settings.configure(lm=lm)
+
+    optimizer = BootstrapFewShotWithRandomSearch(metric=simple_metric, max_bootstrapped_demos=1, max_labeled_demos=1)
+    trainset = [
+        Example(input="What is the color of the sky?", output="blue").with_inputs("input"),
+        Example(input="What does the fox say?", output="Ring-ding-ding-ding-dingeringeding!").with_inputs("input"),
+    ]
+    optimizer.compile(student, teacher=teacher, trainset=trainset)
+

From 974937d7762eb124f5e84f4d99daecd14587379d Mon Sep 17 00:00:00 2001
From: chenmoneygithub
Date: Tue, 17 Sep 2024 17:32:09 -0700
Subject: [PATCH 2/3] delete the weird top k score

---
 dspy/teleprompt/random_search.py | 10 ----------
 1 file changed, 10 deletions(-)

diff --git a/dspy/teleprompt/random_search.py b/dspy/teleprompt/random_search.py
index d7a29102b..ecfdff4b3 100644
--- a/dspy/teleprompt/random_search.py
+++ b/dspy/teleprompt/random_search.py
@@ -137,16 +137,6 @@ def compile(self, student, *, teacher=None, trainset, valset=None, restrict=None
 
             score_data.append((score, subscores, seed, program))
 
-            if len(score_data) > 2:  # We check if there are at least 3 scores to consider
-                for k in [1, 2, 3, 5, 8, 9999]:
-                    top_k_scores = sorted(score_data, key=lambda x: x[0], reverse=True)[:k]
-
-                    # Transpose the subscores to get max per entry and then calculate their average
-                    transposed_subscores = zip(*[subscores for _, subscores, *_ in top_k_scores if subscores])
-                    avg_of_max_per_entry = sum(max(entry) for entry in transposed_subscores) / len(top_k_scores[0][1])
-
-                    print(f"Average of max per entry across top {k} scores: {avg_of_max_per_entry}")
-
             if self.stop_at_score is not None and score >= self.stop_at_score:
                 print(f"Stopping early because score {score} is >= stop_at_score {self.stop_at_score}")
                 break

From f3a87f7e7fa7260c611889809976ac49d9aa837f Mon Sep 17 00:00:00 2001
From: chenmoneygithub
Date: Wed, 18 Sep 2024 17:24:56 -0700
Subject: [PATCH 3/3] incremental

---
 dspy/teleprompt/random_search.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/dspy/teleprompt/random_search.py b/dspy/teleprompt/random_search.py
index ecfdff4b3..c1aeef54c 100644
--- a/dspy/teleprompt/random_search.py
+++ b/dspy/teleprompt/random_search.py
@@ -125,15 +125,12 @@ def compile(self, student, *, teacher=None, trainset, valset=None, restrict=None
                 score = 0 if program._assert_failures > 0 else score
             ######################################################
 
-            print("Score:", score, "for set:", [len(predictor.demos) for predictor in program.predictors()])
-
             if len(scores) == 0 or score > max(scores):
                 print("New best score:", score, "for seed", seed)
                 best_program = program
 
             scores.append(score)
 
-            print(f"Scores so far: {scores}.")
-            print(f"Best score: {max(scores)}")
+            print(f"Best score so far: {max(scores)}")
 
             score_data.append((score, subscores, seed, program))