Fix hallucinations with no_speech_thresh

Signed-off-by: makaveli10 <vineet.suryan@collabora.com>
collabora · Aug 8, 2024 · fac62aa · fac62aa · sirius911 · Aug 26, 2024
1 parent aade677
commit fac62aa
Showing 1 changed file with 9 additions and 6 deletions.
diff --git a/whisper_live/server.py b/whisper_live/server.py
@@ -973,6 +973,7 @@ def speech_to_text(self):
 
             input_bytes, duration = self.get_audio_chunk_for_processing()
             if duration < 1.0:
+                time.sleep(0.1)     # wait for audio chunks to arrive
                 continue
             try:
                 input_sample = input_bytes.copy()
@@ -1046,12 +1047,14 @@ def update_segments(self, segments, duration):
                 self.transcript.append(self.format_segment(start, end, text_))
                 offset = min(duration, s.end)
 
-        self.current_out += segments[-1].text
-        last_segment = self.format_segment(
-            self.timestamp_offset + segments[-1].start,
-            self.timestamp_offset + min(duration, segments[-1].end),
-            self.current_out
-        )
+        # only emit the last segment if its no_speech_prob does not exceed no_speech_thresh
+        if segments[-1].no_speech_prob <= self.no_speech_thresh:
+            self.current_out += segments[-1].text
+            last_segment = self.format_segment(
+                self.timestamp_offset + segments[-1].start,
+                self.timestamp_offset + min(duration, segments[-1].end),
+                self.current_out
+            )
 
         # if same incomplete segment is seen multiple times then update the offset
         # and append the segment to the list